Auto: opcuaclient-14 — ServerUriArray redundant failover

Closes #286
This commit is contained in:
Joseph Doherty
2026-04-26 10:05:05 -04:00
parent 35d733d73b
commit 705c98ad98
11 changed files with 1088 additions and 2 deletions

View File

@@ -44,6 +44,34 @@ services:
retries: 10
start_period: 10s
# opc-plc-secondary — second opc-plc instance for upstream-redundancy testing
# (PR-14, issue #286). Listens on a different port so it can run alongside the
# primary; the integration test suite drives a ServiceLevel drop on the primary
# and asserts the driver fails over onto the secondary's session. Both
# instances are independent — this isn't a real OPC UA redundant pair (there's
# no shared address space), but the failover-decision wiring is what we need
# to validate end-to-end.
opc-plc-secondary:
image: mcr.microsoft.com/iotedge/opc-plc:2.14.10
container_name: otopcua-opc-plc-secondary
restart: "no"
ports:
# Host port 50002 forwards to container port 50000 (the --pn value below).
- "50002:50000"
command:
# Same flags as the primary so the test session-shape is identical. --pn
# stays at 50000 inside the container; the host-side port-map above puts
# it at 50002 for the test runner.
- "--pn=50000"
- "--ut"
- "--aa"
- "--alm"
healthcheck:
# Passes once netstat (run inside the container) shows a socket in LISTEN
# state on :50000; polled every 5s, up to 10 retries after a 10s start period.
test: ["CMD-SHELL", "netstat -an | grep -q ':50000.*LISTEN' || exit 1"]
interval: 5s
timeout: 2s
retries: 10
start_period: 10s
# opc-plc-rc — reverse-connect (server-initiated) variant. The simulator
# acts as the OPC UA server but, unlike the regular service above, it dials
# OUT to the client's listener URL instead of accepting an inbound dial.

View File

@@ -0,0 +1,95 @@
using System.Net.Sockets;
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.IntegrationTests;
/// <summary>
/// Multi-endpoint fixture for upstream-redundancy smoke tests (PR-14, issue #286).
/// Probes both <c>opc-plc</c> instances from the docker-compose stack —
/// <c>opc-plc</c> on 50000 and <c>opc-plc-secondary</c> on 50002 — with a plain TCP
/// connect and exposes a <see cref="SkipReason"/> when either is unreachable.
/// </summary>
/// <remarks>
/// The primary endpoint URL can be overridden via <c>OPCUA_SIM_ENDPOINT</c> and the
/// secondary via <c>OPCUA_SIM_ENDPOINT_SECONDARY</c> for runs against real
/// redundant servers; an unset, empty or whitespace-only variable falls back to the
/// default. Defaults assume the docker-compose stack is up locally
/// (<c>docker compose -f Docker/docker-compose.yml up opc-plc opc-plc-secondary</c>).
/// </remarks>
public sealed class OpcPlcRedundancyFixture : IAsyncDisposable
{
    private const string DefaultPrimary = "opc.tcp://localhost:50000";
    private const string DefaultSecondary = "opc.tcp://localhost:50002";
    private const string PrimaryEnvVar = "OPCUA_SIM_ENDPOINT";
    private const string SecondaryEnvVar = "OPCUA_SIM_ENDPOINT_SECONDARY";

    /// <summary>Port assumed when an endpoint URL carries no (parsable) port.</summary>
    private const int DefaultOpcUaPort = 4840;

    /// <summary>Upper bound for each TCP reachability probe.</summary>
    private static readonly TimeSpan ProbeTimeout = TimeSpan.FromSeconds(2);

    /// <summary>Endpoint URL of the primary server (env override or default).</summary>
    public string PrimaryEndpointUrl { get; }

    /// <summary>Endpoint URL of the secondary server (env override or default).</summary>
    public string SecondaryEndpointUrl { get; }

    /// <summary>
    /// <c>null</c> when both endpoints accepted a TCP connection; otherwise a
    /// human-readable explanation of the first probe that failed.
    /// </summary>
    public string? SkipReason { get; }

    /// <summary>
    /// Resolves both endpoint URLs and probes them in order (primary first). The
    /// secondary is not probed when the primary is already unreachable.
    /// </summary>
    public OpcPlcRedundancyFixture()
    {
        PrimaryEndpointUrl = ResolveEndpoint(PrimaryEnvVar, DefaultPrimary);
        SecondaryEndpointUrl = ResolveEndpoint(SecondaryEnvVar, DefaultSecondary);

        if (!ProbeTcp(PrimaryEndpointUrl, out var reason)
            || !ProbeTcp(SecondaryEndpointUrl, out reason))
        {
            SkipReason = reason;
        }
    }

    /// <summary>
    /// Reads <paramref name="envVar"/>, falling back to <paramref name="fallback"/>
    /// when the variable is unset, empty or whitespace-only (an exported-but-empty
    /// variable would otherwise replace the default with an unusable URL).
    /// </summary>
    private static string ResolveEndpoint(string envVar, string fallback)
    {
        var value = Environment.GetEnvironmentVariable(envVar);
        return string.IsNullOrWhiteSpace(value) ? fallback : value.Trim();
    }

    /// <summary>
    /// Attempts a TCP connect to the host/port encoded in <paramref name="endpointUrl"/>.
    /// </summary>
    /// <param name="endpointUrl">An <c>opc.tcp://host[:port][/path]</c> URL.</param>
    /// <param name="skipReason">Set to a diagnostic message when the probe fails; otherwise <c>null</c>.</param>
    /// <returns><c>true</c> when the connection was established within <see cref="ProbeTimeout"/>.</returns>
    private static bool ProbeTcp(string endpointUrl, out string? skipReason)
    {
        skipReason = null;
        var (host, port) = ParseHostPort(endpointUrl);
        try
        {
            // Prefer IPv4 (matches the docker port-map on localhost), but use whatever the
            // host actually resolves to rather than silently probing loopback instead.
            var resolved = System.Net.Dns.GetHostAddresses(host);
            var address = resolved.FirstOrDefault(a => a.AddressFamily == AddressFamily.InterNetwork)
                ?? resolved.FirstOrDefault()
                ?? System.Net.IPAddress.Loopback;

            using var client = new TcpClient(address.AddressFamily);

            // Blocking wait is deliberate: this runs from the constructor, which cannot await.
            var connect = client.ConnectAsync(address, port);
            if (!connect.Wait(ProbeTimeout) || !client.Connected)
            {
                skipReason = $"opc-plc instance at {host}:{port} did not accept a TCP connection within {ProbeTimeout.TotalSeconds:0}s. " +
                             "Start it (`docker compose -f Docker/docker-compose.yml up opc-plc opc-plc-secondary`).";
                return false;
            }

            return true;
        }
        catch (Exception ex)
        {
            // Task.Wait wraps connect failures in AggregateException; report the real cause
            // (e.g. SocketException: Connection refused) instead of "One or more errors occurred".
            var cause = ex is AggregateException { InnerException: { } inner } ? inner : ex;
            skipReason = $"opc-plc instance at {host}:{port} unreachable: {cause.GetType().Name}: {cause.Message}.";
            return false;
        }
    }

    /// <summary>
    /// Extracts host and port from an endpoint URL. The <c>opc.tcp://</c> scheme and any
    /// path are optional; bracketed IPv6 literals (<c>[::1]:50000</c>) are supported and
    /// returned without brackets. A missing or unparsable port yields 4840.
    /// </summary>
    private static (string Host, int Port) ParseHostPort(string endpointUrl)
    {
        const string scheme = "opc.tcp://";
        var authority = endpointUrl.StartsWith(scheme, StringComparison.OrdinalIgnoreCase)
            ? endpointUrl[scheme.Length..]
            : endpointUrl;

        var slash = authority.IndexOf('/');
        if (slash >= 0)
        {
            authority = authority[..slash];
        }

        // IPv6 literal: the host itself contains ':' so the port separator is the one after ']'.
        if (authority.StartsWith('['))
        {
            var close = authority.IndexOf(']');
            if (close > 0)
            {
                var v6Host = authority[1..close];
                var rest = authority[(close + 1)..];
                return rest.StartsWith(':')
                    ? (v6Host, ParsePortOrDefault(rest[1..]))
                    : (v6Host, DefaultOpcUaPort);
            }
        }

        var colon = authority.IndexOf(':');
        return colon < 0
            ? (authority, DefaultOpcUaPort)
            : (authority[..colon], ParsePortOrDefault(authority[(colon + 1)..]));
    }

    /// <summary>Culture-invariant port parse; returns 4840 when the text is not an integer.</summary>
    private static int ParsePortOrDefault(string text) =>
        int.TryParse(
            text,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out var port)
            ? port
            : DefaultOpcUaPort;

    /// <summary>Nothing to release — the probe sockets are disposed inside <see cref="ProbeTcp"/>.</summary>
    public ValueTask DisposeAsync() => ValueTask.CompletedTask;
}
/// <summary>
/// xUnit collection definition binding <see cref="OpcPlcRedundancyFixture"/> to the
/// collection named <see cref="Name"/>.
/// </summary>
[Xunit.CollectionDefinition(OpcPlcRedundancyCollection.Name)]
public sealed class OpcPlcRedundancyCollection : Xunit.ICollectionFixture<OpcPlcRedundancyFixture>
{
    /// <summary>Collection name to pass to <c>[Collection(...)]</c> on test classes.</summary>
    public const string Name = "OpcPlcRedundancy";
}

View File

@@ -0,0 +1,65 @@
using Shouldly;
using Xunit;
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.IntegrationTests;
/// <summary>
/// Upstream-redundancy smoke test (PR-14, issue #286). Initializes the driver with
/// redundancy enabled against the two live <c>opc-plc</c> endpoints supplied by
/// <see cref="OpcPlcRedundancyFixture"/> and verifies that the redundancy counters
/// are present in the health diagnostics.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why opc-plc isn't a "real" redundant pair</b>: each opc-plc instance is
/// independent — they don't federate ServerArray with each other, so peer
/// discovery may come back empty. Wire-level coverage against a real redundant
/// server pair is an env-gated follow-up.
/// </para>
/// <para>
/// <b>Skip gating</b>: when <see cref="OpcPlcRedundancyFixture.SkipReason"/> is set
/// the test calls <c>Assert.Skip</c> with that message, so CI runs that don't spin
/// up the secondary container skip cleanly.
/// </para>
/// </remarks>
[Collection(OpcPlcRedundancyCollection.Name)]
[Trait("Category", "Integration")]
[Trait("Simulator", "opc-plc-redundant")]
public sealed class OpcUaClientRedundancySmokeTests(OpcPlcRedundancyFixture fx)
{
    [Fact]
    public async Task Driver_initializes_and_exposes_redundancy_diagnostics_against_live_pair()
    {
        if (fx.SkipReason is { } skipReason)
        {
            Assert.Skip(skipReason);
        }

        var redundancy = new RedundancyOptions(
            Enabled: true,
            ServiceLevelThreshold: 200);

        var driverOptions = new OpcUaClientDriverOptions
        {
            EndpointUrls = [fx.PrimaryEndpointUrl, fx.SecondaryEndpointUrl],
            SecurityPolicy = OpcUaSecurityPolicy.None,
            SecurityMode = OpcUaSecurityMode.None,
            AuthType = OpcUaAuthType.Anonymous,
            AutoAcceptCertificates = true,
            Timeout = TimeSpan.FromSeconds(15),
            SessionTimeout = TimeSpan.FromSeconds(60),
            Redundancy = redundancy,
        };

        await using var driver = new OpcUaClientDriver(driverOptions, "opcua-redundancy-smoke");
        await driver.InitializeAsync("{}", TestContext.Current.CancellationToken);

        // Peer discovery is best-effort (opc-plc doesn't advertise itself in
        // ServerUriArray), so only the presence of the counters is asserted here —
        // operators must see a stable diagnostics surface either way.
        var snapshot = driver.GetHealth().Diagnostics;
        snapshot.ShouldNotBeNull();
        snapshot!.ShouldContainKey("RedundancyFailoverCount");
        snapshot.ShouldContainKey("RedundancyFailoverFailures");
    }
}

View File

@@ -0,0 +1,278 @@
using System.Text.Json;
using Shouldly;
using Xunit;
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
/// <summary>
/// Unit tests for upstream-redundancy failover (PR-14, issue #286). The driver
/// exposes two test seams — <see cref="OpcUaClientDriver.InjectServiceLevelDropForTest"/>
/// and <see cref="OpcUaClientDriver.RedundancyFailoverHookForTest"/> — that bypass
/// the SDK's session-create + TransferSubscriptions machinery so we can assert the
/// decision logic without standing up two real OPC UA sessions.
/// </summary>
/// <remarks>
/// Failover is dispatched fire-and-forget by the driver, so every positive assertion
/// on a hook invocation or counter first polls for it via <see cref="Wait"/>.
/// </remarks>
[Trait("Category", "Unit")]
public sealed class OpcUaClientRedundancyTests
{
    private const string Primary = "opc.tcp://primary:4840";
    private const string Secondary = "opc.tcp://secondary:4840";

    [Fact]
    public void Redundancy_options_default_to_disabled()
    {
        var opts = new OpcUaClientDriverOptions();

        opts.Redundancy.ShouldNotBeNull();
        opts.Redundancy.Enabled.ShouldBeFalse(
            "default deployments do client-side failover via EndpointUrls; upstream redundancy is opt-in");
        opts.Redundancy.ServiceLevelThreshold.ShouldBe((ushort)200,
            "OPC UA spec convention: 200+ = healthy, lower = degraded");
        opts.Redundancy.ResolvedRecheckInterval.ShouldBe(TimeSpan.FromSeconds(5));
    }

    [Fact]
    public void DTO_json_round_trip_preserves_redundancy_settings()
    {
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(
                Enabled: true,
                ServiceLevelThreshold: 150,
                RecheckInterval: TimeSpan.FromSeconds(10)),
        };

        var json = JsonSerializer.Serialize(opts);
        var roundTripped = JsonSerializer.Deserialize<OpcUaClientDriverOptions>(json);

        roundTripped.ShouldNotBeNull();
        roundTripped!.Redundancy.Enabled.ShouldBeTrue();
        roundTripped.Redundancy.ServiceLevelThreshold.ShouldBe((ushort)150);
        roundTripped.Redundancy.ResolvedRecheckInterval.ShouldBe(TimeSpan.FromSeconds(10));
    }

    [Fact]
    public void Disabled_redundancy_does_not_failover_on_low_servicelevel()
    {
        // Even a value of 0 (unrecoverable per the spec) should be a no-op when the
        // feature is disabled — the driver shouldn't be reading ServerArray or watching
        // ServiceLevel at all in that mode.
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(Enabled: false),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-disabled");
        var hookFired = false;
        drv.RedundancyFailoverHookForTest = (_, _) => { hookFired = true; return Task.FromResult(true); };

        drv.InjectServiceLevelDropForTest(0);

        hookFired.ShouldBeFalse(
            "Redundancy.Enabled=false means ServiceLevel drops must not trigger failover");
        drv.RedundancyFailoverInvocationsForTest.ShouldBe(0);
    }

    [Fact]
    public void ServiceLevel_above_threshold_does_not_trigger_failover()
    {
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(Enabled: true, ServiceLevelThreshold: 200),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-healthy");
        SeedPeers(drv, Secondary);
        var hookFired = false;
        drv.RedundancyFailoverHookForTest = (_, _) => { hookFired = true; return Task.FromResult(true); };

        // Equal to threshold = healthy boundary; spec semantics treat 200 as healthy.
        drv.InjectServiceLevelDropForTest(200);
        // Just above threshold = healthy.
        drv.InjectServiceLevelDropForTest(220);

        hookFired.ShouldBeFalse(
            "ServiceLevel >= threshold must not trigger failover — healthy primary stays put");
    }

    [Fact]
    public void ServiceLevel_below_threshold_triggers_failover_with_secondary_uri()
    {
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(Enabled: true, ServiceLevelThreshold: 200),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-failover");
        SeedPeers(drv, Primary, Secondary);
        SeedActive(drv, Primary);
        string? failoverTarget = null;
        drv.RedundancyFailoverHookForTest = (uri, _) =>
        {
            failoverTarget = uri;
            return Task.FromResult(true);
        };

        drv.InjectServiceLevelDropForTest(50);

        // The driver dispatches FailoverAsync via discard — poll until the hook has run.
        Wait(() => failoverTarget is not null)
            .ShouldBeTrue("the failover hook was not invoked within the wait window");
        failoverTarget.ShouldBe(Secondary,
            "the failover path picks the next URI in ServerArray that isn't the active one");

        // The hook running does not imply the success counter has been bumped yet (the
        // failover continues on its own task), so poll for the counter before asserting
        // its exact value — same approach as the failure-counter test below.
        Wait(() => drv.GetHealth().Diagnostics is { } d
                   && d.TryGetValue("RedundancyFailoverCount", out var c) && c >= 1)
            .ShouldBeTrue("RedundancyFailoverCount was not incremented within the wait window");

        var diags1 = drv.GetHealth().Diagnostics;
        diags1.ShouldNotBeNull();
        diags1!.ShouldContainKey("RedundancyFailoverCount");
        diags1["RedundancyFailoverCount"].ShouldBe(1);
    }

    [Fact]
    public void Empty_peer_list_does_not_trigger_failover()
    {
        // Upstream with RedundancySupport=None (or one that simply doesn't expose the
        // ServerUriArray node) leaves _redundancyPeers empty. ServiceLevel drops in that
        // mode are diagnostic-only — the driver has no peer to swap to.
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(Enabled: true),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-no-peers");
        var hookFired = false;
        drv.RedundancyFailoverHookForTest = (_, _) => { hookFired = true; return Task.FromResult(true); };

        drv.InjectServiceLevelDropForTest(50);

        hookFired.ShouldBeFalse(
            "ServerArray empty means there's nowhere to fail over to — drop is informational only");
    }

    [Fact]
    public void Failover_with_only_active_uri_in_peer_list_does_not_swap_to_self()
    {
        // Edge case: the upstream advertises itself in ServerUriArray but no actual peers.
        // The driver must not try to fail over to the URI it's already on.
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(Enabled: true),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-self-only");
        SeedPeers(drv, Primary);
        SeedActive(drv, Primary);
        var hookFired = false;
        drv.RedundancyFailoverHookForTest = (_, _) => { hookFired = true; return Task.FromResult(true); };

        drv.InjectServiceLevelDropForTest(50);

        hookFired.ShouldBeFalse(
            "the only peer in the list is the active URI itself — there's nothing to swap to");
    }

    [Fact]
    public void Failover_failure_increments_failures_counter_and_keeps_session()
    {
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(Enabled: true),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-failure");
        SeedPeers(drv, Primary, Secondary);
        SeedActive(drv, Primary);
        drv.RedundancyFailoverHookForTest = (_, _) => Task.FromResult(false);

        drv.InjectServiceLevelDropForTest(50);

        Wait(() => drv.GetHealth().Diagnostics is { } d
                   && d.TryGetValue("RedundancyFailoverFailures", out var f) && f >= 1)
            .ShouldBeTrue("RedundancyFailoverFailures was not incremented within the wait window");

        var diags = drv.GetHealth().Diagnostics;
        diags.ShouldNotBeNull();
        diags!.ShouldContainKey("RedundancyFailoverFailures");
        diags["RedundancyFailoverFailures"].ShouldBe(1);
        diags["RedundancyFailoverCount"].ShouldBe(0,
            "a failed swap must not bump the success counter");
    }

    [Fact]
    public void Repeated_drops_within_recheck_interval_only_failover_once()
    {
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(
                Enabled: true,
                ServiceLevelThreshold: 200,
                RecheckInterval: TimeSpan.FromMinutes(5)),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-debounce");
        SeedPeers(drv, Primary, Secondary);
        SeedActive(drv, Primary);
        var calls = 0;
        drv.RedundancyFailoverHookForTest = (_, _) =>
        {
            Interlocked.Increment(ref calls);
            return Task.FromResult(true);
        };

        drv.InjectServiceLevelDropForTest(50);
        Wait(() => Volatile.Read(ref calls) >= 1)
            .ShouldBeTrue("the first drop did not reach the failover hook within the wait window");

        drv.InjectServiceLevelDropForTest(50);
        drv.InjectServiceLevelDropForTest(40);

        // RecheckInterval = 5 minutes — the second + third drops should be suppressed
        // because the first failover landed inside the window.
        Volatile.Read(ref calls).ShouldBe(1,
            "RecheckInterval suppresses oscillation around the threshold so a flapping primary doesn't ping-pong");
    }

    [Fact]
    public void Diagnostics_exposes_redundancy_counters_in_snapshot()
    {
        // The `driver-diagnostics` RPC reads through GetHealth(); operators expect the
        // redundancy counters in the snapshot regardless of whether failover ever fired.
        var opts = new OpcUaClientDriverOptions
        {
            EndpointUrl = Primary,
            Redundancy = new RedundancyOptions(Enabled: true),
        };
        using var drv = new OpcUaClientDriver(opts, "opcua-redundancy-diag");

        var d = drv.GetHealth().Diagnostics;

        d.ShouldNotBeNull();
        d!.ShouldContainKey("RedundancyFailoverCount");
        d.ShouldContainKey("RedundancyFailoverFailures");
        d["RedundancyFailoverCount"].ShouldBe(0);
        d["RedundancyFailoverFailures"].ShouldBe(0);
    }

    // ---- helpers ----

    /// <summary>
    /// Seeds the driver's private <c>_redundancyPeers</c> field via reflection. The
    /// alternative (mocking ISession) brings most of the OPC UA SDK into the test surface.
    /// </summary>
    /// <exception cref="InvalidOperationException">The field no longer exists under that name.</exception>
    private static void SeedPeers(OpcUaClientDriver drv, params string[] peers)
    {
        var field = typeof(OpcUaClientDriver).GetField(
                        "_redundancyPeers",
                        System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)
                    ?? throw new InvalidOperationException(
                        "OpcUaClientDriver._redundancyPeers was not found — update SeedPeers if the field was renamed.");
        field.SetValue(drv, (IReadOnlyList<string>)peers);
    }

    /// <summary>Marks <paramref name="uri"/> as the active server through the driver's diagnostics test seam.</summary>
    private static void SeedActive(OpcUaClientDriver drv, string uri)
    {
        var diag = drv.DiagnosticsForTest;
        diag.SetActiveServerUri(uri);
    }

    /// <summary>
    /// Polls <paramref name="predicate"/> every 10 ms until it holds or
    /// <paramref name="timeoutMs"/> elapses (measured on a monotonic clock).
    /// </summary>
    /// <returns><c>true</c> if the predicate held before the deadline; otherwise <c>false</c>.</returns>
    private static bool Wait(Func<bool> predicate, int timeoutMs = 2000)
    {
        var elapsed = System.Diagnostics.Stopwatch.StartNew();
        while (elapsed.ElapsedMilliseconds < timeoutMs)
        {
            if (predicate())
            {
                return true;
            }

            Thread.Sleep(10);
        }

        // One last look so a condition that became true during the final sleep still counts.
        return predicate();
    }
}