Phase 6.3 Stream B — peer-probe HostedServices populating PeerReachabilityTracker

Closes task #116 (GA hardening backlog). Before this commit the
RedundancyStatePublisher saw PeerReachability.Unknown for every peer
because the tracker had no writers — every healthy peer got
degraded to the Isolated-Primary band (230) even when fully reachable.
Not release-blocking (safe default), but not the full non-transparent-
redundancy UX either.

Two-layer probe model per docs/v2/implementation/phase-6-3-redundancy-runtime.md
§Stream B:

- PeerHttpProbeLoop (Stream B.1) — fast-fail layer at 2 s / 1 s timeout.
  Hits each peer's http://{Host}:{DashboardPort}/healthz via an injected
  IHttpClientFactory. Writes the HTTP bit of PeerReachability while
  preserving the UA bit from the last UA probe so a transient HTTP blip
  doesn't clobber the authoritative UA reading.

- PeerUaProbeLoop (Stream B.2) — authoritative layer at 10 s / 5 s
  timeout. Calls DiscoveryClient.GetEndpoints against opc.tcp://{Host}:
  {OpcUaPort} — cheap compared to a full Session.Create, no cert trust
  required. Short-circuits when the HTTP probe last reported the peer
  unhealthy (no wasted handshakes on a known-dead endpoint), clearing
  the stale UaHealthy bit in that case.

Both inherit from BackgroundService, follow the tick/delay/catch pattern
RedundancyPublisherHostedService + ResilienceStatusPublisherHostedService
established, and expose TickAsync() as internal for test drive-through.

New PeerProbeOptions class carries the four intervals/timeouts so
operators can tune cadence per site. Registered as singleton in Program.cs;
HTTP client registered by name so the OtOpcUa handler chain
(Serilog enrichers, potential future OpenTelemetry instrumentation) isn't
bypassed.

Tests — 9 new unit tests across PeerHttpProbeLoopTests (5) and
PeerUaProbeLoopTests (4). All pass. Server.Tests total 243 → 252.
Full solution build clean.

Docs: v2-release-readiness.md Phase 6.3 follow-ups list marks the
peer-probe bullet struck-through with a close-out note.

Still deferred in Phase 6.3:
  - OPC UA variable-node binding (task #117 — ServiceLevel + ServerUriArray)
  - sp_PublishGeneration lease wrap (task #118)
  - Client interop matrix (task #119)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-24 14:53:38 -04:00
parent 96918b148c
commit de77d42eab
8 changed files with 581 additions and 1 deletions

View File

@@ -0,0 +1,161 @@
using System.Net;
using System.Net.Http;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
using ZB.MOM.WW.OtOpcUa.Server.Hosting;
using ZB.MOM.WW.OtOpcUa.Server.Redundancy;
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
/// <summary>
/// Unit tests for <see cref="PeerHttpProbeLoop"/>. Drives <c>TickAsync</c> synchronously
/// via a <see cref="IHttpClientFactory"/> test double so we don't race the loop's
/// <c>Task.Delay</c>.
/// </summary>
[Trait("Category", "Unit")]
public sealed class PeerHttpProbeLoopTests : IDisposable
{
private readonly OtOpcUaConfigDbContext _db;
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbFactory;
public PeerHttpProbeLoopTests()
{
var opts = new DbContextOptionsBuilder<OtOpcUaConfigDbContext>()
.UseInMemoryDatabase($"peer-http-{Guid.NewGuid():N}")
.Options;
_db = new OtOpcUaConfigDbContext(opts);
_dbFactory = new DbContextFactory(opts);
}
public void Dispose() => _db.Dispose();
[Fact]
public async Task Tick_with_no_peers_is_a_no_op()
{
var tracker = new PeerReachabilityTracker();
var coordinator = await SeedAndInitializeAsync("A", ("A", RedundancyRole.Primary, "urn:A"));
var loop = new PeerHttpProbeLoop(coordinator, tracker,
new StubHttpClientFactory(_ => new HttpResponseMessage(HttpStatusCode.OK)),
NullLogger<PeerHttpProbeLoop>.Instance);
await loop.TickAsync(CancellationToken.None);
tracker.Get("B").ShouldBe(PeerReachability.Unknown);
}
[Fact]
public async Task Tick_marks_peer_healthy_when_healthz_returns_200()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
var factory = new StubHttpClientFactory(req =>
{
req.RequestUri!.AbsolutePath.ShouldBe("/healthz");
return new HttpResponseMessage(HttpStatusCode.OK);
});
var loop = new PeerHttpProbeLoop(coordinator, tracker, factory, NullLogger<PeerHttpProbeLoop>.Instance);
await loop.TickAsync(CancellationToken.None);
tracker.Get("B").HttpHealthy.ShouldBeTrue();
}
[Fact]
public async Task Tick_marks_peer_unhealthy_when_healthz_throws()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
var factory = new StubHttpClientFactory(_ => throw new HttpRequestException("no route to host"));
var loop = new PeerHttpProbeLoop(coordinator, tracker, factory, NullLogger<PeerHttpProbeLoop>.Instance);
await loop.TickAsync(CancellationToken.None);
tracker.Get("B").HttpHealthy.ShouldBeFalse();
}
[Fact]
public async Task Tick_preserves_UaHealthy_bit_when_flipping_HttpHealthy()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
tracker.Update("B", new PeerReachability(HttpHealthy: false, UaHealthy: true));
var factory = new StubHttpClientFactory(_ => new HttpResponseMessage(HttpStatusCode.OK));
var loop = new PeerHttpProbeLoop(coordinator, tracker, factory, NullLogger<PeerHttpProbeLoop>.Instance);
await loop.TickAsync(CancellationToken.None);
var current = tracker.Get("B");
current.HttpHealthy.ShouldBeTrue();
current.UaHealthy.ShouldBeTrue("UA bit must not be clobbered by the HTTP probe");
}
[Fact]
public async Task Tick_marks_peer_unhealthy_on_non_2xx_response()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
var factory = new StubHttpClientFactory(_ => new HttpResponseMessage(HttpStatusCode.ServiceUnavailable));
var loop = new PeerHttpProbeLoop(coordinator, tracker, factory, NullLogger<PeerHttpProbeLoop>.Instance);
await loop.TickAsync(CancellationToken.None);
tracker.Get("B").HttpHealthy.ShouldBeFalse();
}
// ---- fixture helpers ---------------------------------------------------
private async Task<RedundancyCoordinator> SeedAndInitializeAsync(string selfNodeId, params (string id, RedundancyRole role, string appUri)[] nodes)
{
_db.ServerClusters.Add(new ServerCluster
{
ClusterId = "c1", Name = "Warsaw", Enterprise = "zb", Site = "warsaw",
RedundancyMode = nodes.Length == 1 ? RedundancyMode.None : RedundancyMode.Warm,
CreatedBy = "test",
});
foreach (var (id, role, appUri) in nodes)
{
_db.ClusterNodes.Add(new ClusterNode
{
NodeId = id, ClusterId = "c1",
RedundancyRole = role, Host = id.ToLowerInvariant(),
ApplicationUri = appUri, CreatedBy = "test",
});
}
await _db.SaveChangesAsync();
var coordinator = new RedundancyCoordinator(_dbFactory, NullLogger<RedundancyCoordinator>.Instance, selfNodeId, "c1");
await coordinator.InitializeAsync(CancellationToken.None);
return coordinator;
}
private sealed class DbContextFactory(DbContextOptions<OtOpcUaConfigDbContext> options)
: IDbContextFactory<OtOpcUaConfigDbContext>
{
public OtOpcUaConfigDbContext CreateDbContext() => new(options);
}
private sealed class StubHttpClientFactory(Func<HttpRequestMessage, HttpResponseMessage> respond) : IHttpClientFactory
{
public HttpClient CreateClient(string name) =>
new(new StubHandler(respond), disposeHandler: true) { Timeout = TimeSpan.FromSeconds(1) };
private sealed class StubHandler(Func<HttpRequestMessage, HttpResponseMessage> respond) : HttpMessageHandler
{
protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
=> Task.FromResult(respond(request));
}
}
}

View File

@@ -0,0 +1,146 @@
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
using ZB.MOM.WW.OtOpcUa.Server.Hosting;
using ZB.MOM.WW.OtOpcUa.Server.Redundancy;
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
/// <summary>
/// Unit tests for <see cref="PeerUaProbeLoop"/>. Drives <c>TickAsync</c> synchronously
/// with an injected endpoint-probe delegate so no real OPC UA server is needed.
/// </summary>
[Trait("Category", "Unit")]
public sealed class PeerUaProbeLoopTests : IDisposable
{
private readonly OtOpcUaConfigDbContext _db;
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbFactory;
public PeerUaProbeLoopTests()
{
var opts = new DbContextOptionsBuilder<OtOpcUaConfigDbContext>()
.UseInMemoryDatabase($"peer-ua-{Guid.NewGuid():N}")
.Options;
_db = new OtOpcUaConfigDbContext(opts);
_dbFactory = new DbContextFactory(opts);
}
public void Dispose() => _db.Dispose();
[Fact]
public async Task Tick_short_circuits_when_HttpHealthy_is_false()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
tracker.Update("B", new PeerReachability(HttpHealthy: false, UaHealthy: true));
var probeCallCount = 0;
var loop = new PeerUaProbeLoop(coordinator, tracker, NullLogger<PeerUaProbeLoop>.Instance,
options: null,
endpointProbe: (_, _, _) => { probeCallCount++; return Task.FromResult(true); });
await loop.TickAsync(CancellationToken.None);
probeCallCount.ShouldBe(0, "UA probe must not run when HTTP reports the peer unhealthy");
var current = tracker.Get("B");
current.HttpHealthy.ShouldBeFalse();
current.UaHealthy.ShouldBeFalse("stale UaHealthy=true must be cleared when HTTP says dead");
}
[Fact]
public async Task Tick_marks_UaHealthy_true_when_probe_succeeds()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
tracker.Update("B", new PeerReachability(HttpHealthy: true, UaHealthy: false));
string? calledEndpoint = null;
var loop = new PeerUaProbeLoop(coordinator, tracker, NullLogger<PeerUaProbeLoop>.Instance,
options: null,
endpointProbe: (endpoint, _, _) => { calledEndpoint = endpoint; return Task.FromResult(true); });
await loop.TickAsync(CancellationToken.None);
calledEndpoint.ShouldNotBeNull();
calledEndpoint!.ShouldStartWith("opc.tcp://b:");
tracker.Get("B").UaHealthy.ShouldBeTrue();
}
[Fact]
public async Task Tick_marks_UaHealthy_false_when_probe_fails()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
tracker.Update("B", new PeerReachability(HttpHealthy: true, UaHealthy: true));
var loop = new PeerUaProbeLoop(coordinator, tracker, NullLogger<PeerUaProbeLoop>.Instance,
options: null,
endpointProbe: (_, _, _) => Task.FromResult(false));
await loop.TickAsync(CancellationToken.None);
tracker.Get("B").UaHealthy.ShouldBeFalse();
}
[Fact]
public async Task Tick_preserves_HttpHealthy_bit_across_UA_update()
{
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
tracker.Update("B", new PeerReachability(HttpHealthy: true, UaHealthy: false));
var loop = new PeerUaProbeLoop(coordinator, tracker, NullLogger<PeerUaProbeLoop>.Instance,
options: null,
endpointProbe: (_, _, _) => Task.FromResult(true));
await loop.TickAsync(CancellationToken.None);
var current = tracker.Get("B");
current.HttpHealthy.ShouldBeTrue("HTTP bit must not be clobbered by the UA probe");
current.UaHealthy.ShouldBeTrue();
}
// ---- fixture helpers ---------------------------------------------------
private async Task<RedundancyCoordinator> SeedAndInitializeAsync(string selfNodeId, params (string id, RedundancyRole role, string appUri)[] nodes)
{
_db.ServerClusters.Add(new ServerCluster
{
ClusterId = "c1", Name = "Warsaw", Enterprise = "zb", Site = "warsaw",
RedundancyMode = nodes.Length == 1 ? RedundancyMode.None : RedundancyMode.Warm,
CreatedBy = "test",
});
foreach (var (id, role, appUri) in nodes)
{
_db.ClusterNodes.Add(new ClusterNode
{
NodeId = id, ClusterId = "c1",
RedundancyRole = role, Host = id.ToLowerInvariant(),
ApplicationUri = appUri, CreatedBy = "test",
});
}
await _db.SaveChangesAsync();
var coordinator = new RedundancyCoordinator(_dbFactory, NullLogger<RedundancyCoordinator>.Instance, selfNodeId, "c1");
await coordinator.InitializeAsync(CancellationToken.None);
return coordinator;
}
private sealed class DbContextFactory(DbContextOptions<OtOpcUaConfigDbContext> options)
: IDbContextFactory<OtOpcUaConfigDbContext>
{
public OtOpcUaConfigDbContext CreateDbContext() => new(options);
}
}