257 lines
11 KiB
C#
257 lines
11 KiB
C#
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
|
using ZB.MOM.WW.CBDDC.Core;
|
|
using ZB.MOM.WW.CBDDC.Core.Storage;
|
|
using ZB.MOM.WW.CBDDC.Hosting.Configuration;
|
|
using ZB.MOM.WW.CBDDC.Hosting.HealthChecks;
|
|
|
|
namespace ZB.MOM.WW.CBDDC.Hosting.Tests;
|
|
|
|
public class CBDDCHealthCheckTests
{
    /// <summary>
    /// With the store's latest wall clock at 1000 and two peers confirmed at 995 and 990, thresholds of 20/50
    /// must produce a healthy result whose payload reports both peers, a max lag of 10 and no problem peers.
    /// </summary>
    [Fact]
    public async Task CheckHealthAsync_WhenPersistenceOkAndPeersWithinLagThreshold_ReturnsHealthyWithPayload()
    {
        DateTimeOffset firstPeerConfirmedAt = DateTimeOffset.UtcNow.AddSeconds(-5);
        DateTimeOffset secondPeerConfirmedAt = DateTimeOffset.UtcNow.AddSeconds(-2);
        IOplogStore oplogStore = CreateStoreWithLatestTimestamp();
        IPeerOplogConfirmationStore confirmations = CreateConfirmationStore("peer-1", "peer-2");

        StubConfirmations(
            confirmations,
            "peer-1",
            new PeerOplogConfirmation
            {
                PeerNodeId = "peer-1",
                SourceNodeId = "source-1",
                ConfirmedWall = 995,
                ConfirmedLogic = 0,
                LastConfirmedUtc = firstPeerConfirmedAt,
                IsActive = true
            });
        StubConfirmations(
            confirmations,
            "peer-2",
            new PeerOplogConfirmation
            {
                PeerNodeId = "peer-2",
                SourceNodeId = "source-1",
                ConfirmedWall = 990,
                ConfirmedLogic = 0,
                LastConfirmedUtc = secondPeerConfirmedAt,
                IsActive = true
            });

        var options = CreateOptions(lagThresholdMs: 20, criticalLagThresholdMs: 50);
        var sut = new CBDDCHealthCheck(oplogStore, confirmations, options);

        var outcome = await sut.CheckHealthAsync(new HealthCheckContext());

        outcome.Status.ShouldBe(HealthStatus.Healthy);
        outcome.Data["trackedPeerCount"].ShouldBe(2);
        outcome.Data["maxLagMs"].ShouldBe(10L);

        var laggingPeers = outcome.Data["laggingPeers"].ShouldBeOfType<List<string>>();
        laggingPeers.Count.ShouldBe(0);

        var unconfirmedPeers = outcome.Data["peersWithNoConfirmation"].ShouldBeOfType<List<string>>();
        unconfirmedPeers.Count.ShouldBe(0);

        var confirmedAtByPeer = outcome.Data["lastSuccessfulConfirmationUpdateByPeer"]
            .ShouldBeOfType<Dictionary<string, DateTimeOffset?>>();
        confirmedAtByPeer["peer-1"].ShouldBe(firstPeerConfirmedAt);
        confirmedAtByPeer["peer-2"].ShouldBe(secondPeerConfirmedAt);
    }

    /// <summary>
    /// One peer confirmed at 960 (against a latest wall clock of 1000 and a lag threshold of 30) plus one peer
    /// without any confirmation must produce a degraded result that names both peers in the payload.
    /// </summary>
    [Fact]
    public async Task CheckHealthAsync_WhenPeersLaggingOrUnconfirmed_ReturnsDegradedWithPayload()
    {
        DateTimeOffset laggingPeerConfirmedAt = DateTimeOffset.UtcNow.AddSeconds(-10);
        IOplogStore oplogStore = CreateStoreWithLatestTimestamp();
        IPeerOplogConfirmationStore confirmations = CreateConfirmationStore("peer-1", "peer-2", "peer-3");

        StubConfirmations(
            confirmations,
            "peer-1",
            new PeerOplogConfirmation
            {
                PeerNodeId = "peer-1",
                SourceNodeId = "source-1",
                ConfirmedWall = 960,
                ConfirmedLogic = 0,
                LastConfirmedUtc = laggingPeerConfirmedAt,
                IsActive = true
            });

        // peer-2 is tracked but has never confirmed anything.
        StubConfirmations(confirmations, "peer-2", Array.Empty<PeerOplogConfirmation>());

        StubConfirmations(
            confirmations,
            "peer-3",
            new PeerOplogConfirmation
            {
                PeerNodeId = "peer-3",
                SourceNodeId = "source-1",
                ConfirmedWall = 995,
                ConfirmedLogic = 0,
                LastConfirmedUtc = DateTimeOffset.UtcNow.AddSeconds(-4),
                IsActive = true
            });

        var options = CreateOptions(lagThresholdMs: 30, criticalLagThresholdMs: 100);
        var sut = new CBDDCHealthCheck(oplogStore, confirmations, options);

        var outcome = await sut.CheckHealthAsync(new HealthCheckContext());

        outcome.Status.ShouldBe(HealthStatus.Degraded);
        outcome.Data["trackedPeerCount"].ShouldBe(3);
        outcome.Data["maxLagMs"].ShouldBe(40L);

        var laggingPeers = outcome.Data["laggingPeers"].ShouldBeOfType<List<string>>();
        laggingPeers.ShouldContain("peer-1");

        var unconfirmedPeers = outcome.Data["peersWithNoConfirmation"].ShouldBeOfType<List<string>>();
        unconfirmedPeers.ShouldContain("peer-2");

        var confirmedAtByPeer = outcome.Data["lastSuccessfulConfirmationUpdateByPeer"]
            .ShouldBeOfType<Dictionary<string, DateTimeOffset?>>();
        confirmedAtByPeer["peer-1"].ShouldBe(laggingPeerConfirmedAt);
        confirmedAtByPeer["peer-2"].ShouldBeNull();
    }

    /// <summary>
    /// A single peer confirmed at 850 (against a latest wall clock of 1000 and a critical threshold of 80)
    /// must produce an unhealthy result reporting a max lag of 150 and listing the peer as lagging.
    /// </summary>
    [Fact]
    public async Task CheckHealthAsync_WhenCriticalLagBreached_ReturnsUnhealthyWithPayload()
    {
        IOplogStore oplogStore = CreateStoreWithLatestTimestamp();
        IPeerOplogConfirmationStore confirmations = CreateConfirmationStore("peer-critical");

        StubConfirmations(
            confirmations,
            "peer-critical",
            new PeerOplogConfirmation
            {
                PeerNodeId = "peer-critical",
                SourceNodeId = "source-1",
                ConfirmedWall = 850,
                ConfirmedLogic = 0,
                LastConfirmedUtc = DateTimeOffset.UtcNow.AddMinutes(-1),
                IsActive = true
            });

        var options = CreateOptions(lagThresholdMs: 30, criticalLagThresholdMs: 80);
        var sut = new CBDDCHealthCheck(oplogStore, confirmations, options);

        var outcome = await sut.CheckHealthAsync(new HealthCheckContext());

        outcome.Status.ShouldBe(HealthStatus.Unhealthy);
        outcome.Data["maxLagMs"].ShouldBe(150L);

        var laggingPeers = outcome.Data["laggingPeers"].ShouldBeOfType<List<string>>();
        laggingPeers.ShouldContain("peer-critical");
    }

    /// <summary>
    /// When one peer has confirmations from two sources (995 and 900), the larger lag of 100 is the one that
    /// must be reported, which breaches the lag threshold of 80 and yields a degraded result.
    /// </summary>
    [Fact]
    public async Task CheckHealthAsync_WhenPeerHasMultipleSourceConfirmations_UsesWorstCaseLag()
    {
        IOplogStore oplogStore = CreateStoreWithLatestTimestamp();
        IPeerOplogConfirmationStore confirmations = CreateConfirmationStore("peer-1");

        var fromFastSource = new PeerOplogConfirmation
        {
            PeerNodeId = "peer-1",
            SourceNodeId = "source-fast",
            ConfirmedWall = 995,
            ConfirmedLogic = 0,
            LastConfirmedUtc = DateTimeOffset.UtcNow.AddSeconds(-1),
            IsActive = true
        };
        var fromSlowSource = new PeerOplogConfirmation
        {
            PeerNodeId = "peer-1",
            SourceNodeId = "source-slow",
            ConfirmedWall = 900,
            ConfirmedLogic = 0,
            LastConfirmedUtc = DateTimeOffset.UtcNow.AddSeconds(-10),
            IsActive = true
        };
        StubConfirmations(confirmations, "peer-1", fromFastSource, fromSlowSource);

        var options = CreateOptions(lagThresholdMs: 80, criticalLagThresholdMs: 150);
        var sut = new CBDDCHealthCheck(oplogStore, confirmations, options);

        var outcome = await sut.CheckHealthAsync(new HealthCheckContext());

        outcome.Status.ShouldBe(HealthStatus.Degraded);
        outcome.Data["maxLagMs"].ShouldBe(100L);

        var laggingPeers = outcome.Data["laggingPeers"].ShouldBeOfType<List<string>>();
        laggingPeers.ShouldContain("peer-1");
    }

    /// <summary>
    /// A faulted latest-timestamp lookup on the oplog store must surface as an unhealthy result that carries
    /// the original exception and a description mentioning the unavailable persistence layer.
    /// </summary>
    [Fact]
    public async Task CheckHealthAsync_WhenStoreThrows_ReturnsUnhealthy()
    {
        var failure = new InvalidOperationException("store unavailable");
        var oplogStore = Substitute.For<IOplogStore>();
        oplogStore.GetLatestTimestampAsync(Arg.Any<CancellationToken>())
            .Returns(Task.FromException<HlcTimestamp>(failure));

        // The confirmation store is deliberately left unconfigured for this scenario.
        var confirmations = Substitute.For<IPeerOplogConfirmationStore>();

        var sut = new CBDDCHealthCheck(oplogStore, confirmations, CreateOptions());

        var outcome = await sut.CheckHealthAsync(new HealthCheckContext());

        outcome.Status.ShouldBe(HealthStatus.Unhealthy);
        outcome.Exception.ShouldBe(failure);
        outcome.Description.ShouldNotBeNull();
        outcome.Description.ShouldContain("persistence layer is unavailable");
    }

    /// <summary>
    /// Builds an oplog store substitute whose latest timestamp is the HLC value (1000, 0, "node-1").
    /// </summary>
    private static IOplogStore CreateStoreWithLatestTimestamp()
    {
        var oplogStore = Substitute.For<IOplogStore>();
        oplogStore.GetLatestTimestampAsync(Arg.Any<CancellationToken>())
            .Returns(new HlcTimestamp(1_000, 0, "node-1"));
        return oplogStore;
    }

    /// <summary>
    /// Builds a confirmation store substitute that reports the given peers as the active tracked peers.
    /// </summary>
    private static IPeerOplogConfirmationStore CreateConfirmationStore(params string[] trackedPeers)
    {
        var confirmations = Substitute.For<IPeerOplogConfirmationStore>();
        confirmations.GetActiveTrackedPeersAsync(Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IEnumerable<string>>(trackedPeers));
        return confirmations;
    }

    /// <summary>
    /// Configures the confirmation store substitute to return the supplied confirmations for one peer.
    /// </summary>
    private static void StubConfirmations(
        IPeerOplogConfirmationStore confirmationStore,
        string peerNodeId,
        params PeerOplogConfirmation[] peerConfirmations)
    {
        confirmationStore.GetConfirmationsForPeerAsync(peerNodeId, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IEnumerable<PeerOplogConfirmation>>(peerConfirmations));
    }

    /// <summary>
    /// Creates hosting options whose cluster section carries the given lag and critical lag thresholds.
    /// </summary>
    private static CBDDCHostingOptions CreateOptions(
        long lagThresholdMs = 30_000,
        long criticalLagThresholdMs = 120_000) =>
        new CBDDCHostingOptions
        {
            Cluster = new ClusterOptions
            {
                PeerConfirmationLagThresholdMs = lagThresholdMs,
                PeerConfirmationCriticalLagThresholdMs = criticalLagThresholdMs
            }
        };
}
|