using Microsoft.Extensions.Diagnostics.HealthChecks;
using ZB.MOM.WW.CBDDC.Core.Storage;
using ZB.MOM.WW.CBDDC.Hosting.Configuration;
namespace ZB.MOM.WW.CBDDC.Hosting.HealthChecks;
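/// <summary>
/// Health check for the CBDDC persistence layer.
/// Verifies that the oplog store can be queried and reports how far tracked peers
/// lag behind the local oplog head in their confirmations.
/// </summary>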
public class CBDDCHealthCheck : IHealthCheck
{
    private readonly IOplogStore _oplogStore;
    private readonly CBDDCHostingOptions _options;
    private readonly IPeerOplogConfirmationStore _peerOplogConfirmationStore;

    /// <summary>
    /// Initializes a new instance of the <see cref="CBDDCHealthCheck"/> class.
    /// </summary>
    /// <param name="oplogStore">The oplog store used to verify persistence health.</param>
    /// <param name="peerOplogConfirmationStore">The peer confirmation store used for confirmation lag health checks.</param>
    /// <param name="options">Hosting options containing health lag thresholds.</param>
    /// <exception cref="ArgumentNullException">Thrown when any argument is <see langword="null"/>.</exception>
    public CBDDCHealthCheck(
        IOplogStore oplogStore,
        IPeerOplogConfirmationStore peerOplogConfirmationStore,
        CBDDCHostingOptions options)
    {
        _oplogStore = oplogStore ?? throw new ArgumentNullException(nameof(oplogStore));
        _peerOplogConfirmationStore = peerOplogConfirmationStore ??
                                      throw new ArgumentNullException(nameof(peerOplogConfirmationStore));
        _options = options ?? throw new ArgumentNullException(nameof(options));
    }

    /// <summary>
    /// Performs a health check against the CBDDC persistence layer.
    /// </summary>
    /// <remarks>
    /// The result is <c>Unhealthy</c> when at least one tracked peer exceeds the critical lag
    /// threshold or when a store call throws, <c>Degraded</c> when a peer exceeds the regular lag
    /// threshold or has no active confirmation, and <c>Healthy</c> otherwise.
    /// </remarks>
    /// <param name="context">The health check execution context.</param>
    /// <param name="cancellationToken">A token used to cancel the health check.</param>
    /// <returns>A <see cref="HealthCheckResult"/> describing the health status.</returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="cancellationToken"/> is cancelled while the check is running.
    /// </exception>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var localHead = await _oplogStore.GetLatestTimestampAsync(cancellationToken);

            // Normalise the peer list: drop blank ids, de-duplicate, and sort so the
            // reported payload is stable between runs.
            var trackedPeers = (await _peerOplogConfirmationStore.GetActiveTrackedPeersAsync(cancellationToken))
                .Where(peerNodeId => !string.IsNullOrWhiteSpace(peerNodeId))
                .Distinct(StringComparer.Ordinal)
                .OrderBy(peerNodeId => peerNodeId, StringComparer.Ordinal)
                .ToList();

            var peersWithNoConfirmation = new List<string>();
            var laggingPeers = new List<string>();
            var criticalLaggingPeers = new List<string>();

            // NOTE(review): the value type assumes LastConfirmedUtc is a DateTimeOffset
            // (a DateTime would also convert implicitly) - confirm against the
            // confirmation model, which is declared outside this file.
            var lastSuccessfulConfirmationUpdateByPeer =
                new Dictionary<string, DateTimeOffset?>(StringComparer.Ordinal);

            var maxLagMs = 0L;

            // Negative thresholds are clamped to zero, and the critical threshold is
            // never allowed to be lower than the regular one.
            long lagThresholdMs = Math.Max(0, _options.Cluster.PeerConfirmationLagThresholdMs);
            long criticalLagThresholdMs =
                Math.Max(lagThresholdMs, _options.Cluster.PeerConfirmationCriticalLagThresholdMs);

            foreach (string peerNodeId in trackedPeers)
            {
                var confirmations =
                    (await _peerOplogConfirmationStore.GetConfirmationsForPeerAsync(peerNodeId, cancellationToken))
                    .Where(confirmation => confirmation.IsActive)
                    .ToList();

                if (confirmations.Count == 0)
                {
                    peersWithNoConfirmation.Add(peerNodeId);
                    lastSuccessfulConfirmationUpdateByPeer[peerNodeId] = null;
                    continue;
                }

                // Report worst-case peer lag across source streams: only the smallest
                // confirmed wall-clock value matters, so a linear Min replaces the sort.
                var oldestConfirmedWall = confirmations.Min(confirmation => confirmation.ConfirmedWall);
                long lagMs = Math.Max(0, localHead.PhysicalTime - oldestConfirmedWall);
                maxLagMs = Math.Max(maxLagMs, lagMs);

                lastSuccessfulConfirmationUpdateByPeer[peerNodeId] =
                    confirmations.Max(confirmation => confirmation.LastConfirmedUtc);

                if (lagMs > lagThresholdMs)
                {
                    laggingPeers.Add(peerNodeId);
                }

                if (lagMs > criticalLagThresholdMs)
                {
                    criticalLaggingPeers.Add(peerNodeId);
                }
            }

            var payload = new Dictionary<string, object>
            {
                ["trackedPeerCount"] = trackedPeers.Count,
                ["peersWithNoConfirmation"] = peersWithNoConfirmation,
                ["maxLagMs"] = maxLagMs,
                ["laggingPeers"] = laggingPeers,
                ["lastSuccessfulConfirmationUpdateByPeer"] = lastSuccessfulConfirmationUpdateByPeer
            };

            if (criticalLaggingPeers.Count > 0)
            {
                return HealthCheckResult.Unhealthy(
                    $"CBDDC is unhealthy. Critical lag detected for {criticalLaggingPeers.Count} tracked peer(s).",
                    data: payload);
            }

            if (peersWithNoConfirmation.Count > 0 || laggingPeers.Count > 0)
            {
                return HealthCheckResult.Degraded(
                    $"CBDDC is degraded. Lagging peers: {laggingPeers.Count}, unconfirmed peers: {peersWithNoConfirmation.Count}.",
                    data: payload);
            }

            return HealthCheckResult.Healthy(
                $"CBDDC is healthy. Latest timestamp: {localHead.PhysicalTime}.",
                payload);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller abandoned the check; that says nothing about the persistence
            // layer, so propagate the cancellation instead of reporting a false outage.
            throw;
        }
        catch (Exception ex)
        {
            return HealthCheckResult.Unhealthy(
                "CBDDC persistence layer is unavailable",
                ex);
        }
    }
}