using Microsoft.Extensions.Diagnostics.HealthChecks;
using ZB.MOM.WW.CBDDC.Core.Storage;
using ZB.MOM.WW.CBDDC.Hosting.Configuration;

namespace ZB.MOM.WW.CBDDC.Hosting.HealthChecks;

/// <summary>
///     Health check for the CBDDC persistence layer.
///     Verifies that the oplog store can be queried and reports how far each tracked peer's
///     confirmations lag behind the local oplog head.
/// </summary>
public class CBDDCHealthCheck : IHealthCheck
{
    private readonly IOplogStore _oplogStore;
    private readonly CBDDCHostingOptions _options;
    private readonly IPeerOplogConfirmationStore _peerOplogConfirmationStore;

    /// <summary>
    ///     Initializes a new instance of the <see cref="CBDDCHealthCheck" /> class.
    /// </summary>
    /// <param name="oplogStore">The oplog store used to verify persistence health.</param>
    /// <param name="peerOplogConfirmationStore">The peer confirmation store used for confirmation lag health checks.</param>
    /// <param name="options">Hosting options containing health lag thresholds.</param>
    /// <exception cref="ArgumentNullException">Thrown when any argument is <see langword="null" />.</exception>
    public CBDDCHealthCheck(
        IOplogStore oplogStore,
        IPeerOplogConfirmationStore peerOplogConfirmationStore,
        CBDDCHostingOptions options)
    {
        _oplogStore = oplogStore ?? throw new ArgumentNullException(nameof(oplogStore));
        _peerOplogConfirmationStore = peerOplogConfirmationStore ??
                                      throw new ArgumentNullException(nameof(peerOplogConfirmationStore));
        _options = options ?? throw new ArgumentNullException(nameof(options));
    }

    /// <summary>
    ///     Performs a health check against the CBDDC persistence layer.
    /// </summary>
    /// <remarks>
    ///     The result is <c>Unhealthy</c> when at least one tracked peer exceeds the critical lag threshold
    ///     or when querying the stores throws; <c>Degraded</c> when a peer exceeds the lag threshold or has
    ///     no active confirmation; otherwise <c>Healthy</c>.
    /// </remarks>
    /// <param name="context">The health check execution context.</param>
    /// <param name="cancellationToken">A token used to cancel the health check.</param>
    /// <returns>A <see cref="HealthCheckResult" /> describing the health status.</returns>
    /// <exception cref="OperationCanceledException">
    ///     Propagated when a store call is cancelled and <paramref name="cancellationToken" /> has been cancelled.
    /// </exception>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var localHead = await _oplogStore.GetLatestTimestampAsync(cancellationToken);

            // Normalise the peer list: drop blank ids, de-duplicate, and sort so the payload is deterministic.
            var trackedPeers = (await _peerOplogConfirmationStore.GetActiveTrackedPeersAsync(cancellationToken))
                .Where(peerNodeId => !string.IsNullOrWhiteSpace(peerNodeId))
                .Distinct(StringComparer.Ordinal)
                .OrderBy(peerNodeId => peerNodeId, StringComparer.Ordinal)
                .ToList();

            var peersWithNoConfirmation = new List<string>();
            var laggingPeers = new List<string>();
            var criticalLaggingPeers = new List<string>();

            // NOTE(review): the value type assumes PeerOplogConfirmation.LastConfirmedUtc is a DateTimeOffset;
            // confirm against the model declaration. A null value marks a peer with no active confirmation.
            var lastSuccessfulConfirmationUpdateByPeer =
                new Dictionary<string, DateTimeOffset?>(StringComparer.Ordinal);

            var maxLagMs = 0L;

            // A negative configured threshold is clamped to zero.
            long lagThresholdMs = Math.Max(0, _options.Cluster.PeerConfirmationLagThresholdMs);

            // The critical threshold is never allowed to be lower than the regular lag threshold.
            long criticalLagThresholdMs =
                Math.Max(lagThresholdMs, _options.Cluster.PeerConfirmationCriticalLagThresholdMs);

            foreach (string peerNodeId in trackedPeers)
            {
                var confirmations =
                    (await _peerOplogConfirmationStore.GetConfirmationsForPeerAsync(peerNodeId, cancellationToken))
                    .Where(confirmation => confirmation.IsActive)
                    .ToList();

                if (confirmations.Count == 0)
                {
                    peersWithNoConfirmation.Add(peerNodeId);
                    lastSuccessfulConfirmationUpdateByPeer[peerNodeId] = null;
                    continue;
                }

                // Report worst-case peer lag across source streams: only the smallest confirmed wall
                // time matters for the lag value, so a single Min pass replaces a full sort.
                var oldestConfirmedWall = confirmations.Min(confirmation => confirmation.ConfirmedWall);

                // Clamp at zero so a confirmation ahead of the local head is not reported as negative lag.
                long lagMs = Math.Max(0, localHead.PhysicalTime - oldestConfirmedWall);
                maxLagMs = Math.Max(maxLagMs, lagMs);

                lastSuccessfulConfirmationUpdateByPeer[peerNodeId] =
                    confirmations.Max(confirmation => confirmation.LastConfirmedUtc);

                if (lagMs > lagThresholdMs)
                {
                    laggingPeers.Add(peerNodeId);
                }

                if (lagMs > criticalLagThresholdMs)
                {
                    criticalLaggingPeers.Add(peerNodeId);
                }
            }

            var payload = new Dictionary<string, object>
            {
                ["trackedPeerCount"] = trackedPeers.Count,
                ["peersWithNoConfirmation"] = peersWithNoConfirmation,
                ["maxLagMs"] = maxLagMs,
                ["laggingPeers"] = laggingPeers,
                ["lastSuccessfulConfirmationUpdateByPeer"] = lastSuccessfulConfirmationUpdateByPeer
            };

            if (criticalLaggingPeers.Count > 0)
            {
                return HealthCheckResult.Unhealthy(
                    $"CBDDC is unhealthy. Critical lag detected for {criticalLaggingPeers.Count} tracked peer(s).",
                    data: payload);
            }

            if (peersWithNoConfirmation.Count > 0 || laggingPeers.Count > 0)
            {
                return HealthCheckResult.Degraded(
                    $"CBDDC is degraded. Lagging peers: {laggingPeers.Count}, unconfirmed peers: {peersWithNoConfirmation.Count}.",
                    data: payload);
            }

            return HealthCheckResult.Healthy(
                $"CBDDC is healthy. Latest timestamp: {localHead.PhysicalTime}.",
                payload);
        }
        // Let caller-requested cancellation propagate instead of misreporting it as a persistence
        // outage; every other failure (including a cancellation this token did not request) is
        // reported as Unhealthy.
        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            return HealthCheckResult.Unhealthy(
                "CBDDC persistence layer is unavailable",
                ex);
        }
    }
}