lmxopcua/src/ZB.MOM.WW.OtOpcUa.Host/Status/HealthCheckService.cs

using System.Linq;
using ZB.MOM.WW.OtOpcUa.Host.Domain;
using ZB.MOM.WW.OtOpcUa.Host.Metrics;

namespace ZB.MOM.WW.OtOpcUa.Host.Status
{
    /// <summary>
    ///     Derives the dashboard health status from the MXAccess connection state, recorded operation
    ///     success rates, and the optional historian, alarm and Galaxy runtime snapshots. (DASH-003)
    /// </summary>
    public class HealthCheckService
    {
        private const string DegradedStatus = "Degraded";
        private const string DegradedColor = "yellow";
        private const string PluginLoaded = "Loaded";

        /// <summary>
        ///     Evaluates the health rules in a fixed order and reports the first one that fires:
        ///     a non-connected MXAccess transport (Unhealthy), then the historian rules, low operation
        ///     success rates, alarm acknowledge failures and stopped Galaxy runtime hosts (all Degraded).
        ///     When no rule fires the result is Healthy.
        /// </summary>
        /// <param name="connectionState">The current MXAccess connection state.</param>
        /// <param name="metrics">The recorded performance metrics; pass <c>null</c> to skip the success-rate rule.</param>
        /// <param name="historian">Optional historian integration snapshot; pass <c>null</c> to skip historian health rules.</param>
        /// <param name="alarms">Optional alarm integration snapshot; pass <c>null</c> to skip alarm health rules.</param>
        /// <param name="runtime">Optional Galaxy runtime host snapshot; pass <c>null</c> to skip the stopped-host rule.</param>
        /// <returns>A health snapshot carrying the status, a human-readable message and a display color.</returns>
        public HealthInfo CheckHealth(
            ConnectionState connectionState,
            PerformanceMetrics? metrics,
            HistorianStatusInfo? historian = null,
            AlarmStatusInfo? alarms = null,
            RuntimeStatusInfo? runtime = null)
        {
            // Rule 1: the transport must be connected; this is the only rule that yields Unhealthy.
            // It runs first so that a disconnected transport is reported on its own rather than
            // alongside the downstream symptoms (for example stopped runtime hosts) it may cause.
            if (connectionState != ConnectionState.Connected)
            {
                return Snapshot("Unhealthy", $"MXAccess not connected (state: {connectionState})", "red");
            }

            // Rules 2b..2e: each helper yields a message when its rule fires and null otherwise.
            // The ?? chain keeps the original precedence and stops at the first rule that fires.
            var degradedReason =
                FindHistorianIssue(historian)
                ?? FindLowSuccessRate(metrics)
                ?? FindAlarmAckIssue(alarms)
                ?? FindStoppedRuntimeHosts(runtime);

            if (degradedReason != null)
            {
                return Snapshot(DegradedStatus, degradedReason, DegradedColor);
            }

            // Rule 3: nothing fired.
            return Snapshot("Healthy", "All systems operational", "green");
        }

        /// <summary>
        ///     Determines whether the bridge should currently be treated as healthy. Only the connection
        ///     state and metrics are evaluated; historian, alarm and runtime snapshots are not supplied.
        /// </summary>
        /// <param name="connectionState">The current MXAccess connection state.</param>
        /// <param name="metrics">The recorded performance metrics, if available.</param>
        /// <returns>
        ///     <see langword="true" /> when the evaluated status is anything other than Unhealthy
        ///     (a Degraded bridge still counts as healthy here); otherwise, <see langword="false" />.
        /// </returns>
        public bool IsHealthy(ConnectionState connectionState, PerformanceMetrics? metrics)
            => CheckHealth(connectionState, metrics).Status != "Unhealthy";

        /// <summary>Builds a health snapshot from its three display fields.</summary>
        private static HealthInfo Snapshot(string status, string message, string color)
        {
            return new HealthInfo
            {
                Status = status,
                Message = message,
                Color = color
            };
        }

        /// <summary>
        ///     Applies the historian rules (2b, 2b2, 2b3) and returns the message of the first one that
        ///     fires, or <c>null</c> when the historian is absent, disabled, or shows no problem.
        /// </summary>
        private static string? FindHistorianIssue(HistorianStatusInfo? historian)
        {
            if (historian == null || !historian.Enabled)
            {
                return null;
            }

            // Rule 2b: historian enabled but the plugin did not load.
            if (historian.PluginStatus != PluginLoaded)
            {
                return
                    $"Historian enabled but plugin status is {historian.PluginStatus}: {historian.PluginError ?? "(no error)"}";
            }

            // Rule 2b2: plugin loaded but queries keep failing. The threshold of 3 avoids flagging a
            // single transient blip; beyond that the reconnect loop is evidently not recovering.
            if (historian.ConsecutiveFailures >= 3)
            {
                return
                    $"Historian plugin has {historian.ConsecutiveFailures} consecutive query failures: {historian.LastQueryError ?? "(no error)"}";
            }

            // Rule 2b3: partial cluster. Only relevant when more than one node is configured.
            if (historian.NodeCount > 1 && historian.HealthyNodeCount < historian.NodeCount)
            {
                return
                    $"Historian cluster has {historian.HealthyNodeCount} of {historian.NodeCount} " +
                    "nodes healthy — one or more nodes are in failure cooldown";
            }

            return null;
        }

        /// <summary>
        ///     Applies rule 2 / 2c: returns a message for the first recorded operation whose sample count
        ///     exceeds its threshold while its success rate is below 50%, or <c>null</c> if there is none.
        /// </summary>
        private static string? FindLowSuccessRate(PerformanceMetrics? metrics)
        {
            if (metrics == null)
            {
                return null;
            }

            foreach (var entry in metrics.GetStatistics())
            {
                // History reads are rare, so they use a smaller sample threshold to let a stuck
                // historian surface quickly.
                var minimumSamples =
                    entry.Key.StartsWith("HistoryRead", System.StringComparison.OrdinalIgnoreCase) ? 10 : 100;

                if (entry.Value.TotalCount > minimumSamples && entry.Value.SuccessRate < 0.5)
                {
                    return $"{entry.Key} success rate is {entry.Value.SuccessRate:P0} ({entry.Value.TotalCount} ops)";
                }
            }

            return null;
        }

        /// <summary>
        ///     Applies rule 2d: returns a message when alarm tracking is enabled and at least one
        ///     acknowledge write failure has been counted; otherwise <c>null</c>.
        /// </summary>
        private static string? FindAlarmAckIssue(AlarmStatusInfo? alarms)
        {
            if (alarms != null && alarms.TrackingEnabled && alarms.AckWriteFailures > 0)
            {
                return $"Alarm acknowledge writes have failed ({alarms.AckWriteFailures} total)";
            }

            return null;
        }

        /// <summary>
        ///     Applies rule 2e: returns a message naming every host whose state is Stopped when the
        ///     snapshot reports a positive stopped count; otherwise <c>null</c>.
        /// </summary>
        private static string? FindStoppedRuntimeHosts(RuntimeStatusInfo? runtime)
        {
            if (runtime != null && runtime.StoppedCount > 0)
            {
                var stoppedHostNames =
                    from host in runtime.Hosts
                    where host.State == Domain.GalaxyRuntimeState.Stopped
                    select host.ObjectName;
                var stoppedNames = string.Join(", ", stoppedHostNames);

                return $"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}";
            }

            return null;
        }
    }
}