using System.Linq;
using ZB.MOM.WW.OtOpcUa.Host.Domain;
using ZB.MOM.WW.OtOpcUa.Host.Metrics;

namespace ZB.MOM.WW.OtOpcUa.Host.Status
{
    /// <summary>
    ///     Determines health status based on connection state and operation success rates. (DASH-003)
    /// </summary>
    /// <remarks>
    ///     Rules are evaluated in a fixed order and the first rule that matches decides the result:
    ///     transport connectivity, historian integration, per-operation success rates, alarm
    ///     acknowledge failures, and finally Galaxy runtime host state.
    /// </remarks>
    public class HealthCheckService
    {
        /// <summary>Plugin status string that the historian rules treat as "plugin is usable".</summary>
        private const string LoadedPluginStatus = "Loaded";

        /// <summary>
        ///     Consecutive historian query failures required before reporting Degraded. A threshold of 3
        ///     avoids flagging a single transient blip.
        /// </summary>
        private const int HistorianConsecutiveFailureThreshold = 3;

        /// <summary>Sample count a history-read operation must exceed before its success rate is judged.</summary>
        private const int HistoryOperationSampleThreshold = 10;

        /// <summary>Sample count any other operation must exceed before its success rate is judged.</summary>
        private const int DefaultOperationSampleThreshold = 100;

        /// <summary>Success rate below which an operation is reported as Degraded.</summary>
        private const double MinimumSuccessRate = 0.5;

        /// <summary>
        ///     Evaluates bridge health from runtime connectivity, recorded performance metrics, and optional
        ///     historian/alarm/runtime integration state.
        /// </summary>
        /// <param name="connectionState">The current MXAccess connection state.</param>
        /// <param name="metrics">The recorded performance metrics, if available.</param>
        /// <param name="historian">Optional historian integration snapshot; pass null to skip historian health rules.</param>
        /// <param name="alarms">Optional alarm integration snapshot; pass null to skip alarm health rules.</param>
        /// <param name="runtime">Optional Galaxy runtime snapshot; pass null to skip the stopped-host rule.</param>
        /// <returns>A dashboard health snapshot describing the current service condition.</returns>
        public HealthInfo CheckHealth(
            ConnectionState connectionState,
            PerformanceMetrics? metrics,
            HistorianStatusInfo? historian = null,
            AlarmStatusInfo? alarms = null,
            RuntimeStatusInfo? runtime = null)
        {
            // Rule 1: Not connected → Unhealthy
            if (connectionState != ConnectionState.Connected)
            {
                return new HealthInfo
                {
                    Status = "Unhealthy",
                    Message = $"MXAccess not connected (state: {connectionState})",
                    Color = "red"
                };
            }

            // Each evaluator returns null when none of its rules fire. The ?? chain keeps the
            // original rule order and only runs a later evaluator when every earlier one passed.
            return EvaluateHistorian(historian)
                ?? EvaluateOperationSuccessRates(metrics)
                ?? EvaluateAlarms(alarms)
                ?? EvaluateRuntime(runtime)
                // Rule 3: All good
                ?? new HealthInfo
                {
                    Status = "Healthy",
                    Message = "All systems operational",
                    Color = "green"
                };
        }

        /// <summary>
        ///     Determines whether the bridge should currently be treated as healthy.
        /// </summary>
        /// <param name="connectionState">The current MXAccess connection state.</param>
        /// <param name="metrics">The recorded performance metrics, if available.</param>
        /// <returns>
        ///     <see langword="true" /> when the bridge is not unhealthy (a Degraded result still counts as
        ///     healthy here); otherwise, <see langword="false" />.
        /// </returns>
        public bool IsHealthy(ConnectionState connectionState, PerformanceMetrics? metrics)
        {
            var health = CheckHealth(connectionState, metrics);
            return health.Status != "Unhealthy";
        }

        /// <summary>Applies the historian rules (2b, 2b2, 2b3); returns null when none of them fire.</summary>
        private static HealthInfo? EvaluateHistorian(HistorianStatusInfo? historian)
        {
            // Every historian rule requires an enabled historian snapshot.
            if (historian == null || !historian.Enabled)
            {
                return null;
            }

            // Rule 2b: Historian enabled but plugin did not load → Degraded
            if (historian.PluginStatus != LoadedPluginStatus)
            {
                var pluginError = historian.PluginError ?? "(no error)";
                return Degraded(
                    $"Historian enabled but plugin status is {historian.PluginStatus}: {pluginError}");
            }

            // Rule 2b2: Historian plugin loaded but queries are failing consecutively → Degraded.
            // Threshold of 3 avoids flagging a single transient blip; anything beyond that means
            // the SDK is in a broken state that the reconnect loop isn't recovering from.
            if (historian.ConsecutiveFailures >= HistorianConsecutiveFailureThreshold)
            {
                var queryError = historian.LastQueryError ?? "(no error)";
                return Degraded(
                    $"Historian plugin has {historian.ConsecutiveFailures} consecutive query failures: "
                    + $"{queryError}");
            }

            // Rule 2b3: Historian cluster has nodes in cooldown → Degraded (partial cluster).
            // Only surfaces when the operator actually configured a multi-node cluster.
            if (historian.NodeCount > 1 && historian.HealthyNodeCount < historian.NodeCount)
            {
                return Degraded(
                    $"Historian cluster has {historian.HealthyNodeCount} of {historian.NodeCount} "
                    + "nodes healthy — one or more nodes are in failure cooldown");
            }

            return null;
        }

        /// <summary>
        ///     Applies Rule 2 / 2c: reports the first recorded operation whose success rate is too low;
        ///     returns null when metrics are absent or every operation passes.
        /// </summary>
        private static HealthInfo? EvaluateOperationSuccessRates(PerformanceMetrics? metrics)
        {
            if (metrics == null)
            {
                return null;
            }

            foreach (var entry in metrics.GetStatistics())
            {
                var isHistoryOp = entry.Key.StartsWith("HistoryRead", System.StringComparison.OrdinalIgnoreCase);

                // History reads are rare; drop the sample threshold so a stuck historian surfaces quickly.
                var sampleThreshold = isHistoryOp
                    ? HistoryOperationSampleThreshold
                    : DefaultOperationSampleThreshold;

                if (entry.Value.TotalCount > sampleThreshold && entry.Value.SuccessRate < MinimumSuccessRate)
                {
                    return Degraded(
                        $"{entry.Key} success rate is {entry.Value.SuccessRate:P0} ({entry.Value.TotalCount} ops)");
                }
            }

            return null;
        }

        /// <summary>
        ///     Applies Rule 2d: any alarm acknowledge write has failed since startup → Degraded (latched).
        ///     Returns null when the rule does not fire.
        /// </summary>
        private static HealthInfo? EvaluateAlarms(AlarmStatusInfo? alarms)
        {
            if (alarms == null || !alarms.TrackingEnabled || alarms.AckWriteFailures <= 0)
            {
                return null;
            }

            return Degraded($"Alarm acknowledge writes have failed ({alarms.AckWriteFailures} total)");
        }

        /// <summary>
        ///     Applies Rule 2e: any Galaxy runtime host (Platform/AppEngine) is Stopped → Degraded.
        ///     Returns null when the rule does not fire.
        /// </summary>
        /// <remarks>
        ///     Runs after the transport check so that MxAccess-disconnected remains Unhealthy via
        ///     Rule 1 without also firing the runtime rule — avoids a double-message when the
        ///     transport is the root cause of every host going Unknown/Stopped.
        /// </remarks>
        private static HealthInfo? EvaluateRuntime(RuntimeStatusInfo? runtime)
        {
            if (runtime == null || runtime.StoppedCount <= 0)
            {
                return null;
            }

            var stoppedNames = string.Join(
                ", ",
                runtime.Hosts
                    .Where(h => h.State == Domain.GalaxyRuntimeState.Stopped)
                    .Select(h => h.ObjectName));

            return Degraded(
                $"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}");
        }

        /// <summary>Builds the yellow "Degraded" snapshot shared by every degradation rule.</summary>
        private static HealthInfo Degraded(string message)
        {
            return new HealthInfo
            {
                Status = "Degraded",
                Message = message,
                Color = "yellow"
            };
        }
    }
}