using System.Linq;
using ZB.MOM.WW.OtOpcUa.Host.Domain;
using ZB.MOM.WW.OtOpcUa.Host.Metrics;
namespace ZB.MOM.WW.OtOpcUa.Host.Status
{
/// <summary>
/// Maps the bridge's connection state, recorded operation statistics, and optional
/// integration snapshots onto a dashboard health verdict. (DASH-003)
/// </summary>
public class HealthCheckService
{
    /// <summary>
    /// Builds the dashboard health snapshot. A transport that is not connected is reported as
    /// Unhealthy. Otherwise the first rule that fires — checked in the order historian,
    /// operation success rates, alarm acknowledge writes, Galaxy runtime hosts — is reported
    /// as Degraded, and Healthy is reported when no rule fires.
    /// </summary>
    /// <param name="connectionState">The current MXAccess connection state.</param>
    /// <param name="metrics">Recorded performance metrics; pass null to skip the success-rate rules.</param>
    /// <param name="historian">Optional historian integration snapshot; pass null to skip historian health rules.</param>
    /// <param name="alarms">Optional alarm integration snapshot; pass null to skip alarm health rules.</param>
    /// <param name="runtime">Optional Galaxy runtime snapshot; pass null to skip the stopped-host rule.</param>
    /// <returns>A health snapshot carrying a status, a human-readable message, and a display color.</returns>
    public HealthInfo CheckHealth(
        ConnectionState connectionState,
        PerformanceMetrics? metrics,
        HistorianStatusInfo? historian = null,
        AlarmStatusInfo? alarms = null,
        RuntimeStatusInfo? runtime = null)
    {
        // The transport check comes first and short-circuits everything else: when MXAccess is
        // down, none of the Degraded rules (including the runtime-host rule) are evaluated, so
        // a disconnected transport never produces a second, derivative message.
        if (connectionState != ConnectionState.Connected)
        {
            return Report("Unhealthy", $"MXAccess not connected (state: {connectionState})", "red");
        }

        // `??` evaluates lazily, so each later rule only runs when every earlier one passed.
        var degradedReason = FindHistorianProblem(historian)
                             ?? FindFailingOperation(metrics)
                             ?? FindAlarmProblem(alarms)
                             ?? FindRuntimeProblem(runtime);

        return degradedReason == null
            ? Report("Healthy", "All systems operational", "green")
            : Report("Degraded", degradedReason, "yellow");
    }

    /// <summary>
    /// Determines whether the bridge should currently be treated as healthy. Only the
    /// Unhealthy status counts as not healthy; a Degraded result still returns true.
    /// </summary>
    /// <param name="connectionState">The current MXAccess connection state.</param>
    /// <param name="metrics">The recorded performance metrics, if available.</param>
    /// <returns>
    /// <see langword="true"/> when the bridge is not unhealthy; otherwise, <see langword="false"/>.
    /// </returns>
    public bool IsHealthy(ConnectionState connectionState, PerformanceMetrics? metrics)
        => CheckHealth(connectionState, metrics).Status != "Unhealthy";

    /// <summary>Creates a health snapshot from its three display fields.</summary>
    private static HealthInfo Report(string status, string message, string color)
    {
        return new HealthInfo
        {
            Status = status,
            Message = message,
            Color = color
        };
    }

    /// <summary>
    /// Applies the historian rules and returns the message of the first one that fires, or
    /// null when the historian is absent, disabled, or passes every rule.
    /// </summary>
    private static string? FindHistorianProblem(HistorianStatusInfo? historian)
    {
        if (historian == null || !historian.Enabled)
        {
            return null;
        }

        // Enabled, but the plugin never reached the "Loaded" state.
        if (historian.PluginStatus != "Loaded")
        {
            return
                $"Historian enabled but plugin status is {historian.PluginStatus}: {historian.PluginError ?? "(no error)"}";
        }

        // From here on the plugin is loaded. Three consecutive failures is the cut-off so that
        // a single transient blip is not flagged; beyond that the reconnect loop is evidently
        // not recovering the SDK.
        if (historian.ConsecutiveFailures >= 3)
        {
            return
                $"Historian plugin has {historian.ConsecutiveFailures} consecutive query failures: " +
                $"{historian.LastQueryError ?? "(no error)"}";
        }

        // Partial cluster: only meaningful when more than one node is configured.
        if (historian.NodeCount > 1 && historian.HealthyNodeCount < historian.NodeCount)
        {
            return
                $"Historian cluster has {historian.HealthyNodeCount} of {historian.NodeCount} " +
                "nodes healthy — one or more nodes are in failure cooldown";
        }

        return null;
    }

    /// <summary>
    /// Scans the recorded operation statistics and returns a message for the first operation
    /// whose success rate is below 50% once it has more samples than its threshold, or null
    /// when no metrics were supplied or no operation qualifies.
    /// </summary>
    private static string? FindFailingOperation(PerformanceMetrics? metrics)
    {
        if (metrics == null)
        {
            return null;
        }

        foreach (var entry in metrics.GetStatistics())
        {
            var operation = entry.Key;
            var recorded = entry.Value;

            // History reads are rare, so they get a much smaller sample threshold; that lets
            // a stuck historian surface quickly instead of waiting for 100+ samples.
            var minimumSamples =
                operation.StartsWith("HistoryRead", System.StringComparison.OrdinalIgnoreCase) ? 10 : 100;

            if (recorded.TotalCount > minimumSamples && recorded.SuccessRate < 0.5)
            {
                return $"{operation} success rate is {recorded.SuccessRate:P0} ({recorded.TotalCount} ops)";
            }
        }

        return null;
    }

    /// <summary>
    /// Returns a message when alarm tracking is enabled and at least one acknowledge write
    /// failure has been counted; otherwise null. The rule is latched in the sense that it keeps
    /// firing for as long as the supplied failure count stays above zero.
    /// </summary>
    private static string? FindAlarmProblem(AlarmStatusInfo? alarms)
    {
        if (alarms == null || !alarms.TrackingEnabled)
        {
            return null;
        }

        return alarms.AckWriteFailures > 0
            ? $"Alarm acknowledge writes have failed ({alarms.AckWriteFailures} total)"
            : null;
    }

    /// <summary>
    /// Returns a message naming every Galaxy runtime host in the Stopped state when the
    /// snapshot reports at least one stopped host; otherwise null.
    /// </summary>
    private static string? FindRuntimeProblem(RuntimeStatusInfo? runtime)
    {
        if (runtime != null && runtime.StoppedCount > 0)
        {
            var stoppedHosts =
                from host in runtime.Hosts
                where host.State == Domain.GalaxyRuntimeState.Stopped
                select host.ObjectName;
            var stoppedNames = string.Join(", ", stoppedHosts);

            return $"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}";
        }

        return null;
    }
}
}