Files
lmxopcua/src/ZB.MOM.WW.OtOpcUa.Host/Status/HealthCheckService.cs
Joseph Doherty 3b2defd94f Phase 0 — mechanical rename ZB.MOM.WW.LmxOpcUa.* → ZB.MOM.WW.OtOpcUa.*
Renames all 11 projects (5 src + 6 tests), the .slnx solution file, all source-file namespaces, all axaml namespace references, and all v1 documentation references in CLAUDE.md and docs/*.md (excluding docs/v2/ which is already in OtOpcUa form). Also updates the TopShelf service registration name from "LmxOpcUa" to "OtOpcUa" per Phase 0 Task 0.6.

Preserves runtime identifiers per Phase 0 Out-of-Scope rules to avoid breaking v1/v2 client trust during coexistence: OPC UA `ApplicationUri` defaults (`urn:{GalaxyName}:LmxOpcUa`), server `EndpointPath` (`/LmxOpcUa`), `ServerName` default (feeds cert subject CN), `MxAccessConfiguration.ClientName` default (defensive — stays "LmxOpcUa" for MxAccess audit-trail consistency), client OPC UA identifiers (`ApplicationName = "LmxOpcUaClient"`, `ApplicationUri = "urn:localhost:LmxOpcUaClient"`, cert directory `%LocalAppData%\LmxOpcUaClient\pki\`), and the `LmxOpcUaServer` class name (class rename out of Phase 0 scope per Task 0.5 sed pattern; happens in Phase 1 alongside `LmxNodeManager → GenericDriverNodeManager` Core extraction). 23 LmxOpcUa references retained, all enumerated and justified in `docs/v2/implementation/exit-gate-phase-0.md`.

Build clean: 0 errors, 30 warnings (lower than baseline 167). Tests at strict improvement over baseline: 821 passing / 1 failing vs baseline 820 / 2 (one flaky pre-existing failure passed this run; the other still fails — both pre-existing and unrelated to the rename). `Client.UI.Tests`, `Historian.Aveva.Tests`, `Client.Shared.Tests`, `IntegrationTests` all match baseline exactly. Exit gate compliance results recorded in `docs/v2/implementation/exit-gate-phase-0.md` with all 7 checks PASS or DEFERRED-to-PR-review (#7 service install verification needs Windows service permissions on the reviewer's box).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 13:57:47 -04:00

142 lines
6.9 KiB
C#

using System.Linq;
using ZB.MOM.WW.OtOpcUa.Host.Domain;
using ZB.MOM.WW.OtOpcUa.Host.Metrics;
namespace ZB.MOM.WW.OtOpcUa.Host.Status
{
/// <summary>
///     Determines health status based on connection state and operation success rates. (DASH-003)
/// </summary>
public class HealthCheckService
{
    /// <summary>Status emitted when MXAccess is not connected; the only status <see cref="IsHealthy" /> rejects.</summary>
    private const string UnhealthyStatus = "Unhealthy";

    /// <summary>Status emitted by every non-fatal rule (historian, metrics, alarms, runtime hosts).</summary>
    private const string DegradedStatus = "Degraded";

    /// <summary>Historian plugin status value that means the plugin loaded successfully.</summary>
    private const string HistorianPluginLoaded = "Loaded";

    /// <summary>
    ///     Consecutive historian query failures needed to report Degraded. A threshold of 3 avoids
    ///     flagging a single transient blip; anything beyond that means the SDK is in a broken state
    ///     that the reconnect loop isn't recovering from.
    /// </summary>
    private const int HistorianConsecutiveFailureThreshold = 3;

    /// <summary>
    ///     Sample count an operation whose name starts with "HistoryRead" must exceed before its
    ///     success rate is judged. History reads are rare, so the bar is lower than the default to
    ///     let a stuck historian surface quickly.
    /// </summary>
    private const int HistoryReadSampleThreshold = 10;

    /// <summary>Sample count any other operation must exceed before its success rate is judged.</summary>
    private const int DefaultSampleThreshold = 100;

    /// <summary>Success rate below which a sufficiently sampled operation is reported as Degraded.</summary>
    private const double MinimumSuccessRate = 0.5;

    /// <summary>
    ///     Evaluates bridge health from runtime connectivity, recorded performance metrics, and optional
    ///     historian/alarm/runtime-host integration state. Rules are evaluated in a fixed order
    ///     (connection, historian, operation success rates, alarms, runtime hosts) and the first rule
    ///     that fires determines the result; later rules are not evaluated.
    /// </summary>
    /// <param name="connectionState">The current MXAccess connection state.</param>
    /// <param name="metrics">The recorded performance metrics, if available; pass <c>null</c> to skip success-rate rules.</param>
    /// <param name="historian">Optional historian integration snapshot; pass <c>null</c> to skip historian health rules.</param>
    /// <param name="alarms">Optional alarm integration snapshot; pass <c>null</c> to skip alarm health rules.</param>
    /// <param name="runtime">Optional Galaxy runtime host snapshot; pass <c>null</c> to skip the stopped-host rule.</param>
    /// <returns>A dashboard health snapshot describing the current service condition.</returns>
    public HealthInfo CheckHealth(
        ConnectionState connectionState,
        PerformanceMetrics? metrics,
        HistorianStatusInfo? historian = null,
        AlarmStatusInfo? alarms = null,
        RuntimeStatusInfo? runtime = null)
    {
        // Rule 1: Not connected → Unhealthy. Checked first so that a transport outage is reported
        // once, as the root cause, instead of also surfacing as stopped hosts or failing operations.
        if (connectionState != ConnectionState.Connected)
        {
            return new HealthInfo
            {
                Status = UnhealthyStatus,
                Message = $"MXAccess not connected (state: {connectionState})",
                Color = "red"
            };
        }

        // `??` only evaluates its right-hand side when the left is null, so each rule group runs
        // only if every earlier group found nothing to report.
        return EvaluateHistorian(historian)
               ?? EvaluateOperationSuccessRates(metrics)
               ?? EvaluateAlarms(alarms)
               ?? EvaluateRuntimeHosts(runtime)
               // Rule 3: All good
               ?? new HealthInfo
               {
                   Status = "Healthy",
                   Message = "All systems operational",
                   Color = "green"
               };
    }

    /// <summary>
    ///     Determines whether the bridge should currently be treated as healthy. Only the
    ///     not-connected rule of <see cref="CheckHealth" /> yields the unhealthy status, so a
    ///     Degraded bridge still counts as healthy here.
    /// </summary>
    /// <param name="connectionState">The current MXAccess connection state.</param>
    /// <param name="metrics">The recorded performance metrics, if available.</param>
    /// <returns><see langword="true" /> when the bridge is not unhealthy; otherwise, <see langword="false" />.</returns>
    public bool IsHealthy(ConnectionState connectionState, PerformanceMetrics? metrics)
    {
        var snapshot = CheckHealth(connectionState, metrics);
        return snapshot.Status != UnhealthyStatus;
    }

    /// <summary>Builds a Degraded (yellow) health snapshot carrying the supplied message.</summary>
    private static HealthInfo Degraded(string message)
    {
        return new HealthInfo
        {
            Status = DegradedStatus,
            Message = message,
            Color = "yellow"
        };
    }

    /// <summary>Applies the historian rules (2b, 2b2, 2b3); returns <c>null</c> when none fires.</summary>
    private static HealthInfo? EvaluateHistorian(HistorianStatusInfo? historian)
    {
        // All historian rules share this precondition: a snapshot was supplied and the historian is enabled.
        if (historian == null || !historian.Enabled)
        {
            return null;
        }

        // Rule 2b: Historian enabled but plugin did not load → Degraded
        if (historian.PluginStatus != HistorianPluginLoaded)
        {
            return Degraded(
                $"Historian enabled but plugin status is {historian.PluginStatus}: {historian.PluginError ?? "(no error)"}");
        }

        // From here on the plugin is known to be loaded.

        // Rule 2b2: Historian plugin loaded but queries are failing consecutively → Degraded.
        if (historian.ConsecutiveFailures >= HistorianConsecutiveFailureThreshold)
        {
            return Degraded(
                $"Historian plugin has {historian.ConsecutiveFailures} consecutive query failures: " +
                $"{historian.LastQueryError ?? "(no error)"}");
        }

        // Rule 2b3: Historian cluster has nodes in cooldown → Degraded (partial cluster).
        // Only surfaces when the operator actually configured a multi-node cluster.
        if (historian.NodeCount > 1 && historian.HealthyNodeCount < historian.NodeCount)
        {
            return Degraded(
                $"Historian cluster has {historian.HealthyNodeCount} of {historian.NodeCount} " +
                "nodes healthy — one or more nodes are in failure cooldown");
        }

        return null;
    }

    /// <summary>
    ///     Applies rule 2 / 2c: reports the first recorded operation whose sample count exceeds its
    ///     threshold while its success rate is below the minimum; returns <c>null</c> otherwise.
    /// </summary>
    private static HealthInfo? EvaluateOperationSuccessRates(PerformanceMetrics? metrics)
    {
        if (metrics == null)
        {
            return null;
        }

        foreach (var operation in metrics.GetStatistics())
        {
            var isHistoryRead =
                operation.Key.StartsWith("HistoryRead", System.StringComparison.OrdinalIgnoreCase);
            var requiredSamples = isHistoryRead ? HistoryReadSampleThreshold : DefaultSampleThreshold;

            if (operation.Value.TotalCount > requiredSamples && operation.Value.SuccessRate < MinimumSuccessRate)
            {
                return Degraded(
                    $"{operation.Key} success rate is {operation.Value.SuccessRate:P0} ({operation.Value.TotalCount} ops)");
            }
        }

        return null;
    }

    /// <summary>
    ///     Applies rule 2d: any alarm acknowledge write has failed since startup → Degraded (latched).
    ///     Returns <c>null</c> when no snapshot was supplied, tracking is disabled, or no failures are recorded.
    /// </summary>
    private static HealthInfo? EvaluateAlarms(AlarmStatusInfo? alarms)
    {
        if (alarms != null && alarms.TrackingEnabled && alarms.AckWriteFailures > 0)
        {
            return Degraded($"Alarm acknowledge writes have failed ({alarms.AckWriteFailures} total)");
        }

        return null;
    }

    /// <summary>
    ///     Applies rule 2e: any Galaxy runtime host (Platform/AppEngine) is Stopped → Degraded.
    ///     The message lists the object names of the hosts whose state is Stopped.
    /// </summary>
    private static HealthInfo? EvaluateRuntimeHosts(RuntimeStatusInfo? runtime)
    {
        if (runtime != null && runtime.StoppedCount > 0)
        {
            var stoppedNames = string.Join(", ",
                runtime.Hosts
                    .Where(h => h.State == Domain.GalaxyRuntimeState.Stopped)
                    .Select(h => h.ObjectName));

            return Degraded(
                $"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}");
        }

        return null;
    }
}
}