Renames all 11 projects (5 src + 6 tests), the .slnx solution file, all source-file namespaces, all axaml namespace references, and all v1 documentation references in CLAUDE.md and docs/*.md (excluding docs/v2/ which is already in OtOpcUa form). Also updates the TopShelf service registration name from "LmxOpcUa" to "OtOpcUa" per Phase 0 Task 0.6.
Preserves runtime identifiers per Phase 0 Out-of-Scope rules to avoid breaking v1/v2 client trust during coexistence: OPC UA `ApplicationUri` defaults (`urn:{GalaxyName}:LmxOpcUa`), server `EndpointPath` (`/LmxOpcUa`), `ServerName` default (feeds cert subject CN), `MxAccessConfiguration.ClientName` default (defensive — stays "LmxOpcUa" for MxAccess audit-trail consistency), client OPC UA identifiers (`ApplicationName = "LmxOpcUaClient"`, `ApplicationUri = "urn:localhost:LmxOpcUaClient"`, cert directory `%LocalAppData%\LmxOpcUaClient\pki\`), and the `LmxOpcUaServer` class name (class rename out of Phase 0 scope per Task 0.5 sed pattern; happens in Phase 1 alongside `LmxNodeManager → GenericDriverNodeManager` Core extraction). 23 LmxOpcUa references retained, all enumerated and justified in `docs/v2/implementation/exit-gate-phase-0.md`.
Build clean: 0 errors, 30 warnings (lower than baseline 167). Tests at strict improvement over baseline: 821 passing / 1 failing vs baseline 820 / 2 (one flaky pre-existing failure passed this run; the other still fails — both pre-existing and unrelated to the rename). `Client.UI.Tests`, `Historian.Aveva.Tests`, `Client.Shared.Tests`, `IntegrationTests` all match baseline exactly. Exit gate compliance results recorded in `docs/v2/implementation/exit-gate-phase-0.md` with all 7 checks PASS or DEFERRED-to-PR-review (#7 service install verification needs Windows service permissions on the reviewer's box).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
142 lines
6.9 KiB
C#
142 lines
6.9 KiB
C#
using System.Linq;
|
|
using ZB.MOM.WW.OtOpcUa.Host.Domain;
|
|
using ZB.MOM.WW.OtOpcUa.Host.Metrics;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Host.Status
|
|
{
|
|
/// <summary>
///     Determines health status based on connection state and operation success rates. (DASH-003)
/// </summary>
/// <remarks>
///     Rules are evaluated in a fixed order and the first rule that matches decides the result:
///     transport connectivity (Unhealthy), then the historian, operation success rate, alarm
///     acknowledge and Galaxy runtime host rules (all Degraded), and finally Healthy.
/// </remarks>
public class HealthCheckService
{
    // Threshold of 3 avoids flagging a single transient blip; anything beyond that means
    // the SDK is in a broken state that the reconnect loop isn't recovering from.
    private const int HistorianConsecutiveFailureThreshold = 3;

    // History reads are rare; a lower sample threshold lets a stuck historian surface quickly.
    private const int HistoryReadSampleThreshold = 10;

    // Sample count an operation must exceed before its success rate is judged.
    private const int DefaultSampleThreshold = 100;

    // Success rates below this value mark the operation as degraded.
    private const double MinimumSuccessRate = 0.5;

    /// <summary>
    ///     Evaluates bridge health from runtime connectivity, recorded performance metrics, and optional
    ///     historian/alarm/runtime integration state.
    /// </summary>
    /// <param name="connectionState">The current MXAccess connection state.</param>
    /// <param name="metrics">The recorded performance metrics, if available.</param>
    /// <param name="historian">Optional historian integration snapshot; pass <c>null</c> to skip historian health rules.</param>
    /// <param name="alarms">Optional alarm integration snapshot; pass <c>null</c> to skip alarm health rules.</param>
    /// <param name="runtime">Optional Galaxy runtime host snapshot; pass <c>null</c> to skip the stopped-host rule.</param>
    /// <returns>A dashboard health snapshot describing the current service condition.</returns>
    public HealthInfo CheckHealth(
        ConnectionState connectionState,
        PerformanceMetrics? metrics,
        HistorianStatusInfo? historian = null,
        AlarmStatusInfo? alarms = null,
        RuntimeStatusInfo? runtime = null)
    {
        // Rule 1: Not connected → Unhealthy. Checked first so that a transport outage is
        // reported on its own instead of also firing the downstream Degraded rules.
        if (connectionState != ConnectionState.Connected)
        {
            return new HealthInfo
            {
                Status = "Unhealthy",
                Message = $"MXAccess not connected (state: {connectionState})",
                Color = "red"
            };
        }

        // Every remaining rule yields Degraded. The order is significant: the first helper that
        // returns a result wins, and ?? short-circuits so later helpers are not evaluated.
        return CheckHistorian(historian)
               ?? CheckOperationSuccessRates(metrics)
               ?? CheckAlarms(alarms)
               ?? CheckRuntimeHosts(runtime)
               // Rule 3: All good
               ?? new HealthInfo
               {
                   Status = "Healthy",
                   Message = "All systems operational",
                   Color = "green"
               };
    }

    /// <summary>
    ///     Determines whether the bridge should currently be treated as healthy.
    /// </summary>
    /// <remarks>
    ///     Only connection state and metrics are evaluated; the optional historian, alarm and
    ///     runtime snapshots are not supplied. A Degraded result still counts as healthy here.
    /// </remarks>
    /// <param name="connectionState">The current MXAccess connection state.</param>
    /// <param name="metrics">The recorded performance metrics, if available.</param>
    /// <returns><see langword="true" /> when the bridge is not unhealthy; otherwise, <see langword="false" />.</returns>
    public bool IsHealthy(ConnectionState connectionState, PerformanceMetrics? metrics)
    {
        var health = CheckHealth(connectionState, metrics);
        return health.Status != "Unhealthy";
    }

    /// <summary>Applies the historian rules (2b, 2b2, 2b3); returns <c>null</c> when none of them fires.</summary>
    private static HealthInfo? CheckHistorian(HistorianStatusInfo? historian)
    {
        // All historian rules only apply when a snapshot was supplied and the historian is enabled.
        if (historian == null || !historian.Enabled)
        {
            return null;
        }

        // Rule 2b: Historian enabled but plugin did not load → Degraded
        if (historian.PluginStatus != "Loaded")
        {
            return Degraded(
                $"Historian enabled but plugin status is {historian.PluginStatus}: {historian.PluginError ?? "(no error)"}");
        }

        // Rule 2b2: Historian plugin loaded but queries are failing consecutively → Degraded.
        if (historian.ConsecutiveFailures >= HistorianConsecutiveFailureThreshold)
        {
            return Degraded(
                $"Historian plugin has {historian.ConsecutiveFailures} consecutive query failures: " +
                $"{historian.LastQueryError ?? "(no error)"}");
        }

        // Rule 2b3: Historian cluster has nodes in cooldown → Degraded (partial cluster).
        // Only surfaces when the operator actually configured a multi-node cluster.
        if (historian.NodeCount > 1 && historian.HealthyNodeCount < historian.NodeCount)
        {
            return Degraded(
                $"Historian cluster has {historian.HealthyNodeCount} of {historian.NodeCount} " +
                "nodes healthy — one or more nodes are in failure cooldown");
        }

        return null;
    }

    /// <summary>Applies rule 2 / 2c: reports the first recorded operation whose success rate is too low.</summary>
    private static HealthInfo? CheckOperationSuccessRates(PerformanceMetrics? metrics)
    {
        if (metrics == null)
        {
            return null;
        }

        foreach (var kvp in metrics.GetStatistics())
        {
            var isHistoryOp = kvp.Key.StartsWith("HistoryRead", System.StringComparison.OrdinalIgnoreCase);
            var sampleThreshold = isHistoryOp ? HistoryReadSampleThreshold : DefaultSampleThreshold;

            // The sample count must exceed the threshold so a handful of early failures is ignored.
            if (kvp.Value.TotalCount > sampleThreshold && kvp.Value.SuccessRate < MinimumSuccessRate)
            {
                return Degraded(
                    $"{kvp.Key} success rate is {kvp.Value.SuccessRate:P0} ({kvp.Value.TotalCount} ops)");
            }
        }

        return null;
    }

    /// <summary>Applies rule 2d: any alarm acknowledge write has failed since startup → Degraded (latched).</summary>
    private static HealthInfo? CheckAlarms(AlarmStatusInfo? alarms)
    {
        if (alarms != null && alarms.TrackingEnabled && alarms.AckWriteFailures > 0)
        {
            return Degraded($"Alarm acknowledge writes have failed ({alarms.AckWriteFailures} total)");
        }

        return null;
    }

    /// <summary>Applies rule 2e: any Galaxy runtime host (Platform/AppEngine) is Stopped → Degraded.</summary>
    private static HealthInfo? CheckRuntimeHosts(RuntimeStatusInfo? runtime)
    {
        // Runs after the transport check so that MxAccess-disconnected remains Unhealthy via
        // Rule 1 without also firing the runtime rule — avoids a double-message when the
        // transport is the root cause of every host going Unknown/Stopped.
        if (runtime != null && runtime.StoppedCount > 0)
        {
            var stoppedNames = string.Join(", ",
                runtime.Hosts.Where(h => h.State == Domain.GalaxyRuntimeState.Stopped).Select(h => h.ObjectName));

            return Degraded(
                $"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}");
        }

        return null;
    }

    /// <summary>Builds the Degraded (yellow) result shared by every non-fatal rule.</summary>
    private static HealthInfo Degraded(string message)
    {
        return new HealthInfo
        {
            Status = "Degraded",
            Message = message,
            Color = "yellow"
        };
    }
}
|
|
}
|