Add runtime query health counters and read-only cluster failover to the historian plugin, so operators can detect silent query degradation and keep serving history when a single cluster node goes down.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-13 14:08:32 -04:00
parent 4fe37fd1b7
commit 8f340553d9
20 changed files with 1526 additions and 32 deletions

View File

@@ -125,6 +125,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
private HistorianStatusInfo BuildHistorianStatusInfo()
{
var outcome = HistorianPluginLoader.LastOutcome;
var health = _nodeManager?.HistorianHealth;
return new HistorianStatusInfo
{
Enabled = _historianConfig?.Enabled ?? false,
@@ -132,7 +133,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
PluginError = outcome.Error,
PluginPath = outcome.PluginPath,
ServerName = _historianConfig?.ServerName ?? "",
Port = _historianConfig?.Port ?? 0
Port = _historianConfig?.Port ?? 0,
QueryTotal = health?.TotalQueries ?? 0,
QuerySuccesses = health?.TotalSuccesses ?? 0,
QueryFailures = health?.TotalFailures ?? 0,
ConsecutiveFailures = health?.ConsecutiveFailures ?? 0,
LastSuccessTime = health?.LastSuccessTime,
LastFailureTime = health?.LastFailureTime,
LastQueryError = health?.LastError,
ProcessConnectionOpen = health?.ProcessConnectionOpen ?? false,
EventConnectionOpen = health?.EventConnectionOpen ?? false,
NodeCount = health?.NodeCount ?? 0,
HealthyNodeCount = health?.HealthyNodeCount ?? 0,
ActiveProcessNode = health?.ActiveProcessNode,
ActiveEventNode = health?.ActiveEventNode,
Nodes = health?.Nodes ?? new List<Historian.HistorianClusterNodeState>()
};
}
@@ -304,13 +319,66 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
sb.AppendLine("</div>");
// Historian panel
var histColor = data.Historian.PluginStatus == "Loaded" ? "green"
: !data.Historian.Enabled ? "gray" : "red";
var anyClusterNodeFailed =
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount < data.Historian.NodeCount;
var allClusterNodesFailed =
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount == 0;
var histColor = !data.Historian.Enabled ? "gray"
: data.Historian.PluginStatus != "Loaded" ? "red"
: allClusterNodesFailed ? "red"
: data.Historian.ConsecutiveFailures >= 5 ? "red"
: anyClusterNodeFailed || data.Historian.ConsecutiveFailures > 0 ? "yellow"
: "green";
sb.AppendLine($"<div class='panel {histColor}'><h2>Historian</h2>");
sb.AppendLine(
$"<p>Enabled: <b>{data.Historian.Enabled}</b> | Plugin: <b>{data.Historian.PluginStatus}</b> | Server: {WebUtility.HtmlEncode(data.Historian.ServerName)}:{data.Historian.Port}</p>");
$"<p>Enabled: <b>{data.Historian.Enabled}</b> | Plugin: <b>{data.Historian.PluginStatus}</b> | Port: {data.Historian.Port}</p>");
if (!string.IsNullOrEmpty(data.Historian.PluginError))
sb.AppendLine($"<p>Error: {WebUtility.HtmlEncode(data.Historian.PluginError)}</p>");
sb.AppendLine($"<p>Plugin Error: {WebUtility.HtmlEncode(data.Historian.PluginError)}</p>");
if (data.Historian.PluginStatus == "Loaded")
{
sb.AppendLine(
$"<p>Queries: <b>{data.Historian.QueryTotal:N0}</b> " +
$"(Success: {data.Historian.QuerySuccesses:N0}, Failure: {data.Historian.QueryFailures:N0}) " +
$"| Consecutive Failures: <b>{data.Historian.ConsecutiveFailures}</b></p>");
var procBadge = data.Historian.ProcessConnectionOpen
? $"open ({WebUtility.HtmlEncode(data.Historian.ActiveProcessNode ?? "?")})"
: "closed";
var evtBadge = data.Historian.EventConnectionOpen
? $"open ({WebUtility.HtmlEncode(data.Historian.ActiveEventNode ?? "?")})"
: "closed";
sb.AppendLine(
$"<p>Process Conn: <b>{procBadge}</b> | Event Conn: <b>{evtBadge}</b></p>");
if (data.Historian.LastSuccessTime.HasValue)
sb.AppendLine($"<p>Last Success: {data.Historian.LastSuccessTime:O}</p>");
if (data.Historian.LastFailureTime.HasValue)
sb.AppendLine($"<p>Last Failure: {data.Historian.LastFailureTime:O}</p>");
if (!string.IsNullOrEmpty(data.Historian.LastQueryError))
sb.AppendLine(
$"<p>Last Error: <code>{WebUtility.HtmlEncode(data.Historian.LastQueryError)}</code></p>");
// Cluster table: only when a true multi-node cluster is configured.
if (data.Historian.NodeCount > 1)
{
sb.AppendLine(
$"<p><b>Cluster:</b> {data.Historian.HealthyNodeCount} of {data.Historian.NodeCount} nodes healthy</p>");
sb.AppendLine(
"<table><tr><th>Node</th><th>State</th><th>Cooldown Until</th><th>Failures</th><th>Last Error</th></tr>");
foreach (var node in data.Historian.Nodes)
{
var state = node.IsHealthy ? "healthy" : "cooldown";
var cooldown = node.CooldownUntil?.ToString("O") ?? "-";
var lastErr = WebUtility.HtmlEncode(node.LastError ?? "");
sb.AppendLine(
$"<tr><td>{WebUtility.HtmlEncode(node.Name)}</td><td>{state}</td>" +
$"<td>{cooldown}</td><td>{node.FailureCount}</td><td><code>{lastErr}</code></td></tr>");
}
sb.AppendLine("</table>");
}
else if (data.Historian.NodeCount == 1)
{
sb.AppendLine($"<p>Node: {WebUtility.HtmlEncode(data.Historian.Nodes[0].Name)}</p>");
}
}
sb.AppendLine("</div>");
// Alarms panel