Instrument the historian plugin with runtime query health counters and read-only cluster failover so operators can detect silent query degradation and keep serving history when a single cluster node goes down

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-13 14:08:32 -04:00
parent 4fe37fd1b7
commit 8f340553d9
20 changed files with 1526 additions and 32 deletions

View File

@@ -0,0 +1,181 @@
using System;
using System.Collections.Generic;
using System.Linq;
using ZB.MOM.WW.LmxOpcUa.Host.Configuration;
using ZB.MOM.WW.LmxOpcUa.Host.Historian;
namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
{
/// <summary>
/// Thread-safe, pure-logic endpoint picker for the Wonderware Historian cluster. Tracks which
/// configured nodes are healthy, places failed nodes in a time-bounded cooldown, and hands
/// out an ordered list of eligible candidates for the data source to try in sequence.
/// </summary>
/// <remarks>
/// Design notes:
/// <list type="bullet">
/// <item>No SDK dependency — fully unit-testable with an injected clock.</item>
/// <item>Per-node state is guarded by a single lock; every operation is a short scan over
/// the configured node list.</item>
/// <item>Cooldown is purely passive: a node re-enters the healthy pool the next time
/// it is queried after its cooldown window elapses. There is no background probe.</item>
/// <item>Nodes are returned in configuration order so operators can express a
/// preference (primary first, fallback second).</item>
/// <item>When <see cref="HistorianConfiguration.ServerNames"/> is empty, the picker is
/// initialized with a single entry from <see cref="HistorianConfiguration.ServerName"/>
/// so legacy deployments continue to work unchanged.</item>
/// <item>Node names are trimmed, blank entries are dropped and duplicates are removed
/// case-insensitively at construction; lookups apply the same normalization.</item>
/// </list>
/// </remarks>
internal sealed class HistorianClusterEndpointPicker
{
    private readonly Func<DateTime> _clock;
    private readonly TimeSpan _cooldown;
    private readonly object _lock = new object();
    private readonly List<NodeEntry> _nodes;

    /// <summary>
    /// Builds a picker from <paramref name="config"/> using <see cref="DateTime.UtcNow"/> as the clock.
    /// </summary>
    /// <param name="config">Historian settings supplying the node list and cooldown window.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is <see langword="null"/>.</exception>
    public HistorianClusterEndpointPicker(HistorianConfiguration config)
        : this(config, () => DateTime.UtcNow) { }

    /// <summary>
    /// Builds a picker from <paramref name="config"/> with an injected clock so tests can
    /// control cooldown expiry.
    /// </summary>
    /// <param name="config">Historian settings supplying the node list and cooldown window.</param>
    /// <param name="clock">Returns the current time used for all cooldown decisions.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="clock"/> or <paramref name="config"/> is <see langword="null"/>.
    /// </exception>
    internal HistorianClusterEndpointPicker(HistorianConfiguration config, Func<DateTime> clock)
    {
        _clock = clock ?? throw new ArgumentNullException(nameof(clock));
        if (config == null)
            throw new ArgumentNullException(nameof(config));

        // Negative cooldowns are clamped to zero, which disables cooldown entirely.
        _cooldown = TimeSpan.FromSeconds(Math.Max(0, config.FailureCooldownSeconds));

        // A non-empty ServerNames list wins; otherwise fall back to the legacy single ServerName.
        var names = (config.ServerNames != null && config.ServerNames.Count > 0)
            ? config.ServerNames
            : new List<string> { config.ServerName };

        _nodes = names
            .Where(n => !string.IsNullOrWhiteSpace(n))
            .Select(n => n.Trim())
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .Select(n => new NodeEntry { Name = n })
            .ToList();
    }

    /// <summary>
    /// Gets the total number of configured cluster nodes. Stable — nodes are never added
    /// or removed after construction.
    /// </summary>
    public int NodeCount
    {
        get
        {
            lock (_lock)
                return _nodes.Count;
        }
    }

    /// <summary>
    /// Returns an ordered snapshot of nodes currently eligible for a connection attempt.
    /// A node whose cooldown deadline has passed is treated as eligible again.
    /// An empty list means every configured node is in active cooldown (or none are configured).
    /// </summary>
    /// <returns>Eligible node names in configuration order; the list is a private copy.</returns>
    public IReadOnlyList<string> GetHealthyNodes()
    {
        lock (_lock)
        {
            var now = _clock();
            return _nodes
                .Where(n => IsHealthyAt(n, now))
                .Select(n => n.Name)
                .ToList();
        }
    }

    /// <summary>
    /// Gets the count of nodes currently eligible for a connection attempt (i.e., not in cooldown).
    /// </summary>
    public int HealthyNodeCount
    {
        get
        {
            lock (_lock)
            {
                var now = _clock();
                return _nodes.Count(n => IsHealthyAt(n, now));
            }
        }
    }

    /// <summary>
    /// Places <paramref name="node"/> into cooldown starting at the current clock time.
    /// Increments the node's failure counter on every call (including calls made while the
    /// node is already in cooldown, which also restart the cooldown window) and stores the
    /// latest error message for surfacing on the dashboard. Unknown node names are ignored.
    /// </summary>
    /// <param name="node">Node name; matched case-insensitively after trimming.</param>
    /// <param name="error">Failure detail to retain, or <see langword="null"/>.</param>
    public void MarkFailed(string node, string? error)
    {
        lock (_lock)
        {
            var entry = FindEntry(node);
            if (entry == null)
                return;

            var now = _clock();
            entry.FailureCount++;
            entry.LastError = error;
            entry.LastFailureTime = now;
            // A zero cooldown means the node stays eligible; only the diagnostics are updated.
            entry.CooldownUntil = _cooldown > TimeSpan.Zero ? now + _cooldown : (DateTime?)null;
        }
    }

    /// <summary>
    /// Marks <paramref name="node"/> as healthy immediately — clears any active cooldown but
    /// leaves the cumulative failure counter, last error and last failure time intact for
    /// operator diagnostics. Unknown node names are ignored.
    /// </summary>
    /// <param name="node">Node name; matched case-insensitively after trimming.</param>
    public void MarkHealthy(string node)
    {
        lock (_lock)
        {
            var entry = FindEntry(node);
            if (entry == null)
                return;

            entry.CooldownUntil = null;
        }
    }

    /// <summary>
    /// Captures the current per-node state for the health dashboard. Health is evaluated
    /// against the clock at call time, so a cooldown that has already expired is reported
    /// as healthy with a <see langword="null"/> cooldown deadline.
    /// </summary>
    /// <returns>One state object per configured node, in configuration order.</returns>
    public List<HistorianClusterNodeState> SnapshotNodeStates()
    {
        lock (_lock)
        {
            var now = _clock();
            var states = new List<HistorianClusterNodeState>(_nodes.Count);
            foreach (var n in _nodes)
            {
                // Evaluate once so IsHealthy and CooldownUntil can never disagree.
                var healthy = IsHealthyAt(n, now);
                states.Add(new HistorianClusterNodeState
                {
                    Name = n.Name,
                    IsHealthy = healthy,
                    CooldownUntil = healthy ? null : n.CooldownUntil,
                    FailureCount = n.FailureCount,
                    LastError = n.LastError,
                    LastFailureTime = n.LastFailureTime
                });
            }

            return states;
        }
    }

    /// <summary>
    /// A node is healthy when it has no cooldown deadline or the deadline is at or before <paramref name="now"/>.
    /// </summary>
    private static bool IsHealthyAt(NodeEntry entry, DateTime now)
    {
        return entry.CooldownUntil == null || entry.CooldownUntil <= now;
    }

    /// <summary>
    /// Looks up a configured node by name. The argument is trimmed and compared
    /// case-insensitively, mirroring the normalization applied at construction.
    /// Must be called with <c>_lock</c> held.
    /// </summary>
    /// <returns>The matching entry, or <see langword="null"/> for blank or unknown names.</returns>
    private NodeEntry? FindEntry(string node)
    {
        if (string.IsNullOrWhiteSpace(node))
            return null;

        var wanted = node.Trim();
        foreach (var candidate in _nodes)
        {
            if (string.Equals(candidate.Name, wanted, StringComparison.OrdinalIgnoreCase))
                return candidate;
        }

        return null;
    }

    /// <summary>Mutable per-node bookkeeping; only touched while <c>_lock</c> is held.</summary>
    private sealed class NodeEntry
    {
        public string Name { get; set; } = "";
        public DateTime? CooldownUntil { get; set; }
        public int FailureCount { get; set; }
        public string? LastError { get; set; }
        public DateTime? LastFailureTime { get; set; }
    }
}
}

View File

@@ -27,20 +27,155 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
private HistorianAccess? _eventConnection;
private bool _disposed;
// Runtime query health state. Guarded by _healthLock — updated on every read
// method exit (success or failure) so the dashboard can distinguish "plugin
// loaded but never queried" from "plugin loaded and queries are failing".
private readonly object _healthLock = new object();
private long _totalSuccesses;
private long _totalFailures;
private int _consecutiveFailures;
private DateTime? _lastSuccessTime;
private DateTime? _lastFailureTime;
private string? _lastError;
private string? _activeProcessNode;
private string? _activeEventNode;
// Cluster endpoint picker — shared across process + event paths so a node that
// fails on one silo is skipped on the other. Initialized from config at construction.
private readonly HistorianClusterEndpointPicker _picker;
/// <summary>
/// Initializes a Historian reader that translates OPC UA history requests into Wonderware Historian SDK queries.
/// </summary>
/// <param name="config">The Historian SDK connection settings used for runtime history lookups.</param>
public HistorianDataSource(HistorianConfiguration config)
: this(config, new SdkHistorianConnectionFactory()) { }
: this(config, new SdkHistorianConnectionFactory(), null) { }
/// <summary>
/// Initializes a Historian reader with a custom connection factory for testing.
/// Initializes a Historian reader with a custom connection factory for testing. When
/// <paramref name="picker"/> is <see langword="null"/> a new picker is built from
/// <paramref name="config"/>, preserving backward compatibility with existing tests.
/// </summary>
internal HistorianDataSource(HistorianConfiguration config, IHistorianConnectionFactory factory)
internal HistorianDataSource(
HistorianConfiguration config,
IHistorianConnectionFactory factory,
HistorianClusterEndpointPicker? picker = null)
{
_config = config;
_factory = factory;
_picker = picker ?? new HistorianClusterEndpointPicker(config);
}
/// <summary>
/// Walks the nodes the picker currently reports as eligible, in the order the picker
/// returns them, and asks the factory to open a connection of the requested
/// <paramref name="type"/> against each one using a per-attempt copy of the configuration.
/// The first node that connects is marked healthy and returned; every node that throws is
/// marked failed (with the exception message) and logged before the next one is tried.
/// </summary>
/// <param name="type">Which connection kind the factory should open.</param>
/// <returns>The open connection together with the name of the node that accepted it.</returns>
/// <exception cref="InvalidOperationException">
/// No nodes are configured, every node is in cooldown, or every eligible node failed to
/// connect (the last failure is attached as the inner exception).
/// </exception>
private (HistorianAccess Connection, string Node) ConnectToAnyHealthyNode(HistorianConnectionType type)
{
    var eligible = _picker.GetHealthyNodes();
    if (eligible.Count == 0)
    {
        // Distinguish "nothing configured" from "everything is cooling down" for the operator.
        var configured = _picker.NodeCount;
        var reason = configured == 0
            ? "No historian nodes configured"
            : $"All {configured} historian nodes are in cooldown — no healthy endpoints to connect to";
        throw new InvalidOperationException(reason);
    }

    Exception? mostRecentFailure = null;
    for (var index = 0; index < eligible.Count; index++)
    {
        var nodeName = eligible[index];
        var perNodeConfig = CloneConfigWithServerName(nodeName);
        try
        {
            var opened = _factory.CreateAndConnect(perNodeConfig, type);
            _picker.MarkHealthy(nodeName);
            return (opened, nodeName);
        }
        catch (Exception failure)
        {
            mostRecentFailure = failure;
            _picker.MarkFailed(nodeName, failure.Message);
            Log.Warning(failure,
                "Historian node {Node} failed during connect attempt; trying next candidate", nodeName);
        }
    }

    // Every candidate threw: surface the last failure's message and keep it as the inner exception.
    var detail = mostRecentFailure?.Message ?? "(no detail)";
    throw new InvalidOperationException(
        $"All {eligible.Count} healthy historian candidate(s) failed during connect: {detail}",
        mostRecentFailure);
}
/// <summary>
/// Produces a fresh <see cref="HistorianConfiguration"/> that copies the settings listed
/// below from the data source's own configuration but targets <paramref name="serverName"/>.
/// The <c>ServerNames</c> list is shared by reference, not copied.
/// </summary>
/// <param name="serverName">Node name to place in the copy's <c>ServerName</c>.</param>
/// <returns>A new configuration instance for a single connection attempt.</returns>
private HistorianConfiguration CloneConfigWithServerName(string serverName)
{
    var template = _config;
    var perNode = new HistorianConfiguration();

    perNode.Enabled = template.Enabled;
    perNode.ServerName = serverName;
    perNode.ServerNames = template.ServerNames;
    perNode.FailureCooldownSeconds = template.FailureCooldownSeconds;
    perNode.IntegratedSecurity = template.IntegratedSecurity;
    perNode.UserName = template.UserName;
    perNode.Password = template.Password;
    perNode.Port = template.Port;
    perNode.CommandTimeoutSeconds = template.CommandTimeoutSeconds;
    perNode.MaxValuesPerRead = template.MaxValuesPerRead;

    return perNode;
}
/// <inheritdoc />
public HistorianHealthSnapshot GetHealthSnapshot()
{
    // The picker snapshot is taken first, before _healthLock is entered, so this method
    // never holds both locks at once.
    var perNode = _picker.SnapshotNodeStates();

    var eligibleCount = 0;
    for (var index = 0; index < perNode.Count; index++)
    {
        if (perNode[index].IsHealthy)
            eligibleCount++;
    }

    var snapshot = new HistorianHealthSnapshot
    {
        NodeCount = perNode.Count,
        HealthyNodeCount = eligibleCount,
        Nodes = perNode
    };

    // Counters, timestamps and active-node names are copied under _healthLock so they are
    // mutually consistent with each other.
    lock (_healthLock)
    {
        snapshot.TotalQueries = _totalSuccesses + _totalFailures;
        snapshot.TotalSuccesses = _totalSuccesses;
        snapshot.TotalFailures = _totalFailures;
        snapshot.ConsecutiveFailures = _consecutiveFailures;
        snapshot.LastSuccessTime = _lastSuccessTime;
        snapshot.LastFailureTime = _lastFailureTime;
        snapshot.LastError = _lastError;
        snapshot.ProcessConnectionOpen = Volatile.Read(ref _connection) != null;
        snapshot.EventConnectionOpen = Volatile.Read(ref _eventConnection) != null;
        snapshot.ActiveProcessNode = _activeProcessNode;
        snapshot.ActiveEventNode = _activeEventNode;
    }

    return snapshot;
}
/// <summary>
/// Records one successful read under <c>_healthLock</c>: bumps the success total, stamps
/// the success time with the current UTC time, and clears both the consecutive-failure
/// streak and the last error.
/// </summary>
private void RecordSuccess()
{
    lock (_healthLock)
    {
        _lastError = null;
        _consecutiveFailures = 0;
        _lastSuccessTime = DateTime.UtcNow;
        _totalSuccesses += 1;
    }
}
/// <summary>
/// Records one failed read under <c>_healthLock</c>: bumps the failure total and the
/// consecutive-failure streak, stamps the failure time with the current UTC time, and
/// remembers <paramref name="error"/> as the last error.
/// </summary>
/// <param name="error">Short description of what failed, kept for the health snapshot.</param>
private void RecordFailure(string error)
{
    lock (_healthLock)
    {
        _lastError = error;
        _consecutiveFailures += 1;
        _lastFailureTime = DateTime.UtcNow;
        _totalFailures += 1;
    }
}
private void EnsureConnected()
@@ -53,8 +188,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
return;
// Create and wait for connection outside the lock so concurrent history
// requests are not serialized behind a slow Historian handshake.
var conn = _factory.CreateAndConnect(_config, HistorianConnectionType.Process);
// requests are not serialized behind a slow Historian handshake. The cluster
// picker iterates configured nodes and returns the first that successfully connects.
var (conn, winningNode) = ConnectToAnyHealthyNode(HistorianConnectionType.Process);
lock (_connectionLock)
{
@@ -74,7 +210,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_connection = conn;
Log.Information("Historian SDK connection opened to {Server}:{Port}", _config.ServerName, _config.Port);
lock (_healthLock)
_activeProcessNode = winningNode;
Log.Information("Historian SDK connection opened to {Server}:{Port}", winningNode, _config.Port);
}
}
@@ -96,7 +234,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_connection = null;
Log.Warning(ex, "Historian SDK connection reset — will reconnect on next request");
string? failedNode;
lock (_healthLock)
{
failedNode = _activeProcessNode;
_activeProcessNode = null;
}
if (failedNode != null)
_picker.MarkFailed(failedNode, ex?.Message ?? "mid-query failure");
Log.Warning(ex, "Historian SDK connection reset (node={Node}) — will reconnect on next request",
failedNode ?? "(unknown)");
}
}
@@ -108,7 +256,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
if (Volatile.Read(ref _eventConnection) != null)
return;
var conn = _factory.CreateAndConnect(_config, HistorianConnectionType.Event);
var (conn, winningNode) = ConnectToAnyHealthyNode(HistorianConnectionType.Event);
lock (_eventConnectionLock)
{
@@ -127,8 +275,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_eventConnection = conn;
lock (_healthLock)
_activeEventNode = winningNode;
Log.Information("Historian SDK event connection opened to {Server}:{Port}",
_config.ServerName, _config.Port);
winningNode, _config.Port);
}
}
@@ -150,7 +300,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_eventConnection = null;
Log.Warning(ex, "Historian SDK event connection reset — will reconnect on next request");
string? failedNode;
lock (_healthLock)
{
failedNode = _activeEventNode;
_activeEventNode = null;
}
if (failedNode != null)
_picker.MarkFailed(failedNode, ex?.Message ?? "mid-query failure");
Log.Warning(ex, "Historian SDK event connection reset (node={Node}) — will reconnect on next request",
failedNode ?? "(unknown)");
}
}
@@ -183,6 +343,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
if (!query.StartQuery(args, out var error))
{
Log.Warning("Historian SDK raw query start failed for {Tag}: {Error}", tagName, error.ErrorCode);
RecordFailure($"raw StartQuery: {error.ErrorCode}");
HandleConnectionError();
return Task.FromResult(results);
}
@@ -219,6 +380,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
query.EndQuery(out _);
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -231,6 +393,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead raw failed for {Tag}", tagName);
RecordFailure($"raw: {ex.Message}");
HandleConnectionError(ex);
}
@@ -265,6 +428,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
{
Log.Warning("Historian SDK aggregate query start failed for {Tag}: {Error}", tagName,
error.ErrorCode);
RecordFailure($"aggregate StartQuery: {error.ErrorCode}");
HandleConnectionError();
return Task.FromResult(results);
}
@@ -287,6 +451,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
query.EndQuery(out _);
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -299,6 +464,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead aggregate failed for {Tag}", tagName);
RecordFailure($"aggregate: {ex.Message}");
HandleConnectionError(ex);
}
@@ -380,6 +546,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
query.EndQuery(out _);
}
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -392,6 +559,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead at-time failed for {Tag}", tagName);
RecordFailure($"at-time: {ex.Message}");
HandleConnectionError(ex);
}
@@ -430,6 +598,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
if (!query.StartQuery(args, out var error))
{
Log.Warning("Historian SDK event query start failed: {Error}", error.ErrorCode);
RecordFailure($"events StartQuery: {error.ErrorCode}");
HandleEventConnectionError();
return Task.FromResult(results);
}
@@ -445,6 +614,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
query.EndQuery(out _);
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -457,6 +627,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead events failed for source {Source}", sourceName ?? "(all)");
RecordFailure($"events: {ex.Message}");
HandleEventConnectionError(ex);
}

View File

@@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.Data.SqlClient;
using System.Linq;
using Opc.Ua;
@@ -127,20 +128,39 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
Log.Warning("Only the 'None' security profile is configured — transport security is disabled");
// Historian
Log.Information("Historian.Enabled={Enabled}, ServerName={ServerName}, IntegratedSecurity={IntegratedSecurity}, Port={Port}",
config.Historian.Enabled, config.Historian.ServerName, config.Historian.IntegratedSecurity,
var clusterNodes = config.Historian.ServerNames ?? new List<string>();
var effectiveNodes = clusterNodes.Count > 0
? string.Join(",", clusterNodes)
: config.Historian.ServerName;
Log.Information(
"Historian.Enabled={Enabled}, Nodes=[{Nodes}], IntegratedSecurity={IntegratedSecurity}, Port={Port}",
config.Historian.Enabled, effectiveNodes, config.Historian.IntegratedSecurity,
config.Historian.Port);
Log.Information("Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}",
config.Historian.CommandTimeoutSeconds, config.Historian.MaxValuesPerRead);
Log.Information(
"Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}, FailureCooldownSeconds={Cooldown}",
config.Historian.CommandTimeoutSeconds, config.Historian.MaxValuesPerRead,
config.Historian.FailureCooldownSeconds);
if (config.Historian.Enabled)
{
if (string.IsNullOrWhiteSpace(config.Historian.ServerName))
if (clusterNodes.Count == 0 && string.IsNullOrWhiteSpace(config.Historian.ServerName))
{
Log.Error("Historian.ServerName must not be empty when Historian is enabled");
Log.Error("Historian.ServerName (or ServerNames) must not be empty when Historian is enabled");
valid = false;
}
if (config.Historian.FailureCooldownSeconds < 0)
{
Log.Error("Historian.FailureCooldownSeconds must be zero or positive");
valid = false;
}
if (clusterNodes.Count > 0 && !string.IsNullOrWhiteSpace(config.Historian.ServerName)
&& config.Historian.ServerName != "localhost")
Log.Warning(
"Historian.ServerName='{ServerName}' is ignored because Historian.ServerNames has {Count} entries",
config.Historian.ServerName, clusterNodes.Count);
if (config.Historian.Port < 1 || config.Historian.Port > 65535)
{
Log.Error("Historian.Port must be between 1 and 65535");

View File

@@ -1,3 +1,5 @@
using System.Collections.Generic;
namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
{
/// <summary>
@@ -11,10 +13,25 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
public bool Enabled { get; set; } = false;
/// <summary>
/// Gets or sets the Historian server hostname.
/// Gets or sets the single Historian server hostname used when <see cref="ServerNames"/>
/// is empty. Preserved for backward compatibility with pre-cluster deployments.
/// </summary>
public string ServerName { get; set; } = "localhost";
/// <summary>
/// Gets or sets the ordered list of Historian cluster nodes. When non-empty, this list
/// supersedes <see cref="ServerName"/>: the data source attempts each node in order on
/// connect, falling through to the next on failure. A node that fails to connect, or that
/// was serving a connection which is reset after a query error, is placed in cooldown
/// for <see cref="FailureCooldownSeconds"/> before being re-eligible.
/// </summary>
/// <remarks>
/// The endpoint picker trims each entry, drops blank entries and removes duplicates
/// case-insensitively, so the effective node list can be shorter than this list.
/// </remarks>
public List<string> ServerNames { get; set; } = new();
/// <summary>
/// Gets or sets the cooldown window, in seconds, that a historian node is skipped after
/// it is marked failed (a failed connect attempt, or a connection reset following a query
/// error). A value of zero applies no cooldown, so the node is retried on every request.
/// Default 60s.
/// </summary>
/// <remarks>
/// Configuration validation rejects negative values when the historian is enabled; the
/// endpoint picker additionally clamps negative values to zero.
/// </remarks>
public int FailureCooldownSeconds { get; set; } = 60;
/// <summary>
/// Gets or sets a value indicating whether Windows Integrated Security is used.
/// When false, <see cref="UserName"/> and <see cref="Password"/> are used instead.

View File

@@ -0,0 +1,49 @@
using System;
namespace ZB.MOM.WW.LmxOpcUa.Host.Historian
{
/// <summary>
/// Point-in-time state of a single historian cluster node. One entry per configured node is
/// surfaced inside <see cref="HistorianHealthSnapshot"/> so the status dashboard can render
/// per-node health and operators can see which nodes are in cooldown.
/// </summary>
public sealed class HistorianClusterNodeState
{
/// <summary>
/// Gets or sets the node hostname as tracked by the endpoint picker: the trimmed entry from
/// <c>HistorianConfiguration.ServerNames</c>, or from <c>HistorianConfiguration.ServerName</c>
/// when the list is empty.
/// </summary>
public string Name { get; set; } = "";
/// <summary>
/// Gets or sets a value indicating whether the node is currently eligible for new connection
/// attempts. <see langword="false"/> means the node is in its post-failure cooldown window
/// and the picker is skipping it.
/// </summary>
public bool IsHealthy { get; set; }
/// <summary>
/// Gets or sets the timestamp at which the node's cooldown expires, or
/// <see langword="null"/> when the node is not in cooldown. The value comes from the picker's
/// clock, which is UTC by default.
/// </summary>
public DateTime? CooldownUntil { get; set; }
/// <summary>
/// Gets or sets the number of times this node has been marked failed since startup. The
/// picker increments it on every failure report, including reports made while the node is
/// already in cooldown. Does not decrement on recovery.
/// </summary>
public int FailureCount { get; set; }
/// <summary>
/// Gets or sets the message from the most recent failure report. It is not cleared when the
/// node recovers, and is <see langword="null"/> when the node has never failed.
/// </summary>
public string? LastError { get; set; }
/// <summary>
/// Gets or sets the timestamp of the most recent failure (from the picker's clock, UTC by
/// default), or <see langword="null"/> when the node has never failed.
/// </summary>
public DateTime? LastFailureTime { get; set; }
}
}

View File

@@ -0,0 +1,97 @@
using System;
using System.Collections.Generic;
namespace ZB.MOM.WW.LmxOpcUa.Host.Historian
{
/// <summary>
/// Point-in-time runtime health of the historian plugin, surfaced to the status dashboard
/// and health check service. Fills the gap between the load-time plugin status
/// (<see cref="HistorianPluginLoader.LastOutcome"/>) and actual query behavior so operators
/// can detect silent query degradation.
/// </summary>
public sealed class HistorianHealthSnapshot
{
/// <summary>
/// Gets or sets the total number of historian read operations recorded since startup
/// across all read paths (raw, aggregate, at-time, events). The data source reports this
/// as <see cref="TotalSuccesses"/> plus <see cref="TotalFailures"/>.
/// </summary>
public long TotalQueries { get; set; }
/// <summary>
/// Gets or sets the total number of read operations that ran to completion without being
/// recorded as a failure. Includes empty result sets as successes —
/// the counter reflects "the SDK call returned" not "the SDK call returned data".
/// </summary>
public long TotalSuccesses { get; set; }
/// <summary>
/// Gets or sets the total number of read operations recorded as failed: either the SDK
/// refused to start the query or an exception was caught by the read method. Each recorded
/// failure is followed by a reset of the underlying SDK connection via the reconnect path.
/// </summary>
public long TotalFailures { get; set; }
/// <summary>
/// Gets or sets the number of consecutive failures since the last success. Latches until
/// a successful query clears it. The health check service uses this as a degradation signal.
/// </summary>
public int ConsecutiveFailures { get; set; }
/// <summary>
/// Gets or sets the UTC timestamp of the last successful read, or <see langword="null"/>
/// when no query has succeeded since startup.
/// </summary>
public DateTime? LastSuccessTime { get; set; }
/// <summary>
/// Gets or sets the UTC timestamp of the last failure, or <see langword="null"/> when no
/// query has failed since startup.
/// </summary>
public DateTime? LastFailureTime { get; set; }
/// <summary>
/// Gets or sets a short description of the most recent failure: the read path name followed
/// by the exception message or the SDK start-query error code. Cleared on the next
/// successful query.
/// </summary>
public string? LastError { get; set; }
/// <summary>
/// Gets or sets a value indicating whether the plugin currently holds an open SDK
/// connection for the process (historical values) path.
/// </summary>
public bool ProcessConnectionOpen { get; set; }
/// <summary>
/// Gets or sets a value indicating whether the plugin currently holds an open SDK
/// connection for the event (alarm history) path.
/// </summary>
public bool EventConnectionOpen { get; set; }
/// <summary>
/// Gets or sets the node the plugin is currently connected to for the process path,
/// or <see langword="null"/> when no connection is open.
/// </summary>
public string? ActiveProcessNode { get; set; }
/// <summary>
/// Gets or sets the node the plugin is currently connected to for the event path,
/// or <see langword="null"/> when no event connection is open.
/// </summary>
public string? ActiveEventNode { get; set; }
/// <summary>
/// Gets or sets the total number of historian nodes tracked by the endpoint picker. A value
/// of 1 means a single node: either a legacy single-server deployment or a one-entry
/// cluster list.
/// </summary>
public int NodeCount { get; set; }
/// <summary>
/// Gets or sets the number of configured nodes that are currently healthy (not in cooldown).
/// </summary>
public int HealthyNodeCount { get; set; }
/// <summary>
/// Gets or sets the per-node cluster state in configuration order.
/// </summary>
public List<HistorianClusterNodeState> Nodes { get; set; } = new();
}
}

View File

@@ -29,5 +29,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Historian
Task<List<HistorianEventDto>> ReadEventsAsync(
string? sourceName, DateTime startTime, DateTime endTime, int maxEvents,
CancellationToken ct = default);
/// <summary>
/// Returns a runtime snapshot of query success/failure counters and connection state.
/// Consumed by the status dashboard and health check service so operators can detect
/// silent query degradation that the load-time plugin status can't catch.
/// </summary>
/// <returns>
/// A <see cref="HistorianHealthSnapshot"/> describing the counters, connection flags and
/// per-node cluster state at the time of the call.
/// </returns>
HistorianHealthSnapshot GetHealthSnapshot();
}
}

View File

@@ -190,6 +190,13 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
public IReadOnlyList<string> AlarmFilterPatterns =>
_alarmObjectFilter?.RawPatterns ?? Array.Empty<string>();
/// <summary>
/// Gets a freshly computed historian health snapshot, or <see langword="null"/> when this
/// node manager has no historian data source (the historian plugin is not loaded). The
/// status dashboard reads it to expose query failures that the load-time plugin status
/// cannot reveal.
/// </summary>
public HistorianHealthSnapshot? HistorianHealth
{
    get
    {
        var dataSource = _historianDataSource;
        return dataSource == null ? null : dataSource.GetHealthSnapshot();
    }
}
/// <summary>
/// Gets the number of distinct alarm conditions currently tracked (one per alarm attribute).
/// </summary>

View File

@@ -42,6 +42,33 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
Color = "yellow"
};
// Rule 2b2: Historian plugin loaded but queries are failing consecutively → Degraded.
// Threshold of 3 avoids flagging a single transient blip; anything beyond that means
// the SDK is in a broken state that the reconnect loop isn't recovering from.
if (historian != null && historian.Enabled && historian.PluginStatus == "Loaded"
&& historian.ConsecutiveFailures >= 3)
return new HealthInfo
{
Status = "Degraded",
Message =
$"Historian plugin has {historian.ConsecutiveFailures} consecutive query failures: " +
$"{historian.LastQueryError ?? "(no error)"}",
Color = "yellow"
};
// Rule 2b3: Historian cluster has nodes in cooldown → Degraded (partial cluster).
// Only surfaces when the operator actually configured a multi-node cluster.
if (historian != null && historian.Enabled && historian.PluginStatus == "Loaded"
&& historian.NodeCount > 1 && historian.HealthyNodeCount < historian.NodeCount)
return new HealthInfo
{
Status = "Degraded",
Message =
$"Historian cluster has {historian.HealthyNodeCount} of {historian.NodeCount} " +
"nodes healthy — one or more nodes are in failure cooldown",
Color = "yellow"
};
// Rule 2 / 2c: Success rate too low for any recorded operation
if (metrics != null)
{

View File

@@ -257,6 +257,81 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
/// Gets or sets the configured historian TCP port.
/// </summary>
public int Port { get; set; }
/// <summary>
/// Gets or sets the total number of historian read queries recorded since startup
/// (successes plus failures).
/// </summary>
public long QueryTotal { get; set; }
/// <summary>
/// Gets or sets the number of historian queries that ran to completion without being
/// recorded as a failure.
/// </summary>
public long QuerySuccesses { get; set; }
/// <summary>
/// Gets or sets the number of historian queries recorded as failed, either because the SDK
/// refused to start the query or because an exception was caught.
/// </summary>
public long QueryFailures { get; set; }
/// <summary>
/// Gets or sets the number of consecutive failures since the last successful query. The
/// health check reports Degraded once this reaches 3.
/// </summary>
public int ConsecutiveFailures { get; set; }
/// <summary>
/// Gets or sets the UTC timestamp of the last successful query, or null when none has
/// succeeded.
/// </summary>
public DateTime? LastSuccessTime { get; set; }
/// <summary>
/// Gets or sets the UTC timestamp of the last query failure, or null when none has failed.
/// </summary>
public DateTime? LastFailureTime { get; set; }
/// <summary>
/// Gets or sets the description of the most recent query failure (read path plus exception
/// message or SDK error code), or null after a successful query.
/// </summary>
public string? LastQueryError { get; set; }
/// <summary>
/// Gets or sets a value indicating whether the plugin currently holds an open process-path
/// SDK connection.
/// </summary>
public bool ProcessConnectionOpen { get; set; }
/// <summary>
/// Gets or sets a value indicating whether the plugin currently holds an open event-path
/// SDK connection.
/// </summary>
public bool EventConnectionOpen { get; set; }
/// <summary>
/// Gets or sets the total number of configured historian cluster nodes. Zero when no
/// health snapshot is available.
/// </summary>
public int NodeCount { get; set; }
/// <summary>
/// Gets or sets the number of cluster nodes currently eligible for new connections
/// (i.e., not in failure cooldown).
/// </summary>
public int HealthyNodeCount { get; set; }
/// <summary>
/// Gets or sets the node currently serving process (historical value) queries, or null
/// when no process connection is open.
/// </summary>
public string? ActiveProcessNode { get; set; }
/// <summary>
/// Gets or sets the node currently serving event (alarm history) queries, or null when
/// no event connection is open.
/// </summary>
public string? ActiveEventNode { get; set; }
/// <summary>
/// Gets or sets the per-node cluster state in configuration order. Empty when no health
/// snapshot is available.
/// </summary>
public List<Historian.HistorianClusterNodeState> Nodes { get; set; } = new();
}
/// <summary>

View File

@@ -125,6 +125,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
private HistorianStatusInfo BuildHistorianStatusInfo()
{
var outcome = HistorianPluginLoader.LastOutcome;
var health = _nodeManager?.HistorianHealth;
return new HistorianStatusInfo
{
Enabled = _historianConfig?.Enabled ?? false,
@@ -132,7 +133,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
PluginError = outcome.Error,
PluginPath = outcome.PluginPath,
ServerName = _historianConfig?.ServerName ?? "",
Port = _historianConfig?.Port ?? 0
Port = _historianConfig?.Port ?? 0,
QueryTotal = health?.TotalQueries ?? 0,
QuerySuccesses = health?.TotalSuccesses ?? 0,
QueryFailures = health?.TotalFailures ?? 0,
ConsecutiveFailures = health?.ConsecutiveFailures ?? 0,
LastSuccessTime = health?.LastSuccessTime,
LastFailureTime = health?.LastFailureTime,
LastQueryError = health?.LastError,
ProcessConnectionOpen = health?.ProcessConnectionOpen ?? false,
EventConnectionOpen = health?.EventConnectionOpen ?? false,
NodeCount = health?.NodeCount ?? 0,
HealthyNodeCount = health?.HealthyNodeCount ?? 0,
ActiveProcessNode = health?.ActiveProcessNode,
ActiveEventNode = health?.ActiveEventNode,
Nodes = health?.Nodes ?? new List<Historian.HistorianClusterNodeState>()
};
}
@@ -304,13 +319,66 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
sb.AppendLine("</div>");
// Historian panel
var histColor = data.Historian.PluginStatus == "Loaded" ? "green"
: !data.Historian.Enabled ? "gray" : "red";
var anyClusterNodeFailed =
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount < data.Historian.NodeCount;
var allClusterNodesFailed =
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount == 0;
var histColor = !data.Historian.Enabled ? "gray"
: data.Historian.PluginStatus != "Loaded" ? "red"
: allClusterNodesFailed ? "red"
: data.Historian.ConsecutiveFailures >= 5 ? "red"
: anyClusterNodeFailed || data.Historian.ConsecutiveFailures > 0 ? "yellow"
: "green";
sb.AppendLine($"<div class='panel {histColor}'><h2>Historian</h2>");
sb.AppendLine(
$"<p>Enabled: <b>{data.Historian.Enabled}</b> | Plugin: <b>{data.Historian.PluginStatus}</b> | Server: {WebUtility.HtmlEncode(data.Historian.ServerName)}:{data.Historian.Port}</p>");
$"<p>Enabled: <b>{data.Historian.Enabled}</b> | Plugin: <b>{data.Historian.PluginStatus}</b> | Port: {data.Historian.Port}</p>");
if (!string.IsNullOrEmpty(data.Historian.PluginError))
sb.AppendLine($"<p>Error: {WebUtility.HtmlEncode(data.Historian.PluginError)}</p>");
sb.AppendLine($"<p>Plugin Error: {WebUtility.HtmlEncode(data.Historian.PluginError)}</p>");
if (data.Historian.PluginStatus == "Loaded")
{
sb.AppendLine(
$"<p>Queries: <b>{data.Historian.QueryTotal:N0}</b> " +
$"(Success: {data.Historian.QuerySuccesses:N0}, Failure: {data.Historian.QueryFailures:N0}) " +
$"| Consecutive Failures: <b>{data.Historian.ConsecutiveFailures}</b></p>");
var procBadge = data.Historian.ProcessConnectionOpen
? $"open ({WebUtility.HtmlEncode(data.Historian.ActiveProcessNode ?? "?")})"
: "closed";
var evtBadge = data.Historian.EventConnectionOpen
? $"open ({WebUtility.HtmlEncode(data.Historian.ActiveEventNode ?? "?")})"
: "closed";
sb.AppendLine(
$"<p>Process Conn: <b>{procBadge}</b> | Event Conn: <b>{evtBadge}</b></p>");
if (data.Historian.LastSuccessTime.HasValue)
sb.AppendLine($"<p>Last Success: {data.Historian.LastSuccessTime:O}</p>");
if (data.Historian.LastFailureTime.HasValue)
sb.AppendLine($"<p>Last Failure: {data.Historian.LastFailureTime:O}</p>");
if (!string.IsNullOrEmpty(data.Historian.LastQueryError))
sb.AppendLine(
$"<p>Last Error: <code>{WebUtility.HtmlEncode(data.Historian.LastQueryError)}</code></p>");
// Cluster table: only when a true multi-node cluster is configured.
if (data.Historian.NodeCount > 1)
{
sb.AppendLine(
$"<p><b>Cluster:</b> {data.Historian.HealthyNodeCount} of {data.Historian.NodeCount} nodes healthy</p>");
sb.AppendLine(
"<table><tr><th>Node</th><th>State</th><th>Cooldown Until</th><th>Failures</th><th>Last Error</th></tr>");
foreach (var node in data.Historian.Nodes)
{
var state = node.IsHealthy ? "healthy" : "cooldown";
var cooldown = node.CooldownUntil?.ToString("O") ?? "-";
var lastErr = WebUtility.HtmlEncode(node.LastError ?? "");
sb.AppendLine(
$"<tr><td>{WebUtility.HtmlEncode(node.Name)}</td><td>{state}</td>" +
$"<td>{cooldown}</td><td>{node.FailureCount}</td><td><code>{lastErr}</code></td></tr>");
}
sb.AppendLine("</table>");
}
else if (data.Historian.NodeCount == 1)
{
sb.AppendLine($"<p>Node: {WebUtility.HtmlEncode(data.Historian.Nodes[0].Name)}</p>");
}
}
sb.AppendLine("</div>");
// Alarms panel

View File

@@ -75,6 +75,8 @@
"Historian": {
"Enabled": false,
"ServerName": "localhost",
"ServerNames": [],
"FailureCooldownSeconds": 60,
"IntegratedSecurity": true,
"UserName": null,
"Password": null,