Instrument the historian plugin with runtime query health counters and read-only cluster failover so operators can detect silent query degradation and keep serving history when a single cluster node goes down
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,181 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Configuration;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Historian;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
{
|
||||
/// <summary>
/// Thread-safe, pure-logic endpoint picker for the Wonderware Historian cluster. Tracks which
/// configured nodes are healthy, places failed nodes in a time-bounded cooldown, and hands
/// out an ordered list of eligible candidates for the data source to try in sequence.
/// </summary>
/// <remarks>
/// Design notes:
/// <list type="bullet">
///   <item>No SDK dependency — fully unit-testable with an injected clock.</item>
///   <item>Per-node state is guarded by a single lock; the node list itself is never
///         mutated after construction.</item>
///   <item>Cooldown is purely passive: a node counts as healthy again the next time it is
///         queried after its cooldown window elapses. There is no background probe.</item>
///   <item>Nodes are returned in configuration order so operators can express a
///         preference (primary first, fallback second).</item>
///   <item>Node names are trimmed, blank entries are dropped, and duplicates are removed
///         case-insensitively (first occurrence wins).</item>
///   <item>When <see cref="HistorianConfiguration.ServerNames"/> is null, empty, or contains
///         only blank entries, the picker is initialized with a single entry from
///         <see cref="HistorianConfiguration.ServerName"/> so legacy deployments continue
///         to work unchanged.</item>
/// </list>
/// </remarks>
internal sealed class HistorianClusterEndpointPicker
{
    private readonly Func<DateTime> _clock;
    private readonly TimeSpan _cooldown;
    private readonly object _lock = new object();
    private readonly List<NodeEntry> _nodes;

    /// <summary>
    /// Creates a picker from <paramref name="config"/> that uses <see cref="DateTime.UtcNow"/>
    /// as its clock.
    /// </summary>
    /// <param name="config">Historian settings supplying the node list and cooldown.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public HistorianClusterEndpointPicker(HistorianConfiguration config)
        : this(config, () => DateTime.UtcNow) { }

    /// <summary>
    /// Creates a picker with an injectable clock so cooldown expiry can be driven from tests.
    /// </summary>
    /// <param name="config">Historian settings supplying the node list and cooldown.</param>
    /// <param name="clock">Returns the current time used for all cooldown comparisons.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="clock"/> or <paramref name="config"/> is null.
    /// </exception>
    internal HistorianClusterEndpointPicker(HistorianConfiguration config, Func<DateTime> clock)
    {
        _clock = clock ?? throw new ArgumentNullException(nameof(clock));
        if (config == null)
            throw new ArgumentNullException(nameof(config));

        // Negative cooldowns are clamped to zero, which disables cooldown entirely.
        _cooldown = TimeSpan.FromSeconds(Math.Max(0, config.FailureCooldownSeconds));

        // Prefer the cluster list; fall back to the legacy single-server setting when the
        // list yields no usable entry (null, empty, or nothing but blank strings).
        _nodes = BuildEntries(config.ServerNames);
        if (_nodes.Count == 0)
            _nodes = BuildEntries(new[] { config.ServerName });
    }

    /// <summary>
    /// Gets the total number of configured cluster nodes. Stable — nodes are never added
    /// or removed after construction.
    /// </summary>
    public int NodeCount
    {
        get
        {
            lock (_lock)
                return _nodes.Count;
        }
    }

    /// <summary>
    /// Gets the count of nodes currently eligible for a connection attempt (i.e., not in cooldown).
    /// </summary>
    public int HealthyNodeCount
    {
        get
        {
            lock (_lock)
            {
                var now = _clock();
                return _nodes.Count(n => IsHealthyAt(n, now));
            }
        }
    }

    /// <summary>
    /// Returns an ordered snapshot of the nodes currently eligible for a connection attempt.
    /// A node whose cooldown deadline has passed is treated as eligible again.
    /// </summary>
    /// <returns>
    /// Node names in configuration order. An empty list means either no nodes are configured
    /// or every node is in active cooldown.
    /// </returns>
    public IReadOnlyList<string> GetHealthyNodes()
    {
        lock (_lock)
        {
            var now = _clock();
            return _nodes
                .Where(n => IsHealthyAt(n, now))
                .Select(n => n.Name)
                .ToList();
        }
    }

    /// <summary>
    /// Places <paramref name="node"/> into cooldown starting at the current clock time.
    /// Increments the node's failure counter and stores the latest error message for
    /// surfacing on the dashboard. Unknown node names are ignored.
    /// </summary>
    /// <param name="node">Node name, matched case-insensitively.</param>
    /// <param name="error">Failure description to retain for diagnostics; may be null.</param>
    public void MarkFailed(string node, string? error)
    {
        lock (_lock)
        {
            var entry = FindEntry(node);
            if (entry == null)
                return;

            var now = _clock();
            entry.FailureCount++;
            entry.LastError = error;
            entry.LastFailureTime = now;

            // A zero cooldown means the node stays eligible; only the diagnostics are updated.
            entry.CooldownUntil = _cooldown.TotalMilliseconds > 0 ? now + _cooldown : (DateTime?)null;
        }
    }

    /// <summary>
    /// Marks <paramref name="node"/> as healthy immediately — clears any active cooldown but
    /// leaves the cumulative failure counter, last error, and last failure time intact for
    /// operator diagnostics. Unknown node names are ignored.
    /// </summary>
    /// <param name="node">Node name, matched case-insensitively.</param>
    public void MarkHealthy(string node)
    {
        lock (_lock)
        {
            var entry = FindEntry(node);
            if (entry == null)
                return;

            entry.CooldownUntil = null;
        }
    }

    /// <summary>
    /// Captures the current per-node state for the health dashboard. Health is computed
    /// against the clock at call time, so a node whose cooldown has already expired is
    /// reported as healthy with a null <c>CooldownUntil</c>.
    /// </summary>
    /// <returns>One state object per configured node, in configuration order.</returns>
    public List<HistorianClusterNodeState> SnapshotNodeStates()
    {
        lock (_lock)
        {
            var now = _clock();
            var states = new List<HistorianClusterNodeState>(_nodes.Count);
            foreach (var n in _nodes)
            {
                // Evaluate once so IsHealthy and CooldownUntil can never disagree.
                var healthy = IsHealthyAt(n, now);
                states.Add(new HistorianClusterNodeState
                {
                    Name = n.Name,
                    IsHealthy = healthy,
                    CooldownUntil = healthy ? null : n.CooldownUntil,
                    FailureCount = n.FailureCount,
                    LastError = n.LastError,
                    LastFailureTime = n.LastFailureTime
                });
            }

            return states;
        }
    }

    /// <summary>
    /// Normalizes raw host names into node entries: drops null/blank values, trims the rest,
    /// and removes case-insensitive duplicates while preserving first-seen order.
    /// </summary>
    private static List<NodeEntry> BuildEntries(IEnumerable<string>? names)
    {
        if (names == null)
            return new List<NodeEntry>();

        return names
            .Where(n => !string.IsNullOrWhiteSpace(n))
            .Select(n => n.Trim())
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .Select(n => new NodeEntry { Name = n })
            .ToList();
    }

    /// <summary>
    /// A node is healthy when it has no cooldown deadline or the deadline is at or before
    /// <paramref name="now"/>.
    /// </summary>
    private static bool IsHealthyAt(NodeEntry entry, DateTime now)
    {
        return entry.CooldownUntil == null || entry.CooldownUntil <= now;
    }

    /// <summary>
    /// Looks up a node by name (case-insensitive). Callers must hold <c>_lock</c>.
    /// </summary>
    private NodeEntry? FindEntry(string node)
    {
        for (var i = 0; i < _nodes.Count; i++)
            if (string.Equals(_nodes[i].Name, node, StringComparison.OrdinalIgnoreCase))
                return _nodes[i];

        return null;
    }

    /// <summary>Mutable per-node bookkeeping; only touched while <c>_lock</c> is held.</summary>
    private sealed class NodeEntry
    {
        public string Name { get; set; } = "";
        public DateTime? CooldownUntil { get; set; }
        public int FailureCount { get; set; }
        public string? LastError { get; set; }
        public DateTime? LastFailureTime { get; set; }
    }
}
|
||||
}
|
||||
@@ -27,20 +27,155 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
private HistorianAccess? _eventConnection;
|
||||
private bool _disposed;
|
||||
|
||||
// Runtime query health state. Guarded by _healthLock — updated on every read
|
||||
// method exit (success or failure) so the dashboard can distinguish "plugin
|
||||
// loaded but never queried" from "plugin loaded and queries are failing".
|
||||
private readonly object _healthLock = new object();
|
||||
private long _totalSuccesses;
|
||||
private long _totalFailures;
|
||||
private int _consecutiveFailures;
|
||||
private DateTime? _lastSuccessTime;
|
||||
private DateTime? _lastFailureTime;
|
||||
private string? _lastError;
|
||||
private string? _activeProcessNode;
|
||||
private string? _activeEventNode;
|
||||
|
||||
// Cluster endpoint picker — shared across process + event paths so a node that
|
||||
// fails on one silo is skipped on the other. Initialized from config at construction.
|
||||
private readonly HistorianClusterEndpointPicker _picker;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a Historian reader that translates OPC UA history requests into Wonderware Historian SDK queries.
|
||||
/// </summary>
|
||||
/// <param name="config">The Historian SDK connection settings used for runtime history lookups.</param>
|
||||
public HistorianDataSource(HistorianConfiguration config)
|
||||
: this(config, new SdkHistorianConnectionFactory()) { }
|
||||
: this(config, new SdkHistorianConnectionFactory(), null) { }
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a Historian reader with a custom connection factory for testing.
|
||||
/// Initializes a Historian reader with a custom connection factory for testing. When
|
||||
/// <paramref name="picker"/> is <see langword="null"/> a new picker is built from
|
||||
/// <paramref name="config"/>, preserving backward compatibility with existing tests.
|
||||
/// </summary>
|
||||
internal HistorianDataSource(HistorianConfiguration config, IHistorianConnectionFactory factory)
|
||||
internal HistorianDataSource(
|
||||
HistorianConfiguration config,
|
||||
IHistorianConnectionFactory factory,
|
||||
HistorianClusterEndpointPicker? picker = null)
|
||||
{
|
||||
_config = config;
|
||||
_factory = factory;
|
||||
_picker = picker ?? new HistorianClusterEndpointPicker(config);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tries each node the picker currently reports as healthy, in order, until one connects.
/// Every attempt uses a copy of the configuration whose server name is the node under test.
/// A successful node is marked healthy and returned with its connection; a node whose
/// attempt throws is marked failed and the next candidate is tried.
/// </summary>
/// <param name="type">The kind of SDK connection to open.</param>
/// <returns>The opened connection together with the name of the node that accepted it.</returns>
/// <exception cref="InvalidOperationException">
/// No nodes are configured, all nodes are in cooldown, or every healthy candidate failed
/// (the last failure is attached as the inner exception).
/// </exception>
private (HistorianAccess Connection, string Node) ConnectToAnyHealthyNode(HistorianConnectionType type)
{
    var eligible = _picker.GetHealthyNodes();
    if (eligible.Count == 0)
    {
        // Distinguish "nothing configured" from "everything is cooling down" for the operator.
        var configured = _picker.NodeCount;
        string reason;
        if (configured == 0)
            reason = "No historian nodes configured";
        else
            reason = $"All {configured} historian nodes are in cooldown — no healthy endpoints to connect to";

        throw new InvalidOperationException(reason);
    }

    Exception? mostRecentFailure = null;
    for (var index = 0; index < eligible.Count; index++)
    {
        var node = eligible[index];
        var perNodeConfig = CloneConfigWithServerName(node);
        try
        {
            var connection = _factory.CreateAndConnect(perNodeConfig, type);
            _picker.MarkHealthy(node);
            return (connection, node);
        }
        catch (Exception ex)
        {
            // Put the node into cooldown and fall through to the next candidate.
            _picker.MarkFailed(node, ex.Message);
            mostRecentFailure = ex;
            Log.Warning(ex,
                "Historian node {Node} failed during connect attempt; trying next candidate", node);
        }
    }

    var detail = mostRecentFailure?.Message ?? "(no detail)";
    throw new InvalidOperationException(
        $"All {eligible.Count} healthy historian candidate(s) failed during connect: {detail}",
        mostRecentFailure);
}
|
||||
|
||||
/// <summary>
/// Builds a new configuration object that copies the settings listed below from
/// <c>_config</c> but targets <paramref name="serverName"/>. The <c>ServerNames</c> list is
/// shared by reference with the source configuration, not copied.
/// </summary>
/// <param name="serverName">The node name to place in <c>ServerName</c> on the copy.</param>
private HistorianConfiguration CloneConfigWithServerName(string serverName)
{
    var copy = new HistorianConfiguration();

    // Connection target: the only value that differs from the source configuration.
    copy.Enabled = _config.Enabled;
    copy.ServerName = serverName;
    copy.ServerNames = _config.ServerNames;
    copy.FailureCooldownSeconds = _config.FailureCooldownSeconds;

    // Credentials and transport.
    copy.IntegratedSecurity = _config.IntegratedSecurity;
    copy.UserName = _config.UserName;
    copy.Password = _config.Password;
    copy.Port = _config.Port;

    // Query limits.
    copy.CommandTimeoutSeconds = _config.CommandTimeoutSeconds;
    copy.MaxValuesPerRead = _config.MaxValuesPerRead;

    return copy;
}
|
||||
|
||||
/// <inheritdoc />
public HistorianHealthSnapshot GetHealthSnapshot()
{
    // Per-node state is captured from the picker before _healthLock is taken.
    var clusterStates = _picker.SnapshotNodeStates();
    var eligibleNodes = 0;
    for (var i = 0; i < clusterStates.Count; i++)
    {
        if (clusterStates[i].IsHealthy)
            eligibleNodes++;
    }

    lock (_healthLock)
    {
        var snapshot = new HistorianHealthSnapshot();

        // Query counters and most recent outcome.
        snapshot.TotalQueries = _totalSuccesses + _totalFailures;
        snapshot.TotalSuccesses = _totalSuccesses;
        snapshot.TotalFailures = _totalFailures;
        snapshot.ConsecutiveFailures = _consecutiveFailures;
        snapshot.LastSuccessTime = _lastSuccessTime;
        snapshot.LastFailureTime = _lastFailureTime;
        snapshot.LastError = _lastError;

        // Connection state: a non-null connection field is reported as "open".
        snapshot.ProcessConnectionOpen = Volatile.Read(ref _connection) != null;
        snapshot.EventConnectionOpen = Volatile.Read(ref _eventConnection) != null;
        snapshot.ActiveProcessNode = _activeProcessNode;
        snapshot.ActiveEventNode = _activeEventNode;

        // Cluster view.
        snapshot.NodeCount = clusterStates.Count;
        snapshot.HealthyNodeCount = eligibleNodes;
        snapshot.Nodes = clusterStates;

        return snapshot;
    }
}
|
||||
|
||||
/// <summary>
/// Records one successful read: bumps the success total, stamps the success time with the
/// current UTC time, and clears both the consecutive-failure streak and the last error.
/// </summary>
private void RecordSuccess()
{
    lock (_healthLock)
    {
        // A success ends any failure streak and wipes the stale error text.
        _lastError = null;
        _consecutiveFailures = 0;
        _lastSuccessTime = DateTime.UtcNow;
        _totalSuccesses += 1;
    }
}
|
||||
|
||||
/// <summary>
/// Records one failed read: bumps the failure total and the consecutive-failure streak,
/// stamps the failure time with the current UTC time, and stores the error text.
/// </summary>
/// <param name="error">Description of the failure to expose as the last error.</param>
private void RecordFailure(string error)
{
    lock (_healthLock)
    {
        _lastError = error;
        _lastFailureTime = DateTime.UtcNow;
        _consecutiveFailures += 1;
        _totalFailures += 1;
    }
}
|
||||
|
||||
private void EnsureConnected()
|
||||
@@ -53,8 +188,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
return;
|
||||
|
||||
// Create and wait for connection outside the lock so concurrent history
|
||||
// requests are not serialized behind a slow Historian handshake.
|
||||
var conn = _factory.CreateAndConnect(_config, HistorianConnectionType.Process);
|
||||
// requests are not serialized behind a slow Historian handshake. The cluster
|
||||
// picker iterates configured nodes and returns the first that successfully connects.
|
||||
var (conn, winningNode) = ConnectToAnyHealthyNode(HistorianConnectionType.Process);
|
||||
|
||||
lock (_connectionLock)
|
||||
{
|
||||
@@ -74,7 +210,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
}
|
||||
|
||||
_connection = conn;
|
||||
Log.Information("Historian SDK connection opened to {Server}:{Port}", _config.ServerName, _config.Port);
|
||||
lock (_healthLock)
|
||||
_activeProcessNode = winningNode;
|
||||
Log.Information("Historian SDK connection opened to {Server}:{Port}", winningNode, _config.Port);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -96,7 +234,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
}
|
||||
|
||||
_connection = null;
|
||||
Log.Warning(ex, "Historian SDK connection reset — will reconnect on next request");
|
||||
string? failedNode;
|
||||
lock (_healthLock)
|
||||
{
|
||||
failedNode = _activeProcessNode;
|
||||
_activeProcessNode = null;
|
||||
}
|
||||
|
||||
if (failedNode != null)
|
||||
_picker.MarkFailed(failedNode, ex?.Message ?? "mid-query failure");
|
||||
Log.Warning(ex, "Historian SDK connection reset (node={Node}) — will reconnect on next request",
|
||||
failedNode ?? "(unknown)");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,7 +256,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
if (Volatile.Read(ref _eventConnection) != null)
|
||||
return;
|
||||
|
||||
var conn = _factory.CreateAndConnect(_config, HistorianConnectionType.Event);
|
||||
var (conn, winningNode) = ConnectToAnyHealthyNode(HistorianConnectionType.Event);
|
||||
|
||||
lock (_eventConnectionLock)
|
||||
{
|
||||
@@ -127,8 +275,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
}
|
||||
|
||||
_eventConnection = conn;
|
||||
lock (_healthLock)
|
||||
_activeEventNode = winningNode;
|
||||
Log.Information("Historian SDK event connection opened to {Server}:{Port}",
|
||||
_config.ServerName, _config.Port);
|
||||
winningNode, _config.Port);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,7 +300,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
}
|
||||
|
||||
_eventConnection = null;
|
||||
Log.Warning(ex, "Historian SDK event connection reset — will reconnect on next request");
|
||||
string? failedNode;
|
||||
lock (_healthLock)
|
||||
{
|
||||
failedNode = _activeEventNode;
|
||||
_activeEventNode = null;
|
||||
}
|
||||
|
||||
if (failedNode != null)
|
||||
_picker.MarkFailed(failedNode, ex?.Message ?? "mid-query failure");
|
||||
Log.Warning(ex, "Historian SDK event connection reset (node={Node}) — will reconnect on next request",
|
||||
failedNode ?? "(unknown)");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,6 +343,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
if (!query.StartQuery(args, out var error))
|
||||
{
|
||||
Log.Warning("Historian SDK raw query start failed for {Tag}: {Error}", tagName, error.ErrorCode);
|
||||
RecordFailure($"raw StartQuery: {error.ErrorCode}");
|
||||
HandleConnectionError();
|
||||
return Task.FromResult(results);
|
||||
}
|
||||
@@ -219,6 +380,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
}
|
||||
|
||||
query.EndQuery(out _);
|
||||
RecordSuccess();
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
@@ -231,6 +393,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "HistoryRead raw failed for {Tag}", tagName);
|
||||
RecordFailure($"raw: {ex.Message}");
|
||||
HandleConnectionError(ex);
|
||||
}
|
||||
|
||||
@@ -265,6 +428,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
{
|
||||
Log.Warning("Historian SDK aggregate query start failed for {Tag}: {Error}", tagName,
|
||||
error.ErrorCode);
|
||||
RecordFailure($"aggregate StartQuery: {error.ErrorCode}");
|
||||
HandleConnectionError();
|
||||
return Task.FromResult(results);
|
||||
}
|
||||
@@ -287,6 +451,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
}
|
||||
|
||||
query.EndQuery(out _);
|
||||
RecordSuccess();
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
@@ -299,6 +464,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "HistoryRead aggregate failed for {Tag}", tagName);
|
||||
RecordFailure($"aggregate: {ex.Message}");
|
||||
HandleConnectionError(ex);
|
||||
}
|
||||
|
||||
@@ -380,6 +546,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
|
||||
query.EndQuery(out _);
|
||||
}
|
||||
RecordSuccess();
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
@@ -392,6 +559,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "HistoryRead at-time failed for {Tag}", tagName);
|
||||
RecordFailure($"at-time: {ex.Message}");
|
||||
HandleConnectionError(ex);
|
||||
}
|
||||
|
||||
@@ -430,6 +598,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
if (!query.StartQuery(args, out var error))
|
||||
{
|
||||
Log.Warning("Historian SDK event query start failed: {Error}", error.ErrorCode);
|
||||
RecordFailure($"events StartQuery: {error.ErrorCode}");
|
||||
HandleEventConnectionError();
|
||||
return Task.FromResult(results);
|
||||
}
|
||||
@@ -445,6 +614,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
}
|
||||
|
||||
query.EndQuery(out _);
|
||||
RecordSuccess();
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
@@ -457,6 +627,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "HistoryRead events failed for source {Source}", sourceName ?? "(all)");
|
||||
RecordFailure($"events: {ex.Message}");
|
||||
HandleEventConnectionError(ex);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Data.SqlClient;
|
||||
using System.Linq;
|
||||
using Opc.Ua;
|
||||
@@ -127,20 +128,39 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
Log.Warning("Only the 'None' security profile is configured — transport security is disabled");
|
||||
|
||||
// Historian
|
||||
Log.Information("Historian.Enabled={Enabled}, ServerName={ServerName}, IntegratedSecurity={IntegratedSecurity}, Port={Port}",
|
||||
config.Historian.Enabled, config.Historian.ServerName, config.Historian.IntegratedSecurity,
|
||||
var clusterNodes = config.Historian.ServerNames ?? new List<string>();
|
||||
var effectiveNodes = clusterNodes.Count > 0
|
||||
? string.Join(",", clusterNodes)
|
||||
: config.Historian.ServerName;
|
||||
Log.Information(
|
||||
"Historian.Enabled={Enabled}, Nodes=[{Nodes}], IntegratedSecurity={IntegratedSecurity}, Port={Port}",
|
||||
config.Historian.Enabled, effectiveNodes, config.Historian.IntegratedSecurity,
|
||||
config.Historian.Port);
|
||||
Log.Information("Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}",
|
||||
config.Historian.CommandTimeoutSeconds, config.Historian.MaxValuesPerRead);
|
||||
Log.Information(
|
||||
"Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}, FailureCooldownSeconds={Cooldown}",
|
||||
config.Historian.CommandTimeoutSeconds, config.Historian.MaxValuesPerRead,
|
||||
config.Historian.FailureCooldownSeconds);
|
||||
|
||||
if (config.Historian.Enabled)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(config.Historian.ServerName))
|
||||
if (clusterNodes.Count == 0 && string.IsNullOrWhiteSpace(config.Historian.ServerName))
|
||||
{
|
||||
Log.Error("Historian.ServerName must not be empty when Historian is enabled");
|
||||
Log.Error("Historian.ServerName (or ServerNames) must not be empty when Historian is enabled");
|
||||
valid = false;
|
||||
}
|
||||
|
||||
if (config.Historian.FailureCooldownSeconds < 0)
|
||||
{
|
||||
Log.Error("Historian.FailureCooldownSeconds must be zero or positive");
|
||||
valid = false;
|
||||
}
|
||||
|
||||
if (clusterNodes.Count > 0 && !string.IsNullOrWhiteSpace(config.Historian.ServerName)
|
||||
&& config.Historian.ServerName != "localhost")
|
||||
Log.Warning(
|
||||
"Historian.ServerName='{ServerName}' is ignored because Historian.ServerNames has {Count} entries",
|
||||
config.Historian.ServerName, clusterNodes.Count);
|
||||
|
||||
if (config.Historian.Port < 1 || config.Historian.Port > 65535)
|
||||
{
|
||||
Log.Error("Historian.Port must be between 1 and 65535");
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
{
|
||||
/// <summary>
|
||||
@@ -11,10 +13,25 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
public bool Enabled { get; set; } = false;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the Historian server hostname.
|
||||
/// Gets or sets the single Historian server hostname used when <see cref="ServerNames"/>
|
||||
/// is empty. Preserved for backward compatibility with pre-cluster deployments.
|
||||
/// </summary>
|
||||
public string ServerName { get; set; } = "localhost";
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the ordered list of Historian cluster nodes. When non-empty, this list
|
||||
/// supersedes <see cref="ServerName"/>: the data source attempts each node in order on
|
||||
/// connect, falling through to the next on failure. A failed node is placed in cooldown
|
||||
/// for <see cref="FailureCooldownSeconds"/> before being re-eligible.
|
||||
/// </summary>
|
||||
public List<string> ServerNames { get; set; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the cooldown window, in seconds, that a historian node is skipped after
|
||||
/// a connection failure. A value of zero retries the node on every request. Default 60s.
|
||||
/// </summary>
|
||||
public int FailureCooldownSeconds { get; set; } = 60;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets a value indicating whether Windows Integrated Security is used.
|
||||
/// When false, <see cref="UserName"/> and <see cref="Password"/> are used instead.
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
using System;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.Historian
|
||||
{
|
||||
/// <summary>
/// Snapshot of one historian cluster node at a single moment. The health snapshot carries
/// one of these per configured node so the status dashboard can show per-node health and
/// which nodes are currently being skipped.
/// </summary>
public sealed class HistorianClusterNodeState
{
    /// <summary>
    /// Gets or sets the node hostname as tracked by the cluster endpoint picker (the
    /// configured value with surrounding whitespace trimmed).
    /// </summary>
    public string Name { get; set; } = string.Empty;

    /// <summary>
    /// Gets or sets a value indicating whether new connection attempts may target this node.
    /// <see langword="false"/> means the node is inside its post-failure cooldown window and
    /// is being skipped.
    /// </summary>
    public bool IsHealthy { get; set; }

    /// <summary>
    /// Gets or sets the timestamp at which the current cooldown ends, or
    /// <see langword="null"/> when the node is not in cooldown.
    /// </summary>
    public DateTime? CooldownUntil { get; set; }

    /// <summary>
    /// Gets or sets the cumulative number of failures recorded against this node since
    /// startup. The value is never reduced when the node recovers.
    /// </summary>
    public int FailureCount { get; set; }

    /// <summary>
    /// Gets or sets the message recorded with the most recent failure, or
    /// <see langword="null"/> when none has been recorded.
    /// </summary>
    public string? LastError { get; set; }

    /// <summary>
    /// Gets or sets the timestamp of the most recent failure, or <see langword="null"/>
    /// when the node has never failed.
    /// </summary>
    public DateTime? LastFailureTime { get; set; }
}
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.Historian
|
||||
{
|
||||
/// <summary>
/// Runtime health of the historian plugin at a single moment, exposed to the status
/// dashboard and health check service. Complements the load-time plugin status
/// (<see cref="HistorianPluginLoader.LastOutcome"/>) with what actually happens when
/// queries run, so silent query degradation becomes visible to operators.
/// </summary>
public sealed class HistorianHealthSnapshot
{
    /// <summary>
    /// Gets or sets the number of historian reads recorded since startup, counting every
    /// read path (raw, aggregate, at-time, events). Equals successes plus failures.
    /// </summary>
    public long TotalQueries { get; set; }

    /// <summary>
    /// Gets or sets the number of reads that ran to completion. A read that returns no rows
    /// still counts: the counter means "the SDK call returned", not "it returned data".
    /// </summary>
    public long TotalSuccesses { get; set; }

    /// <summary>
    /// Gets or sets the number of reads recorded as failed — either the query could not be
    /// started or an exception was caught while reading.
    /// </summary>
    public long TotalFailures { get; set; }

    /// <summary>
    /// Gets or sets the length of the current failure streak. It grows with every failed
    /// read and drops back to zero on the next success; the health check service treats it
    /// as a degradation signal.
    /// </summary>
    public int ConsecutiveFailures { get; set; }

    /// <summary>
    /// Gets or sets the UTC time of the most recent successful read, or
    /// <see langword="null"/> if none has succeeded since startup.
    /// </summary>
    public DateTime? LastSuccessTime { get; set; }

    /// <summary>
    /// Gets or sets the UTC time of the most recent failed read, or <see langword="null"/>
    /// if none has failed since startup.
    /// </summary>
    public DateTime? LastFailureTime { get; set; }

    /// <summary>
    /// Gets or sets the description of the most recent failure. Reset to
    /// <see langword="null"/> by the next successful read.
    /// </summary>
    public string? LastError { get; set; }

    /// <summary>
    /// Gets or sets a value indicating whether an SDK connection is currently held for the
    /// process (historical values) path.
    /// </summary>
    public bool ProcessConnectionOpen { get; set; }

    /// <summary>
    /// Gets or sets a value indicating whether an SDK connection is currently held for the
    /// event (alarm history) path.
    /// </summary>
    public bool EventConnectionOpen { get; set; }

    /// <summary>
    /// Gets or sets the cluster node serving the process path, or <see langword="null"/>
    /// when that path has no open connection.
    /// </summary>
    public string? ActiveProcessNode { get; set; }

    /// <summary>
    /// Gets or sets the cluster node serving the event path, or <see langword="null"/>
    /// when that path has no open connection.
    /// </summary>
    public string? ActiveEventNode { get; set; }

    /// <summary>
    /// Gets or sets how many historian cluster nodes are configured. Legacy single-node
    /// deployments report 1.
    /// </summary>
    public int NodeCount { get; set; }

    /// <summary>
    /// Gets or sets how many of the configured nodes are currently healthy, i.e. not in
    /// failure cooldown.
    /// </summary>
    public int HealthyNodeCount { get; set; }

    /// <summary>
    /// Gets or sets the state of each cluster node, listed in configuration order.
    /// </summary>
    public List<HistorianClusterNodeState> Nodes { get; set; } = new List<HistorianClusterNodeState>();
}
|
||||
}
|
||||
@@ -29,5 +29,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Historian
|
||||
Task<List<HistorianEventDto>> ReadEventsAsync(
|
||||
string? sourceName, DateTime startTime, DateTime endTime, int maxEvents,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Returns a runtime snapshot of query success/failure counters and connection state.
|
||||
/// Consumed by the status dashboard and health check service so operators can detect
|
||||
/// silent query degradation that the load-time plugin status can't catch.
|
||||
/// </summary>
|
||||
HistorianHealthSnapshot GetHealthSnapshot();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,6 +190,13 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
public IReadOnlyList<string> AlarmFilterPatterns =>
|
||||
_alarmObjectFilter?.RawPatterns ?? Array.Empty<string>();
|
||||
|
||||
/// <summary>
/// Gets a fresh runtime health snapshot from the historian data source, or
/// <see langword="null"/> when no historian data source is present. Shown on the status
/// dashboard so query failures that the load-time plugin status cannot reveal are visible.
/// </summary>
public HistorianHealthSnapshot? HistorianHealth
{
    get
    {
        // Read the field once so the null check and the call see the same instance.
        var source = _historianDataSource;
        if (source == null)
            return null;

        return source.GetHealthSnapshot();
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of distinct alarm conditions currently tracked (one per alarm attribute).
|
||||
/// </summary>
|
||||
|
||||
@@ -42,6 +42,33 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
Color = "yellow"
|
||||
};
|
||||
|
||||
// Rule 2b2: Historian plugin loaded but queries are failing consecutively → Degraded.
|
||||
// Threshold of 3 avoids flagging a single transient blip; anything beyond that means
|
||||
// the SDK is in a broken state that the reconnect loop isn't recovering from.
|
||||
if (historian != null && historian.Enabled && historian.PluginStatus == "Loaded"
|
||||
&& historian.ConsecutiveFailures >= 3)
|
||||
return new HealthInfo
|
||||
{
|
||||
Status = "Degraded",
|
||||
Message =
|
||||
$"Historian plugin has {historian.ConsecutiveFailures} consecutive query failures: " +
|
||||
$"{historian.LastQueryError ?? "(no error)"}",
|
||||
Color = "yellow"
|
||||
};
|
||||
|
||||
// Rule 2b3: Historian cluster has nodes in cooldown → Degraded (partial cluster).
|
||||
// Only surfaces when the operator actually configured a multi-node cluster.
|
||||
if (historian != null && historian.Enabled && historian.PluginStatus == "Loaded"
|
||||
&& historian.NodeCount > 1 && historian.HealthyNodeCount < historian.NodeCount)
|
||||
return new HealthInfo
|
||||
{
|
||||
Status = "Degraded",
|
||||
Message =
|
||||
$"Historian cluster has {historian.HealthyNodeCount} of {historian.NodeCount} " +
|
||||
"nodes healthy — one or more nodes are in failure cooldown",
|
||||
Color = "yellow"
|
||||
};
|
||||
|
||||
// Rule 2 / 2c: Success rate too low for any recorded operation
|
||||
if (metrics != null)
|
||||
{
|
||||
|
||||
@@ -257,6 +257,81 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
/// Gets or sets the configured historian TCP port.
|
||||
/// </summary>
|
||||
public int Port { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the total number of historian read queries attempted since startup.
|
||||
/// </summary>
/// <remarks>
/// <c>BuildHistorianStatusInfo</c> copies this from the health snapshot's <c>TotalQueries</c>
/// and uses 0 when no snapshot is available.
/// </remarks>
|
||||
public long QueryTotal { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the number of historian queries that completed without an exception.
|
||||
/// </summary>
/// <remarks>
/// <c>BuildHistorianStatusInfo</c> copies this from the health snapshot's <c>TotalSuccesses</c>
/// and uses 0 when no snapshot is available.
/// </remarks>
|
||||
public long QuerySuccesses { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the number of historian queries that raised an exception.
|
||||
/// </summary>
/// <remarks>
/// <c>BuildHistorianStatusInfo</c> copies this from the health snapshot's <c>TotalFailures</c>
/// and uses 0 when no snapshot is available.
/// </remarks>
|
||||
public long QueryFailures { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the number of consecutive failures since the last successful query.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>ConsecutiveFailures</c> (0 when no snapshot is available).
/// Consumers apply different thresholds: the health check reports Degraded at 3 or more while the
/// plugin is enabled and loaded, and the dashboard Historian panel turns yellow for any non-zero
/// value and red at 5 or more.
/// </remarks>
|
||||
public int ConsecutiveFailures { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the UTC timestamp of the last successful query.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>LastSuccessTime</c>; <see langword="null"/> when no
/// snapshot is available. The dashboard prints it in round-trip ("O") format and omits the line
/// entirely when there is no value.
/// </remarks>
|
||||
public DateTime? LastSuccessTime { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the UTC timestamp of the last query failure.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>LastFailureTime</c>; <see langword="null"/> when no
/// snapshot is available. The dashboard prints it in round-trip ("O") format and omits the line
/// entirely when there is no value.
/// </remarks>
|
||||
public DateTime? LastFailureTime { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the exception message from the most recent failure.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>LastError</c>; <see langword="null"/> when no snapshot is
/// available. The value is stored raw: the dashboard HTML-encodes it before rendering, and the
/// health-check message substitutes "(no error)" when it is <see langword="null"/>.
/// </remarks>
|
||||
public string? LastQueryError { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets a value indicating whether the plugin currently holds an open process-path
|
||||
/// SDK connection.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>ProcessConnectionOpen</c>; <see langword="false"/> when no
/// snapshot is available. When <see langword="true"/>, the dashboard shows "open" followed by
/// <see cref="ActiveProcessNode"/> (or "?" if that is <see langword="null"/>); otherwise "closed".
/// </remarks>
|
||||
public bool ProcessConnectionOpen { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets a value indicating whether the plugin currently holds an open event-path
|
||||
/// SDK connection.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>EventConnectionOpen</c>; <see langword="false"/> when no
/// snapshot is available. When <see langword="true"/>, the dashboard shows "open" followed by
/// <see cref="ActiveEventNode"/> (or "?" if that is <see langword="null"/>); otherwise "closed".
/// </remarks>
|
||||
public bool EventConnectionOpen { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the total number of configured historian cluster nodes.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>NodeCount</c>; 0 when no snapshot is available.
/// A value greater than 1 is what consumers treat as a real cluster: only then does the health
/// check apply its partial-cluster rule and the dashboard render the per-node table. A value of
/// exactly 1 makes the dashboard print a single "Node:" line instead.
/// </remarks>
|
||||
public int NodeCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the number of cluster nodes currently eligible for new connections
|
||||
/// (i.e., not in failure cooldown).
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>HealthyNodeCount</c>; 0 when no snapshot is available.
/// Compared against <see cref="NodeCount"/> by consumers: fewer healthy nodes than configured
/// marks the dashboard panel yellow (and, for multi-node clusters, makes the health check report
/// Degraded); zero healthy nodes with at least one configured node marks the panel red.
/// </remarks>
|
||||
public int HealthyNodeCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the node currently serving process (historical value) queries, or null
|
||||
/// when no process connection is open.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>ActiveProcessNode</c>; also <see langword="null"/> when no
/// snapshot is available. The dashboard HTML-encodes it and only displays it while
/// <see cref="ProcessConnectionOpen"/> is <see langword="true"/>.
/// </remarks>
|
||||
public string? ActiveProcessNode { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the node currently serving event (alarm history) queries, or null when
|
||||
/// no event connection is open.
|
||||
/// </summary>
/// <remarks>
/// Copied from the health snapshot's <c>ActiveEventNode</c>; also <see langword="null"/> when no
/// snapshot is available. The dashboard HTML-encodes it and only displays it while
/// <see cref="EventConnectionOpen"/> is <see langword="true"/>.
/// </remarks>
|
||||
public string? ActiveEventNode { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the per-node cluster state in configuration order.
|
||||
/// </summary>
/// <remarks>
/// Defaults to an empty list. <c>BuildHistorianStatusInfo</c> assigns the health snapshot's
/// <c>Nodes</c> list instance directly (no copy is made at that point), or a new empty list when
/// no snapshot is available.
/// NOTE(review): the dashboard reads <c>Nodes[0]</c> whenever <see cref="NodeCount"/> is 1, so this
/// list is expected to hold at least <see cref="NodeCount"/> entries; nothing visible here enforces
/// that, so confirm the snapshot producer keeps the two in sync.
/// </remarks>
|
||||
public List<Historian.HistorianClusterNodeState> Nodes { get; set; } = new();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -125,6 +125,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
private HistorianStatusInfo BuildHistorianStatusInfo()
|
||||
{
|
||||
var outcome = HistorianPluginLoader.LastOutcome;
|
||||
var health = _nodeManager?.HistorianHealth;
|
||||
return new HistorianStatusInfo
|
||||
{
|
||||
Enabled = _historianConfig?.Enabled ?? false,
|
||||
@@ -132,7 +133,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
PluginError = outcome.Error,
|
||||
PluginPath = outcome.PluginPath,
|
||||
ServerName = _historianConfig?.ServerName ?? "",
|
||||
Port = _historianConfig?.Port ?? 0
|
||||
Port = _historianConfig?.Port ?? 0,
|
||||
QueryTotal = health?.TotalQueries ?? 0,
|
||||
QuerySuccesses = health?.TotalSuccesses ?? 0,
|
||||
QueryFailures = health?.TotalFailures ?? 0,
|
||||
ConsecutiveFailures = health?.ConsecutiveFailures ?? 0,
|
||||
LastSuccessTime = health?.LastSuccessTime,
|
||||
LastFailureTime = health?.LastFailureTime,
|
||||
LastQueryError = health?.LastError,
|
||||
ProcessConnectionOpen = health?.ProcessConnectionOpen ?? false,
|
||||
EventConnectionOpen = health?.EventConnectionOpen ?? false,
|
||||
NodeCount = health?.NodeCount ?? 0,
|
||||
HealthyNodeCount = health?.HealthyNodeCount ?? 0,
|
||||
ActiveProcessNode = health?.ActiveProcessNode,
|
||||
ActiveEventNode = health?.ActiveEventNode,
|
||||
Nodes = health?.Nodes ?? new List<Historian.HistorianClusterNodeState>()
|
||||
};
|
||||
}
|
||||
|
||||
@@ -304,13 +319,66 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
sb.AppendLine("</div>");
|
||||
|
||||
// Historian panel
|
||||
var histColor = data.Historian.PluginStatus == "Loaded" ? "green"
|
||||
: !data.Historian.Enabled ? "gray" : "red";
|
||||
var anyClusterNodeFailed =
|
||||
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount < data.Historian.NodeCount;
|
||||
var allClusterNodesFailed =
|
||||
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount == 0;
|
||||
var histColor = !data.Historian.Enabled ? "gray"
|
||||
: data.Historian.PluginStatus != "Loaded" ? "red"
|
||||
: allClusterNodesFailed ? "red"
|
||||
: data.Historian.ConsecutiveFailures >= 5 ? "red"
|
||||
: anyClusterNodeFailed || data.Historian.ConsecutiveFailures > 0 ? "yellow"
|
||||
: "green";
|
||||
sb.AppendLine($"<div class='panel {histColor}'><h2>Historian</h2>");
|
||||
sb.AppendLine(
|
||||
$"<p>Enabled: <b>{data.Historian.Enabled}</b> | Plugin: <b>{data.Historian.PluginStatus}</b> | Server: {WebUtility.HtmlEncode(data.Historian.ServerName)}:{data.Historian.Port}</p>");
|
||||
$"<p>Enabled: <b>{data.Historian.Enabled}</b> | Plugin: <b>{data.Historian.PluginStatus}</b> | Port: {data.Historian.Port}</p>");
|
||||
if (!string.IsNullOrEmpty(data.Historian.PluginError))
|
||||
sb.AppendLine($"<p>Error: {WebUtility.HtmlEncode(data.Historian.PluginError)}</p>");
|
||||
sb.AppendLine($"<p>Plugin Error: {WebUtility.HtmlEncode(data.Historian.PluginError)}</p>");
|
||||
if (data.Historian.PluginStatus == "Loaded")
|
||||
{
|
||||
sb.AppendLine(
|
||||
$"<p>Queries: <b>{data.Historian.QueryTotal:N0}</b> " +
|
||||
$"(Success: {data.Historian.QuerySuccesses:N0}, Failure: {data.Historian.QueryFailures:N0}) " +
|
||||
$"| Consecutive Failures: <b>{data.Historian.ConsecutiveFailures}</b></p>");
|
||||
var procBadge = data.Historian.ProcessConnectionOpen
|
||||
? $"open ({WebUtility.HtmlEncode(data.Historian.ActiveProcessNode ?? "?")})"
|
||||
: "closed";
|
||||
var evtBadge = data.Historian.EventConnectionOpen
|
||||
? $"open ({WebUtility.HtmlEncode(data.Historian.ActiveEventNode ?? "?")})"
|
||||
: "closed";
|
||||
sb.AppendLine(
|
||||
$"<p>Process Conn: <b>{procBadge}</b> | Event Conn: <b>{evtBadge}</b></p>");
|
||||
if (data.Historian.LastSuccessTime.HasValue)
|
||||
sb.AppendLine($"<p>Last Success: {data.Historian.LastSuccessTime:O}</p>");
|
||||
if (data.Historian.LastFailureTime.HasValue)
|
||||
sb.AppendLine($"<p>Last Failure: {data.Historian.LastFailureTime:O}</p>");
|
||||
if (!string.IsNullOrEmpty(data.Historian.LastQueryError))
|
||||
sb.AppendLine(
|
||||
$"<p>Last Error: <code>{WebUtility.HtmlEncode(data.Historian.LastQueryError)}</code></p>");
|
||||
|
||||
// Cluster table: only when a true multi-node cluster is configured.
|
||||
if (data.Historian.NodeCount > 1)
|
||||
{
|
||||
sb.AppendLine(
|
||||
$"<p><b>Cluster:</b> {data.Historian.HealthyNodeCount} of {data.Historian.NodeCount} nodes healthy</p>");
|
||||
sb.AppendLine(
|
||||
"<table><tr><th>Node</th><th>State</th><th>Cooldown Until</th><th>Failures</th><th>Last Error</th></tr>");
|
||||
foreach (var node in data.Historian.Nodes)
|
||||
{
|
||||
var state = node.IsHealthy ? "healthy" : "cooldown";
|
||||
var cooldown = node.CooldownUntil?.ToString("O") ?? "-";
|
||||
var lastErr = WebUtility.HtmlEncode(node.LastError ?? "");
|
||||
sb.AppendLine(
|
||||
$"<tr><td>{WebUtility.HtmlEncode(node.Name)}</td><td>{state}</td>" +
|
||||
$"<td>{cooldown}</td><td>{node.FailureCount}</td><td><code>{lastErr}</code></td></tr>");
|
||||
}
|
||||
sb.AppendLine("</table>");
|
||||
}
|
||||
else if (data.Historian.NodeCount == 1)
|
||||
{
|
||||
sb.AppendLine($"<p>Node: {WebUtility.HtmlEncode(data.Historian.Nodes[0].Name)}</p>");
|
||||
}
|
||||
}
|
||||
sb.AppendLine("</div>");
|
||||
|
||||
// Alarms panel
|
||||
|
||||
@@ -75,6 +75,8 @@
|
||||
"Historian": {
|
||||
"Enabled": false,
|
||||
"ServerName": "localhost",
|
||||
"ServerNames": [],
|
||||
"FailureCooldownSeconds": 60,
|
||||
"IntegratedSecurity": true,
|
||||
"UserName": null,
|
||||
"Password": null,
|
||||
|
||||
Reference in New Issue
Block a user