Instrument the historian plugin with runtime query health counters and read-only cluster failover so operators can detect silent query degradation and keep serving history when a single cluster node goes down

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-13 14:08:32 -04:00
parent 4fe37fd1b7
commit 8f340553d9
20 changed files with 1526 additions and 32 deletions

View File

@@ -0,0 +1,181 @@
using System;
using System.Collections.Generic;
using System.Linq;
using ZB.MOM.WW.LmxOpcUa.Host.Configuration;
using ZB.MOM.WW.LmxOpcUa.Host.Historian;
namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
{
/// <summary>
/// Thread-safe, pure-logic endpoint picker for the Wonderware Historian cluster. Tracks which
/// configured nodes are healthy, places failed nodes in a time-bounded cooldown, and hands
/// out an ordered list of eligible candidates for the data source to try in sequence.
/// </summary>
/// <remarks>
/// Design notes:
/// <list type="bullet">
/// <item>No SDK dependency — fully unit-testable with an injected clock.</item>
/// <item>Per-node state is guarded by a single lock; operations are microsecond-scale
/// so contention is a non-issue.</item>
/// <item>Cooldown is purely passive: a node re-enters the healthy pool the next time
/// it is queried after its cooldown window elapses. There is no background probe.</item>
/// <item>Nodes are returned in configuration order so operators can express a
/// preference (primary first, fallback second).</item>
/// <item>When <see cref="HistorianConfiguration.ServerNames"/> is empty, the picker is
/// initialized with a single entry from <see cref="HistorianConfiguration.ServerName"/>
/// so legacy deployments continue to work unchanged.</item>
/// </list>
/// </remarks>
internal sealed class HistorianClusterEndpointPicker
{
private readonly Func<DateTime> _clock;
private readonly TimeSpan _cooldown;
private readonly object _lock = new object();
private readonly List<NodeEntry> _nodes;
public HistorianClusterEndpointPicker(HistorianConfiguration config)
: this(config, () => DateTime.UtcNow) { }
internal HistorianClusterEndpointPicker(HistorianConfiguration config, Func<DateTime> clock)
{
_clock = clock ?? throw new ArgumentNullException(nameof(clock));
_cooldown = TimeSpan.FromSeconds(Math.Max(0, config.FailureCooldownSeconds));
var names = (config.ServerNames != null && config.ServerNames.Count > 0)
? config.ServerNames
: new List<string> { config.ServerName };
_nodes = names
.Where(n => !string.IsNullOrWhiteSpace(n))
.Select(n => n.Trim())
.Distinct(StringComparer.OrdinalIgnoreCase)
.Select(n => new NodeEntry { Name = n })
.ToList();
}
/// <summary>
/// Gets the total number of configured cluster nodes. Stable — nodes are never added
/// or removed after construction.
/// </summary>
public int NodeCount
{
get
{
lock (_lock)
return _nodes.Count;
}
}
/// <summary>
/// Returns an ordered snapshot of nodes currently eligible for a connection attempt,
/// with any node whose cooldown has elapsed automatically restored to the pool.
/// An empty list means all nodes are in active cooldown.
/// </summary>
public IReadOnlyList<string> GetHealthyNodes()
{
lock (_lock)
{
var now = _clock();
return _nodes
.Where(n => IsHealthyAt(n, now))
.Select(n => n.Name)
.ToList();
}
}
/// <summary>
/// Gets the count of nodes currently eligible for a connection attempt (i.e., not in cooldown).
/// </summary>
public int HealthyNodeCount
{
get
{
lock (_lock)
{
var now = _clock();
return _nodes.Count(n => IsHealthyAt(n, now));
}
}
}
/// <summary>
/// Places <paramref name="node"/> into cooldown starting at the current clock time.
/// Increments the node's failure counter and stores the latest error message for
/// surfacing on the dashboard. Unknown node names are ignored.
/// </summary>
public void MarkFailed(string node, string? error)
{
lock (_lock)
{
var entry = FindEntry(node);
if (entry == null)
return;
var now = _clock();
entry.FailureCount++;
entry.LastError = error;
entry.LastFailureTime = now;
entry.CooldownUntil = _cooldown.TotalMilliseconds > 0 ? now + _cooldown : (DateTime?)null;
}
}
/// <summary>
/// Marks <paramref name="node"/> as healthy immediately — clears any active cooldown but
/// leaves the cumulative failure counter intact for operator diagnostics. Unknown node
/// names are ignored.
/// </summary>
public void MarkHealthy(string node)
{
lock (_lock)
{
var entry = FindEntry(node);
if (entry == null)
return;
entry.CooldownUntil = null;
}
}
/// <summary>
/// Captures the current per-node state for the health dashboard. Freshly computed from
/// <see cref="_clock"/> so recently-expired cooldowns are reported as healthy.
/// </summary>
public List<HistorianClusterNodeState> SnapshotNodeStates()
{
lock (_lock)
{
var now = _clock();
return _nodes.Select(n => new HistorianClusterNodeState
{
Name = n.Name,
IsHealthy = IsHealthyAt(n, now),
CooldownUntil = IsHealthyAt(n, now) ? null : n.CooldownUntil,
FailureCount = n.FailureCount,
LastError = n.LastError,
LastFailureTime = n.LastFailureTime
}).ToList();
}
}
private static bool IsHealthyAt(NodeEntry entry, DateTime now)
{
return entry.CooldownUntil == null || entry.CooldownUntil <= now;
}
private NodeEntry? FindEntry(string node)
{
for (var i = 0; i < _nodes.Count; i++)
if (string.Equals(_nodes[i].Name, node, StringComparison.OrdinalIgnoreCase))
return _nodes[i];
return null;
}
private sealed class NodeEntry
{
public string Name { get; set; } = "";
public DateTime? CooldownUntil { get; set; }
public int FailureCount { get; set; }
public string? LastError { get; set; }
public DateTime? LastFailureTime { get; set; }
}
}
}

View File

@@ -27,20 +27,155 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
private HistorianAccess? _eventConnection;
private bool _disposed;
// Runtime query health state. Guarded by _healthLock — updated on every read
// method exit (success or failure) so the dashboard can distinguish "plugin
// loaded but never queried" from "plugin loaded and queries are failing".
private readonly object _healthLock = new object();
private long _totalSuccesses;
private long _totalFailures;
private int _consecutiveFailures;
private DateTime? _lastSuccessTime;
private DateTime? _lastFailureTime;
private string? _lastError;
private string? _activeProcessNode;
private string? _activeEventNode;
// Cluster endpoint picker — shared across process + event paths so a node that
// fails on one silo is skipped on the other. Initialized from config at construction.
private readonly HistorianClusterEndpointPicker _picker;
/// <summary>
/// Initializes a Historian reader that translates OPC UA history requests into Wonderware Historian SDK queries.
/// </summary>
/// <param name="config">The Historian SDK connection settings used for runtime history lookups.</param>
public HistorianDataSource(HistorianConfiguration config)
: this(config, new SdkHistorianConnectionFactory()) { }
: this(config, new SdkHistorianConnectionFactory(), null) { }
/// <summary>
/// Initializes a Historian reader with a custom connection factory for testing.
/// Initializes a Historian reader with a custom connection factory for testing. When
/// <paramref name="picker"/> is <see langword="null"/> a new picker is built from
/// <paramref name="config"/>, preserving backward compatibility with existing tests.
/// </summary>
internal HistorianDataSource(HistorianConfiguration config, IHistorianConnectionFactory factory)
internal HistorianDataSource(
HistorianConfiguration config,
IHistorianConnectionFactory factory,
HistorianClusterEndpointPicker? picker = null)
{
_config = config;
_factory = factory;
_picker = picker ?? new HistorianClusterEndpointPicker(config);
}
/// <summary>
/// Iterates the picker's healthy node list, cloning the configuration per attempt and
/// handing it to the factory. Marks each tried node as healthy on success or failed on
/// exception. Returns the winning connection + node name; throws when no nodes succeed.
/// </summary>
private (HistorianAccess Connection, string Node) ConnectToAnyHealthyNode(HistorianConnectionType type)
{
var candidates = _picker.GetHealthyNodes();
if (candidates.Count == 0)
{
var total = _picker.NodeCount;
throw new InvalidOperationException(
total == 0
? "No historian nodes configured"
: $"All {total} historian nodes are in cooldown — no healthy endpoints to connect to");
}
Exception? lastException = null;
foreach (var node in candidates)
{
var attemptConfig = CloneConfigWithServerName(node);
try
{
var conn = _factory.CreateAndConnect(attemptConfig, type);
_picker.MarkHealthy(node);
return (conn, node);
}
catch (Exception ex)
{
_picker.MarkFailed(node, ex.Message);
lastException = ex;
Log.Warning(ex,
"Historian node {Node} failed during connect attempt; trying next candidate", node);
}
}
var inner = lastException?.Message ?? "(no detail)";
throw new InvalidOperationException(
$"All {candidates.Count} healthy historian candidate(s) failed during connect: {inner}",
lastException);
}
private HistorianConfiguration CloneConfigWithServerName(string serverName)
{
return new HistorianConfiguration
{
Enabled = _config.Enabled,
ServerName = serverName,
ServerNames = _config.ServerNames,
FailureCooldownSeconds = _config.FailureCooldownSeconds,
IntegratedSecurity = _config.IntegratedSecurity,
UserName = _config.UserName,
Password = _config.Password,
Port = _config.Port,
CommandTimeoutSeconds = _config.CommandTimeoutSeconds,
MaxValuesPerRead = _config.MaxValuesPerRead
};
}
/// <inheritdoc />
public HistorianHealthSnapshot GetHealthSnapshot()
{
var nodeStates = _picker.SnapshotNodeStates();
var healthyCount = 0;
foreach (var n in nodeStates)
if (n.IsHealthy)
healthyCount++;
lock (_healthLock)
{
return new HistorianHealthSnapshot
{
TotalQueries = _totalSuccesses + _totalFailures,
TotalSuccesses = _totalSuccesses,
TotalFailures = _totalFailures,
ConsecutiveFailures = _consecutiveFailures,
LastSuccessTime = _lastSuccessTime,
LastFailureTime = _lastFailureTime,
LastError = _lastError,
ProcessConnectionOpen = Volatile.Read(ref _connection) != null,
EventConnectionOpen = Volatile.Read(ref _eventConnection) != null,
ActiveProcessNode = _activeProcessNode,
ActiveEventNode = _activeEventNode,
NodeCount = nodeStates.Count,
HealthyNodeCount = healthyCount,
Nodes = nodeStates
};
}
}
private void RecordSuccess()
{
lock (_healthLock)
{
_totalSuccesses++;
_lastSuccessTime = DateTime.UtcNow;
_consecutiveFailures = 0;
_lastError = null;
}
}
private void RecordFailure(string error)
{
lock (_healthLock)
{
_totalFailures++;
_lastFailureTime = DateTime.UtcNow;
_consecutiveFailures++;
_lastError = error;
}
}
private void EnsureConnected()
@@ -53,8 +188,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
return;
// Create and wait for connection outside the lock so concurrent history
// requests are not serialized behind a slow Historian handshake.
var conn = _factory.CreateAndConnect(_config, HistorianConnectionType.Process);
// requests are not serialized behind a slow Historian handshake. The cluster
// picker iterates configured nodes and returns the first that successfully connects.
var (conn, winningNode) = ConnectToAnyHealthyNode(HistorianConnectionType.Process);
lock (_connectionLock)
{
@@ -74,7 +210,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_connection = conn;
Log.Information("Historian SDK connection opened to {Server}:{Port}", _config.ServerName, _config.Port);
lock (_healthLock)
_activeProcessNode = winningNode;
Log.Information("Historian SDK connection opened to {Server}:{Port}", winningNode, _config.Port);
}
}
@@ -96,7 +234,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_connection = null;
Log.Warning(ex, "Historian SDK connection reset — will reconnect on next request");
string? failedNode;
lock (_healthLock)
{
failedNode = _activeProcessNode;
_activeProcessNode = null;
}
if (failedNode != null)
_picker.MarkFailed(failedNode, ex?.Message ?? "mid-query failure");
Log.Warning(ex, "Historian SDK connection reset (node={Node}) — will reconnect on next request",
failedNode ?? "(unknown)");
}
}
@@ -108,7 +256,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
if (Volatile.Read(ref _eventConnection) != null)
return;
var conn = _factory.CreateAndConnect(_config, HistorianConnectionType.Event);
var (conn, winningNode) = ConnectToAnyHealthyNode(HistorianConnectionType.Event);
lock (_eventConnectionLock)
{
@@ -127,8 +275,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_eventConnection = conn;
lock (_healthLock)
_activeEventNode = winningNode;
Log.Information("Historian SDK event connection opened to {Server}:{Port}",
_config.ServerName, _config.Port);
winningNode, _config.Port);
}
}
@@ -150,7 +300,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
_eventConnection = null;
Log.Warning(ex, "Historian SDK event connection reset — will reconnect on next request");
string? failedNode;
lock (_healthLock)
{
failedNode = _activeEventNode;
_activeEventNode = null;
}
if (failedNode != null)
_picker.MarkFailed(failedNode, ex?.Message ?? "mid-query failure");
Log.Warning(ex, "Historian SDK event connection reset (node={Node}) — will reconnect on next request",
failedNode ?? "(unknown)");
}
}
@@ -183,6 +343,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
if (!query.StartQuery(args, out var error))
{
Log.Warning("Historian SDK raw query start failed for {Tag}: {Error}", tagName, error.ErrorCode);
RecordFailure($"raw StartQuery: {error.ErrorCode}");
HandleConnectionError();
return Task.FromResult(results);
}
@@ -219,6 +380,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
query.EndQuery(out _);
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -231,6 +393,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead raw failed for {Tag}", tagName);
RecordFailure($"raw: {ex.Message}");
HandleConnectionError(ex);
}
@@ -265,6 +428,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
{
Log.Warning("Historian SDK aggregate query start failed for {Tag}: {Error}", tagName,
error.ErrorCode);
RecordFailure($"aggregate StartQuery: {error.ErrorCode}");
HandleConnectionError();
return Task.FromResult(results);
}
@@ -287,6 +451,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
query.EndQuery(out _);
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -299,6 +464,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead aggregate failed for {Tag}", tagName);
RecordFailure($"aggregate: {ex.Message}");
HandleConnectionError(ex);
}
@@ -380,6 +546,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
query.EndQuery(out _);
}
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -392,6 +559,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead at-time failed for {Tag}", tagName);
RecordFailure($"at-time: {ex.Message}");
HandleConnectionError(ex);
}
@@ -430,6 +598,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
if (!query.StartQuery(args, out var error))
{
Log.Warning("Historian SDK event query start failed: {Error}", error.ErrorCode);
RecordFailure($"events StartQuery: {error.ErrorCode}");
HandleEventConnectionError();
return Task.FromResult(results);
}
@@ -445,6 +614,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
}
query.EndQuery(out _);
RecordSuccess();
}
catch (OperationCanceledException)
{
@@ -457,6 +627,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva
catch (Exception ex)
{
Log.Warning(ex, "HistoryRead events failed for source {Source}", sourceName ?? "(all)");
RecordFailure($"events: {ex.Message}");
HandleEventConnectionError(ex);
}