// lmxopcua/src/ZB.MOM.WW.OtOpcUa.Historian.Aveva/HistorianClusterEndpointPicker.cs

using System;
using System.Collections.Generic;
using System.Linq;
using ZB.MOM.WW.OtOpcUa.Host.Configuration;
using ZB.MOM.WW.OtOpcUa.Host.Historian;

namespace ZB.MOM.WW.OtOpcUa.Historian.Aveva
{
    /// <summary>
    ///     Thread-safe, pure-logic endpoint picker for the Wonderware Historian cluster. Tracks which
    ///     configured nodes are healthy, places failed nodes in a time-bounded cooldown, and hands
    ///     out an ordered list of eligible candidates for the data source to try in sequence.
    /// </summary>
    /// <remarks>
    ///     Design notes:
    ///     <list type="bullet">
    ///         <item>No SDK dependency — fully unit-testable with an injected clock.</item>
    ///         <item>Per-node state is guarded by a single lock; every operation is a short scan
    ///         over the configured node list.</item>
    ///         <item>Cooldown is purely passive: a node counts as healthy again the next time it
    ///         is queried after its cooldown window elapses. There is no background probe.</item>
    ///         <item>Nodes are returned in configuration order so operators can express a
    ///         preference (primary first, fallback second).</item>
    ///         <item>Node names are trimmed and de-duplicated case-insensitively; blank entries
    ///         are dropped.</item>
    ///         <item>When <see cref="HistorianConfiguration.ServerNames"/> is null, empty, or
    ///         contains only blank entries, the picker is initialized with a single entry from
    ///         <see cref="HistorianConfiguration.ServerName"/> so legacy deployments continue to
    ///         work unchanged.</item>
    ///     </list>
    /// </remarks>
    internal sealed class HistorianClusterEndpointPicker
    {
        private readonly Func<DateTime> _clock;
        private readonly TimeSpan _cooldown;
        private readonly object _lock = new object();
        private readonly List<NodeEntry> _nodes;

        /// <summary>
        ///     Creates a picker that reads the current time from <see cref="DateTime.UtcNow"/>.
        /// </summary>
        /// <param name="config">Historian configuration supplying node names and the cooldown.</param>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
        public HistorianClusterEndpointPicker(HistorianConfiguration config)
            : this(config, () => DateTime.UtcNow) { }

        /// <summary>
        ///     Creates a picker with an injected clock (used by tests to control cooldown expiry).
        /// </summary>
        /// <param name="config">Historian configuration supplying node names and the cooldown.</param>
        /// <param name="clock">Returns the current time; called under the internal lock.</param>
        /// <exception cref="ArgumentNullException">
        ///     <paramref name="clock"/> or <paramref name="config"/> is null.
        /// </exception>
        internal HistorianClusterEndpointPicker(HistorianConfiguration config, Func<DateTime> clock)
        {
            _clock = clock ?? throw new ArgumentNullException(nameof(clock));

            // Fail with a descriptive exception instead of a NullReferenceException below.
            if (config == null)
            {
                throw new ArgumentNullException(nameof(config));
            }

            // Negative cooldown values are clamped to zero, which disables cooldown entirely.
            _cooldown = TimeSpan.FromSeconds(Math.Max(0, config.FailureCooldownSeconds));

            var nodes = BuildNodes(config.ServerNames);
            if (nodes.Count == 0)
            {
                // ServerNames was null, empty, or held only blank entries: fall back to the
                // legacy single-server setting rather than ending up with no nodes at all.
                nodes = BuildNodes(new[] { config.ServerName });
            }

            _nodes = nodes;
        }

        /// <summary>
        ///     Gets the total number of configured cluster nodes. Stable — nodes are never added
        ///     or removed after construction.
        /// </summary>
        public int NodeCount
        {
            get
            {
                lock (_lock)
                {
                    return _nodes.Count;
                }
            }
        }

        /// <summary>
        ///     Returns an ordered snapshot of nodes currently eligible for a connection attempt.
        ///     A node whose cooldown has elapsed is treated as eligible again; the stored cooldown
        ///     timestamp itself is not modified by this call. An empty list means every configured
        ///     node is in active cooldown (or no nodes are configured).
        /// </summary>
        /// <returns>Eligible node names in configuration order.</returns>
        public IReadOnlyList<string> GetHealthyNodes()
        {
            lock (_lock)
            {
                var now = _clock();
                return _nodes
                    .Where(n => IsHealthyAt(n, now))
                    .Select(n => n.Name)
                    .ToList();
            }
        }

        /// <summary>
        ///     Gets the count of nodes currently eligible for a connection attempt (i.e., not in cooldown).
        /// </summary>
        public int HealthyNodeCount
        {
            get
            {
                lock (_lock)
                {
                    var now = _clock();
                    return _nodes.Count(n => IsHealthyAt(n, now));
                }
            }
        }

        /// <summary>
        ///     Places <paramref name="node"/> into cooldown starting at the current clock time.
        ///     Increments the node's failure counter and stores the latest error message and
        ///     failure time. When the configured cooldown is zero the counters are still updated
        ///     but the node stays eligible. Unknown node names are ignored.
        /// </summary>
        /// <param name="node">Node name; matched case-insensitively, surrounding whitespace ignored.</param>
        /// <param name="error">Latest error text to record for the node; may be null.</param>
        public void MarkFailed(string node, string? error)
        {
            lock (_lock)
            {
                var entry = FindEntry(node);
                if (entry == null)
                {
                    return;
                }

                var now = _clock();
                entry.FailureCount++;
                entry.LastError = error;
                entry.LastFailureTime = now;
                entry.CooldownUntil = _cooldown > TimeSpan.Zero ? now + _cooldown : (DateTime?)null;
            }
        }

        /// <summary>
        ///     Marks <paramref name="node"/> as healthy immediately — clears any active cooldown but
        ///     leaves the cumulative failure counter, last error and last failure time intact.
        ///     Unknown node names are ignored.
        /// </summary>
        /// <param name="node">Node name; matched case-insensitively, surrounding whitespace ignored.</param>
        public void MarkHealthy(string node)
        {
            lock (_lock)
            {
                var entry = FindEntry(node);
                if (entry == null)
                {
                    return;
                }

                entry.CooldownUntil = null;
            }
        }

        /// <summary>
        ///     Captures the current per-node state. Health is freshly computed from
        ///     <see cref="_clock"/> so recently-expired cooldowns are reported as healthy, with
        ///     <c>CooldownUntil</c> reported as null for healthy nodes.
        /// </summary>
        /// <returns>One state object per configured node, in configuration order.</returns>
        public List<HistorianClusterNodeState> SnapshotNodeStates()
        {
            lock (_lock)
            {
                var now = _clock();
                return _nodes.Select(n =>
                {
                    // Evaluate health once so IsHealthy and CooldownUntil cannot disagree.
                    var healthy = IsHealthyAt(n, now);
                    return new HistorianClusterNodeState
                    {
                        Name = n.Name,
                        IsHealthy = healthy,
                        CooldownUntil = healthy ? null : n.CooldownUntil,
                        FailureCount = n.FailureCount,
                        LastError = n.LastError,
                        LastFailureTime = n.LastFailureTime
                    };
                }).ToList();
            }
        }

        /// <summary>
        ///     Normalizes raw configured names into node entries: drops null/blank names, trims
        ///     whitespace, and removes case-insensitive duplicates while preserving order.
        /// </summary>
        private static List<NodeEntry> BuildNodes(IEnumerable<string>? names)
        {
            if (names == null)
            {
                return new List<NodeEntry>();
            }

            return names
                .Where(n => !string.IsNullOrWhiteSpace(n))
                .Select(n => n.Trim())
                .Distinct(StringComparer.OrdinalIgnoreCase)
                .Select(n => new NodeEntry { Name = n })
                .ToList();
        }

        /// <summary>
        ///     A node is healthy when it has no cooldown or its cooldown ended at or before
        ///     <paramref name="now"/>.
        /// </summary>
        private static bool IsHealthyAt(NodeEntry entry, DateTime now)
        {
            return entry.CooldownUntil == null || entry.CooldownUntil <= now;
        }

        /// <summary>
        ///     Looks up a node by name (case-insensitive). The name is trimmed first because
        ///     configured names are stored trimmed. Returns null for null, blank or unknown names.
        ///     Callers must hold <see cref="_lock"/>.
        /// </summary>
        private NodeEntry? FindEntry(string node)
        {
            if (string.IsNullOrWhiteSpace(node))
            {
                return null;
            }

            var wanted = node.Trim();
            foreach (var candidate in _nodes)
            {
                if (string.Equals(candidate.Name, wanted, StringComparison.OrdinalIgnoreCase))
                {
                    return candidate;
                }
            }

            return null;
        }

        /// <summary>Mutable per-node bookkeeping; only accessed while holding the picker's lock.</summary>
        private sealed class NodeEntry
        {
            public string Name { get; set; } = "";
            public DateTime? CooldownUntil { get; set; }
            public int FailureCount { get; set; }
            public string? LastError { get; set; }
            public DateTime? LastFailureTime { get; set; }
        }
    }
}