PR 4.7 — Host-connectivity probes (IHostConnectivityProbe scaffold)
HostStatusAggregator merges transport + per-platform host entries with change-event diffing (re-asserting same state is a no-op so a stable ScanState=Running burst doesn't fan out duplicates). PerPlatformProbeWatcher ports the legacy GalaxyRuntimeProbeManager state machine onto the gw subscription path: SubscribeBulk for `<tag>.ScanState`, idempotent SyncPlatformsAsync (subscribe new, unsubscribe dropped), and a DecodeState helper pinning bool/int/string ScanState values + bad-quality fallback. HostConnectivityForwarder is the skeleton for the gw-6 StreamSessionHealth signal — until that mxaccessgw RPC ships, PR 4.5's ReconnectSupervisor pushes transport state by calling SetTransport on session connect/disconnect. GalaxyDriver wiring (implement IHostConnectivityProbe, route OnDataChange to PerPlatformProbeWatcher, expose GetHostStatuses() / OnHostStatusChanged, push transport from supervisor) is deferred to PR 4.W to avoid conflict with the rest of the Phase 4 deferred wiring (4.5 supervisor + 4.6 DeployWatcher). Tests: 19 new - HostStatusAggregatorTests (9): empty snapshot, new-host change with Unknown predecessor, same-state silence, transition diff, snapshot reflects every host, case-insensitive host names, Remove returns true for tracked, Remove false for unknown, concurrent updates don't corrupt. - HostConnectivityForwarderTests (5): SetTransport routes under client name, transitions fire change, repeated same-state silent, empty client name throws, post-dispose throws. - PerPlatformProbeWatcherTests (5 + theory pinning DecodeState's full truth table): subscribe N platforms, idempotent re-sync, removed platforms unsubscribed + dropped from aggregator, OnProbeValueChanged routing for Running/Stopped/bad-quality/foreign-ref, Dispose unsubscribes everything. NOTE: build is currently broken because mxaccessgw/clients/dotnet/ has been removed from C:\Users\dohertj2\Desktop\mxaccessgw — this PR's source is internally consistent and isolated from the missing dependency, but the existing Driver.Galaxy code (PRs 4.1–4.6) can't compile until the .NET client is restored. Once it is, expect 116 + 19 = 135 tests in the Driver.Galaxy.Tests project. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,58 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
|
||||
|
||||
/// <summary>
|
||||
/// Pushes the synthetic top-level transport-health entry into the
|
||||
/// <see cref="HostStatusAggregator"/>. Each driver instance has one entry under its
|
||||
/// <c>MxAccess.ClientName</c> reflecting the gateway transport state — useful for
|
||||
/// dashboards that want a single "Galaxy is up" signal independent of any individual
|
||||
/// platform's ScanState.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The eventual production source for this signal is the gateway's <c>StreamSessionHealth</c>
|
||||
/// RPC (mxaccessgw issue gw-6). Until that ships, the driver-side reconnect supervisor
|
||||
/// (PR 4.5) calls <see cref="SetTransport"/> on transport state transitions:
|
||||
/// <see cref="HostState.Running"/> when the gw session re-Registers, <see cref="HostState.Stopped"/>
|
||||
/// when the supervisor moves to <c>TransportLost</c>. The forwarder is intentionally
|
||||
/// stateless beyond the cached client name + last-pushed value so the supervisor can
|
||||
/// drive it without any back-pressure plumbing.
|
||||
/// </remarks>
|
||||
public sealed class HostConnectivityForwarder : IDisposable
|
||||
{
|
||||
private readonly string _clientName;
|
||||
private readonly HostStatusAggregator _aggregator;
|
||||
private readonly ILogger _logger;
|
||||
private bool _disposed;
|
||||
|
||||
public HostConnectivityForwarder(string clientName, HostStatusAggregator aggregator, ILogger? logger = null)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(clientName);
|
||||
_clientName = clientName;
|
||||
_aggregator = aggregator ?? throw new ArgumentNullException(nameof(aggregator));
|
||||
_logger = logger ?? NullLogger.Instance;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Push a transport state into the aggregator. Idempotent at the aggregator layer —
|
||||
/// repeated calls with the same state don't fan out duplicate transitions.
|
||||
/// </summary>
|
||||
public void SetTransport(HostState state)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
var status = new HostConnectivityStatus(_clientName, state, DateTime.UtcNow);
|
||||
_aggregator.Update(status);
|
||||
_logger.LogDebug(
|
||||
"GalaxyDriver transport state for {ClientName}: {State}",
|
||||
_clientName, state);
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
// No-op today; reserved for the eventual gw-6 StreamSessionHealth consumer that
|
||||
// will own a long-running task this method tears down.
|
||||
_disposed = true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
|
||||
|
||||
/// <summary>
|
||||
/// Pure-logic merger for the per-host connectivity entries that
|
||||
/// <see cref="IHostConnectivityProbe"/> surfaces. Holds the current set of host
|
||||
/// statuses (one synthetic top-level transport entry plus one entry per
|
||||
/// <c>$WinPlatform</c>/<c>$AppEngine</c> probe) and emits
|
||||
/// <see cref="OnHostStatusChanged"/> only when an upsert actually changes a host's
|
||||
/// <see cref="HostState"/> — re-asserting the same state is a no-op so a stable
|
||||
/// <c>ScanState=Running</c> burst doesn't fan out duplicate transitions.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// This class owns the de-dup + diff logic that lived in
|
||||
/// <c>GalaxyProxyDriver.OnHostConnectivityUpdate</c> in v1. The watcher
|
||||
/// (<see cref="PerPlatformProbeWatcher"/>) and the transport forwarder
|
||||
/// (<see cref="HostConnectivityForwarder"/>) both feed this aggregator; the
|
||||
/// <see cref="GalaxyDriver"/> consumes <see cref="Snapshot"/> from
|
||||
/// <c>IHostConnectivityProbe.GetHostStatuses()</c> and re-raises
|
||||
/// <see cref="OnHostStatusChanged"/> as the driver-level event in a follow-up PR.
|
||||
/// </remarks>
|
||||
public sealed class HostStatusAggregator
|
||||
{
|
||||
private readonly object _lock = new();
|
||||
private readonly Dictionary<string, HostConnectivityStatus> _byHost =
|
||||
new(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
/// <summary>
|
||||
/// Fires when an <see cref="Update"/> call either introduces a new host or
|
||||
/// transitions an existing host's <see cref="HostState"/>. Handlers run
|
||||
/// outside the internal lock so they can safely re-enter the aggregator
|
||||
/// (e.g. the driver re-broadcasting through <c>IHostConnectivityProbe</c>).
|
||||
/// </summary>
|
||||
public event EventHandler<HostStatusChangedEventArgs>? OnHostStatusChanged;
|
||||
|
||||
/// <summary>
|
||||
/// Snapshot the current host set. Suitable as the body of
|
||||
/// <c>IHostConnectivityProbe.GetHostStatuses()</c>.
|
||||
/// </summary>
|
||||
public IReadOnlyList<HostConnectivityStatus> Snapshot()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
return [.. _byHost.Values];
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Upsert the supplied status by <see cref="HostConnectivityStatus.HostName"/>.
|
||||
/// Raises <see cref="OnHostStatusChanged"/> when the host is newly tracked
|
||||
/// (previous state reported as <see cref="HostState.Unknown"/>) or when its
|
||||
/// state value differs from the last cached entry. Re-asserting the same
|
||||
/// state is silent.
|
||||
/// </summary>
|
||||
public void Update(HostConnectivityStatus status)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(status);
|
||||
|
||||
HostState previous;
|
||||
bool changed;
|
||||
lock (_lock)
|
||||
{
|
||||
if (_byHost.TryGetValue(status.HostName, out var existing))
|
||||
{
|
||||
previous = existing.State;
|
||||
changed = existing.State != status.State;
|
||||
}
|
||||
else
|
||||
{
|
||||
previous = HostState.Unknown;
|
||||
changed = true;
|
||||
}
|
||||
|
||||
_byHost[status.HostName] = status;
|
||||
}
|
||||
|
||||
if (changed)
|
||||
{
|
||||
OnHostStatusChanged?.Invoke(this,
|
||||
new HostStatusChangedEventArgs(status.HostName, previous, status.State));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Drop a host entirely (e.g. after a redeploy removes a Platform). No event
|
||||
/// is fired — observers only react to live transitions, not topology
|
||||
/// reductions. Returns <c>true</c> when the host was tracked.
|
||||
/// </summary>
|
||||
public bool Remove(string hostName)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
|
||||
lock (_lock)
|
||||
{
|
||||
return _byHost.Remove(hostName);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,188 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
|
||||
|
||||
/// <summary>
|
||||
/// Subscribes the <c>ScanState</c> attribute of every <c>$WinPlatform</c> /
|
||||
/// <c>$AppEngine</c> object the discoverer surfaced and translates ScanState
|
||||
/// value-changes into per-host <see cref="HostConnectivityStatus"/> updates.
|
||||
/// Ports the state machine in
|
||||
/// <c>Driver.Galaxy.Host/Backend/Stability/GalaxyRuntimeProbeManager.cs</c> onto the
|
||||
/// gateway subscription path.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Address grammar: each platform tag's probe address is
|
||||
/// <c>{platformTagName}.ScanState</c>. The watcher subscribes that address through
|
||||
/// <see cref="IGalaxySubscriber"/>; the EventPump (PR 4.4) routes inbound
|
||||
/// OnDataChange events back via <see cref="OnProbeValueChanged"/>. State decoding:
|
||||
/// <list type="bullet">
|
||||
/// <item>Quality < <c>192</c> (Good) → <see cref="HostState.Unknown"/>.</item>
|
||||
/// <item>Value <c>1</c>, <c>true</c>, or "Running" → <see cref="HostState.Running"/>.</item>
|
||||
/// <item>Value <c>0</c>, <c>false</c>, or "Stopped" → <see cref="HostState.Stopped"/>.</item>
|
||||
/// <item>Anything else with Good quality → <see cref="HostState.Faulted"/>.</item>
|
||||
/// </list>
|
||||
/// <see cref="SyncPlatformsAsync"/> is idempotent — call it after every
|
||||
/// Discover / Rediscover. Newly-added platforms are subscribed; removed ones are
|
||||
/// unsubscribed and dropped from the aggregator.
|
||||
/// </remarks>
|
||||
public sealed class PerPlatformProbeWatcher : IDisposable
|
||||
{
|
||||
public const string ProbeSuffix = ".ScanState";
|
||||
|
||||
private readonly IGalaxySubscriber _subscriber;
|
||||
private readonly HostStatusAggregator _aggregator;
|
||||
private readonly ILogger _logger;
|
||||
|
||||
// Tracked platform → gw item handle. Item handle 0 means the gw rejected the subscribe;
|
||||
// we keep the entry so SyncPlatformsAsync doesn't try to subscribe it again on every call.
|
||||
private readonly ConcurrentDictionary<string, int> _itemHandlesByPlatform =
|
||||
new(StringComparer.OrdinalIgnoreCase);
|
||||
private readonly Lock _syncLock = new();
|
||||
private bool _disposed;
|
||||
|
||||
public PerPlatformProbeWatcher(
|
||||
IGalaxySubscriber subscriber, HostStatusAggregator aggregator, ILogger? logger = null)
|
||||
{
|
||||
_subscriber = subscriber ?? throw new ArgumentNullException(nameof(subscriber));
|
||||
_aggregator = aggregator ?? throw new ArgumentNullException(nameof(aggregator));
|
||||
_logger = logger ?? NullLogger.Instance;
|
||||
}
|
||||
|
||||
/// <summary>Snapshot of platform tag names currently watched.</summary>
|
||||
public IReadOnlyCollection<string> WatchedPlatforms => [.. _itemHandlesByPlatform.Keys];
|
||||
|
||||
/// <summary>
|
||||
/// Reconcile the watched platform set against <paramref name="platformTagNames"/>.
|
||||
/// Subscribes new entries, unsubscribes dropped ones. Calling with the same set is
|
||||
/// a no-op.
|
||||
/// </summary>
|
||||
public async Task SyncPlatformsAsync(
|
||||
IEnumerable<string> platformTagNames, CancellationToken cancellationToken)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
ArgumentNullException.ThrowIfNull(platformTagNames);
|
||||
|
||||
var desired = new HashSet<string>(platformTagNames, StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
// Compute deltas under the lock so concurrent SyncPlatformsAsync calls don't
|
||||
// race on the membership view.
|
||||
List<string> toAdd;
|
||||
List<(string Platform, int ItemHandle)> toRemove;
|
||||
lock (_syncLock)
|
||||
{
|
||||
toAdd = [.. desired.Where(p => !_itemHandlesByPlatform.ContainsKey(p))];
|
||||
toRemove = [.. _itemHandlesByPlatform
|
||||
.Where(kvp => !desired.Contains(kvp.Key) && kvp.Value > 0)
|
||||
.Select(kvp => (kvp.Key, kvp.Value))];
|
||||
|
||||
// Drop removed entries from the membership map up-front so a concurrent
|
||||
// OnProbeValueChanged for them is silently ignored. The unsubscribe RPC
|
||||
// runs outside the lock.
|
||||
foreach (var (platform, _) in toRemove)
|
||||
{
|
||||
_itemHandlesByPlatform.TryRemove(platform, out _);
|
||||
_aggregator.Remove(platform);
|
||||
}
|
||||
}
|
||||
|
||||
if (toRemove.Count > 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _subscriber.UnsubscribeBulkAsync(
|
||||
[.. toRemove.Select(t => t.ItemHandle)], cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"PerPlatformProbeWatcher unsubscribe failed for {Count} probe(s); aggregator entries already cleared.",
|
||||
toRemove.Count);
|
||||
}
|
||||
}
|
||||
|
||||
if (toAdd.Count == 0) return;
|
||||
|
||||
var probeAddresses = toAdd.Select(p => p + ProbeSuffix).ToArray();
|
||||
// bufferedUpdateInterval=0 — probe ScanState changes are rare enough that the gw's
|
||||
// default cadence is fine; explicit polling rate goes through PR 6.3.
|
||||
var results = await _subscriber.SubscribeBulkAsync(
|
||||
probeAddresses, bufferedUpdateIntervalMs: 0, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
for (var i = 0; i < toAdd.Count; i++)
|
||||
{
|
||||
var platform = toAdd[i];
|
||||
var match = results.FirstOrDefault(r => string.Equals(
|
||||
r.TagAddress, probeAddresses[i], StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
|
||||
_itemHandlesByPlatform[platform] = itemHandle;
|
||||
|
||||
if (itemHandle <= 0)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"PerPlatformProbeWatcher subscribe failed for {Platform}: {Error}",
|
||||
platform, match?.ErrorMessage ?? "<no result returned>");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Route an OnDataChange for a probe address into the aggregator. The EventPump
|
||||
/// (PR 4.4) calls this; tests can drive it directly to exercise the state machine
|
||||
/// without spinning a real gw. Foreign references (anything not ending in
|
||||
/// <see cref="ProbeSuffix"/>, or a probe for a platform we're not tracking) are
|
||||
/// silently dropped.
|
||||
/// </summary>
|
||||
public void OnProbeValueChanged(string fullReference, object? value, byte qualityByte)
|
||||
{
|
||||
if (_disposed) return;
|
||||
ArgumentNullException.ThrowIfNull(fullReference);
|
||||
|
||||
if (!fullReference.EndsWith(ProbeSuffix, StringComparison.OrdinalIgnoreCase)) return;
|
||||
var platform = fullReference[..^ProbeSuffix.Length];
|
||||
if (!_itemHandlesByPlatform.ContainsKey(platform)) return;
|
||||
|
||||
var state = DecodeState(value, qualityByte);
|
||||
_aggregator.Update(new HostConnectivityStatus(platform, state, DateTime.UtcNow));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Decode a ScanState value + raw quality byte to a <see cref="HostState"/>.
|
||||
/// Public for tests that want to pin the decoding table.
|
||||
/// </summary>
|
||||
public static HostState DecodeState(object? value, byte qualityByte)
|
||||
{
|
||||
if (qualityByte < 192) return HostState.Unknown;
|
||||
|
||||
return value switch
|
||||
{
|
||||
bool b => b ? HostState.Running : HostState.Stopped,
|
||||
int i => i == 1 ? HostState.Running : i == 0 ? HostState.Stopped : HostState.Faulted,
|
||||
short s => s == 1 ? HostState.Running : s == 0 ? HostState.Stopped : HostState.Faulted,
|
||||
long l => l == 1 ? HostState.Running : l == 0 ? HostState.Stopped : HostState.Faulted,
|
||||
string str when string.Equals(str, "Running", StringComparison.OrdinalIgnoreCase) => HostState.Running,
|
||||
string str when string.Equals(str, "Stopped", StringComparison.OrdinalIgnoreCase) => HostState.Stopped,
|
||||
_ => HostState.Faulted,
|
||||
};
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
|
||||
// Best-effort unsubscribe everything we know about. Run synchronously through
|
||||
// GetAwaiter().GetResult() since Dispose is sync; transport errors are swallowed.
|
||||
var liveHandles = _itemHandlesByPlatform.Values.Where(h => h > 0).ToArray();
|
||||
_itemHandlesByPlatform.Clear();
|
||||
if (liveHandles.Length > 0)
|
||||
{
|
||||
try { _subscriber.UnsubscribeBulkAsync(liveHandles, CancellationToken.None).GetAwaiter().GetResult(); }
|
||||
catch (Exception ex) { _logger.LogWarning(ex, "PerPlatformProbeWatcher dispose unsubscribe failed"); }
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user