Renames all 11 projects (5 src + 6 tests), the .slnx solution file, all source-file namespaces, all axaml namespace references, and all v1 documentation references in CLAUDE.md and docs/*.md (excluding docs/v2/ which is already in OtOpcUa form). Also updates the TopShelf service registration name from "LmxOpcUa" to "OtOpcUa" per Phase 0 Task 0.6.
Preserves runtime identifiers per Phase 0 Out-of-Scope rules to avoid breaking v1/v2 client trust during coexistence: OPC UA `ApplicationUri` defaults (`urn:{GalaxyName}:LmxOpcUa`), server `EndpointPath` (`/LmxOpcUa`), `ServerName` default (feeds cert subject CN), `MxAccessConfiguration.ClientName` default (defensive — stays "LmxOpcUa" for MxAccess audit-trail consistency), client OPC UA identifiers (`ApplicationName = "LmxOpcUaClient"`, `ApplicationUri = "urn:localhost:LmxOpcUaClient"`, cert directory `%LocalAppData%\LmxOpcUaClient\pki\`), and the `LmxOpcUaServer` class name (class rename out of Phase 0 scope per Task 0.5 sed pattern; happens in Phase 1 alongside `LmxNodeManager → GenericDriverNodeManager` Core extraction). 23 LmxOpcUa references retained, all enumerated and justified in `docs/v2/implementation/exit-gate-phase-0.md`.
Build clean: 0 errors, 30 warnings (lower than baseline 167). Tests at strict improvement over baseline: 821 passing / 1 failing vs baseline 820 / 2 (one flaky pre-existing failure passed this run; the other still fails — both pre-existing and unrelated to the rename). `Client.UI.Tests`, `Historian.Aveva.Tests`, `Client.Shared.Tests`, `IntegrationTests` all match baseline exactly. Exit gate compliance results recorded in `docs/v2/implementation/exit-gate-phase-0.md` with all 7 checks PASS or DEFERRED-to-PR-review (#7 service install verification needs Windows service permissions on the reviewer's box).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
473 lines
21 KiB
C#
473 lines
21 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Threading.Tasks;
|
|
using Serilog;
|
|
using ZB.MOM.WW.OtOpcUa.Host.Domain;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Host.MxAccess
|
|
{
|
|
/// <summary>
|
|
/// Advises <c>&lt;ObjectName&gt;.ScanState</c> on every deployed <c>$WinPlatform</c> and
|
|
/// <c>$AppEngine</c>, tracks their runtime state (Unknown / Running / Stopped), and notifies
|
|
/// the owning node manager on Running↔Stopped transitions so it can proactively flip every
|
|
/// OPC UA variable hosted by that object to <c>BadOutOfService</c> (and clear on recovery).
|
|
/// </summary>
|
|
/// <remarks>
|
|
/// State machine semantics are documented in <c>runtimestatus.md</c>. Key facts:
|
|
/// <list type="bullet">
|
|
/// <item><c>ScanState</c> is delivered on-change only — no periodic heartbeat. A stably
|
|
/// Running host may go hours without a callback.</item>
|
|
/// <item>Running → Stopped is driven by explicit error callbacks or <c>ScanState = false</c>,
|
|
/// NEVER by starvation. The only starvation check applies to the initial Unknown state.</item>
|
|
/// <item>When the MxAccess transport is disconnected, <see cref="GetSnapshot"/> returns every
|
|
/// entry with <see cref="GalaxyRuntimeState.Unknown"/> regardless of the underlying state,
|
|
/// because we can't observe anything through a dead transport.</item>
|
|
/// <item>The stop/start callbacks fire synchronously from whichever thread delivered the
|
|
/// probe update. The manager releases its own lock before invoking them to avoid
|
|
/// lock-inversion deadlocks with the node manager's <c>Lock</c>.</item>
|
|
/// </list>
|
|
/// </remarks>
|
|
public sealed class GalaxyRuntimeProbeManager : IDisposable
|
|
{
|
|
private static readonly ILogger Log = Serilog.Log.ForContext<GalaxyRuntimeProbeManager>();

// $WinPlatform hosts: the CategoryId value matched in SyncAsync and the label stored in
// GalaxyRuntimeStatus.Kind.
private const int CategoryWinPlatform = 1;
private const string KindWinPlatform = "$WinPlatform";

// $AppEngine hosts: same pairing of CategoryId value and Kind label.
private const int CategoryAppEngine = 3;
private const string KindAppEngine = "$AppEngine";

// Suffix appended to a host's tag name to build its probe tag reference.
private const string ProbeAttribute = ".ScanState";

private readonly IMxAccessClient _client;
private readonly TimeSpan _unknownTimeout;
private readonly Action<int>? _onHostStopped;
private readonly Action<int>? _onHostRunning;
private readonly Func<DateTime> _clock;

// Probe tag reference (e.g. "DevAppEngine.ScanState") -> live runtime status of that host.
// Updated on every probe callback and cloned out by GetSnapshot / GetHostStatus.
// Keys compare case-insensitively.
private readonly Dictionary<string, GalaxyRuntimeStatus> _byProbe =
    new Dictionary<string, GalaxyRuntimeStatus>(StringComparer.OrdinalIgnoreCase);

// gobject_id -> probe tag reference. Lets SyncAsync diff added / removed hosts and lets the
// per-host lookups resolve a status from a gobject id.
private readonly Dictionary<int, string> _probeByGobjectId = new Dictionary<int, string>();

// Guards both dictionaries and the mutable status objects they hold.
private readonly object _lock = new object();
private bool _disposed;
|
|
|
|
/// <summary>
/// Creates a probe manager that timestamps state changes with <see cref="DateTime.UtcNow"/>.
/// </summary>
/// <param name="client">MxAccess client used to advise and unadvise the probe tags. Must not be null.</param>
/// <param name="unknownTimeoutSeconds">
/// How long a host may remain Unknown before <see cref="Tick"/> flips it to Stopped.
/// Values below 1 are treated as 1.
/// </param>
/// <param name="onHostStopped">
/// Optional callback, invoked synchronously with the host's gobject id when a host
/// transitions to Stopped, so the owning node manager can invalidate the hosted subtree.
/// </param>
/// <param name="onHostRunning">
/// Optional callback, invoked synchronously with the host's gobject id when a host recovers
/// from Stopped to Running, so the owning node manager can restore the hosted subtree.
/// </param>
public GalaxyRuntimeProbeManager(
    IMxAccessClient client,
    int unknownTimeoutSeconds,
    Action<int>? onHostStopped = null,
    Action<int>? onHostRunning = null)
    : this(
        client,
        unknownTimeoutSeconds,
        onHostStopped,
        onHostRunning,
        clock: () => DateTime.UtcNow)
{
}
|
|
|
|
/// <summary>
/// Internal overload that additionally takes the time source used for every timestamp and
/// for the Unknown-state timeout calculation.
/// </summary>
/// <exception cref="ArgumentNullException">
/// <paramref name="client"/> or <paramref name="clock"/> is <see langword="null"/>.
/// </exception>
internal GalaxyRuntimeProbeManager(
    IMxAccessClient client,
    int unknownTimeoutSeconds,
    Action<int>? onHostStopped,
    Action<int>? onHostRunning,
    Func<DateTime> clock)
{
    if (client == null)
        throw new ArgumentNullException(nameof(client));
    if (clock == null)
        throw new ArgumentNullException(nameof(clock));

    _client = client;
    _clock = clock;
    _onHostStopped = onHostStopped;
    _onHostRunning = onHostRunning;

    // Clamp to at least one second so a zero or negative setting cannot produce a
    // non-positive timeout.
    var seconds = unknownTimeoutSeconds < 1 ? 1 : unknownTimeoutSeconds;
    _unknownTimeout = TimeSpan.FromSeconds(seconds);
}
|
|
|
|
/// <summary>
/// Number of probe entries currently tracked by this manager. Shown on the dashboard
/// Subscriptions panel so operators can tell bridge-owned probes apart from the total.
/// </summary>
public int ActiveProbeCount
{
    get
    {
        lock (_lock)
        {
            var tracked = _byProbe.Count;
            return tracked;
        }
    }
}
|
|
|
|
/// <summary>
/// Tells whether the runtime host identified by <paramref name="gobjectId"/> is currently
/// recorded as <see cref="GalaxyRuntimeState.Stopped"/>. The node manager's Read path uses
/// this to short-circuit on-demand reads of tags hosted by a known-stopped runtime object,
/// so MxAccess cannot serve stale cached values as Good.
/// </summary>
/// <param name="gobjectId">Galaxy object id of the host to look up.</param>
/// <returns>
/// <see langword="true"/> only when a probe is registered for the host and its stored state
/// is Stopped; <see langword="false"/> for unknown ids and for every other state.
/// </returns>
/// <remarks>
/// Unlike <see cref="GetSnapshot"/>, the stored state is used as-is: a disconnected transport
/// does not change the answer (hosts keep reporting their last-known state). Connection loss
/// is handled by the normal MxAccess error paths, and this method must not double-flag it.
/// </remarks>
public bool IsHostStopped(int gobjectId)
{
    lock (_lock)
    {
        return _probeByGobjectId.TryGetValue(gobjectId, out var probeTag)
               && _byProbe.TryGetValue(probeTag, out var hostStatus)
               && hostStatus.State == GalaxyRuntimeState.Stopped;
    }
}
|
|
|
|
/// <summary>
/// Looks up the host identified by <paramref name="gobjectId"/> and returns a point-in-time
/// copy of its runtime status. The node manager uses it to populate the synthetic
/// <c>$RuntimeState</c> child variables on each host object.
/// </summary>
/// <param name="gobjectId">Galaxy object id of the host to look up.</param>
/// <returns>
/// A detached clone of the stored status, or <see langword="null"/> when no probe is
/// registered for that object.
/// </returns>
/// <remarks>
/// The stored state is returned unmodified (no transport-gated rewrite to Unknown), matching
/// <see cref="IsHostStopped"/>.
/// </remarks>
public GalaxyRuntimeStatus? GetHostStatus(int gobjectId)
{
    lock (_lock)
    {
        if (!_probeByGobjectId.TryGetValue(gobjectId, out var probeTag))
            return null;

        if (!_byProbe.TryGetValue(probeTag, out var live))
            return null;

        return Clone(live, forceUnknown: false);
    }
}
|
|
|
|
/// <summary>
/// Reconciles the active probe set with <paramref name="hierarchy"/>: hosts that appeared are
/// advised, and hosts that disappeared (or whose probe tag changed) are unadvised. Only rows
/// in the runtime host categories ($WinPlatform, $AppEngine) with a non-blank tag name are
/// considered; every other row is ignored. Idempotent: a repeat call with an unchanged
/// hierarchy issues no Advise / Unadvise for hosts that are already tracked.
/// </summary>
/// <param name="hierarchy">
/// The deployed galaxy objects. <see langword="null"/> makes the call a no-op.
/// </param>
/// <returns>A task that completes once every pending unadvise and advise has been attempted.</returns>
/// <remarks>
/// <see cref="IMxAccessClient.SubscribeAsync"/> is awaited one host at a time, so a galaxy
/// with N new runtime hosts costs ~N sequential round-trips. This is acceptable because the
/// method only runs during address-space build and rebuild, not on the hot path. Advise and
/// unadvise failures are logged and swallowed; a failed advise also rolls back the pending
/// entry so the host is retried on the next sync.
/// </remarks>
public async Task SyncAsync(IReadOnlyList<GalaxyObjectInfo> hierarchy)
{
    if (_disposed || hierarchy == null)
        return;

    // Filter to runtime hosts and project each one to its expected probe tag reference.
    var desired = new Dictionary<int, (string Probe, string Kind, GalaxyObjectInfo Obj)>();
    foreach (var obj in hierarchy)
    {
        if (obj.CategoryId != CategoryWinPlatform && obj.CategoryId != CategoryAppEngine)
            continue;
        if (string.IsNullOrWhiteSpace(obj.TagName))
            continue;

        var kind = obj.CategoryId == CategoryWinPlatform ? KindWinPlatform : KindAppEngine;
        desired[obj.GobjectId] = (obj.TagName + ProbeAttribute, kind, obj);
    }

    // Compute the diff under the lock, then release it before issuing SDK calls (which can
    // block). toSubscribe carries the gobject id alongside the probe name so the rollback
    // path on subscribe failure can unwind both dictionaries without a reverse lookup.
    var toSubscribe = new List<(int GobjectId, string Probe)>();
    var toUnsubscribe = new List<string>();

    lock (_lock)
    {
        // Re-check under the lock: Dispose may have run since the unsynchronized check
        // above, and registering entries now would advise probes on a disposed manager.
        if (_disposed)
            return;

        // Pass 1 - retire hosts that left the hierarchy or whose probe tag changed (a tag
        // rename is intentionally handled as remove + add). This MUST run before pass 2:
        // when a host reappears under a new gobject id with the same tag name, retiring the
        // old id afterwards would delete the status entry just created for the new id.
        var stale = _probeByGobjectId
            .Where(tracked => !desired.TryGetValue(tracked.Key, out var wanted)
                              || !string.Equals(tracked.Value, wanted.Probe, StringComparison.OrdinalIgnoreCase))
            .ToList();
        foreach (var tracked in stale)
        {
            toUnsubscribe.Add(tracked.Value);
            _byProbe.Remove(tracked.Value);
            _probeByGobjectId.Remove(tracked.Key);
        }

        // Pass 2 - register every desired host that is not (or is no longer) tracked.
        foreach (var wanted in desired)
        {
            if (_probeByGobjectId.ContainsKey(wanted.Key))
                continue;

            toSubscribe.Add((wanted.Key, wanted.Value.Probe));
            _byProbe[wanted.Value.Probe] = MakeInitialStatus(wanted.Value.Obj, wanted.Value.Kind);
            _probeByGobjectId[wanted.Key] = wanted.Value.Probe;
        }
    }

    // Apply the diff outside the lock. Unadvise first so that a probe name which is being
    // retired for one gobject id and claimed by another ends up advised, not unadvised.
    foreach (var probe in toUnsubscribe)
    {
        try
        {
            await _client.UnsubscribeAsync(probe);
        }
        catch (Exception ex)
        {
            Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during sync", probe);
        }
    }

    foreach (var (gobjectId, probe) in toSubscribe)
    {
        try
        {
            await _client.SubscribeAsync(probe, OnProbeValueChanged);
            Log.Information("Galaxy runtime probe advised: {Probe}", probe);
        }
        catch (Exception ex)
        {
            Log.Warning(ex, "Failed to advise galaxy runtime probe {Probe}", probe);

            // Roll back the pending entry so Tick() can't later transition a never-advised
            // probe from Unknown to Stopped and fan out a false-negative host-down signal.
            // A concurrent SyncAsync may have re-mapped the gobject or handed the probe name
            // to another gobject, so each map is only unwound when it still refers to the
            // registration made by this call.
            lock (_lock)
            {
                if (_probeByGobjectId.TryGetValue(gobjectId, out var current)
                    && string.Equals(current, probe, StringComparison.OrdinalIgnoreCase))
                {
                    _probeByGobjectId.Remove(gobjectId);
                }

                if (_byProbe.TryGetValue(probe, out var pending) && pending.GobjectId == gobjectId)
                {
                    _byProbe.Remove(probe);
                }
            }
        }
    }
}
|
|
|
|
/// <summary>
/// Routes an <c>OnTagValueChanged</c> callback to the probe state machine.
/// </summary>
/// <param name="tagRef">Tag reference the value was delivered for.</param>
/// <param name="vtq">The delivered value / timestamp / quality.</param>
/// <returns>
/// <see langword="true"/> when <paramref name="tagRef"/> matches a bridge-owned probe, in
/// which case the owning node manager should skip its normal variable-update path;
/// <see langword="false"/> when the manager is disposed, the reference is empty, or the tag
/// is not a tracked probe.
/// </returns>
/// <remarks>
/// A host counts as running only when the quality is good and the value is boolean
/// <see langword="true"/>; anything else moves it to Stopped. Transition callbacks are
/// invoked after the internal lock is released, and exceptions they throw are logged and
/// swallowed.
/// </remarks>
public bool HandleProbeUpdate(string tagRef, Vtq vtq)
{
    if (_disposed || string.IsNullOrEmpty(tagRef))
        return false;

    var gobjectId = 0;
    GalaxyRuntimeState? transitionTo = null;
    string? stopDetail = null;

    lock (_lock)
    {
        if (!_byProbe.TryGetValue(tagRef, out var status))
            return false; // not a probe — let the caller handle it normally

        var now = _clock();
        var isRunning = vtq.Quality.IsGood() && vtq.Value is bool b && b;
        status.LastStateCallbackTime = now;
        status.LastScanState = vtq.Value as bool?;

        if (isRunning)
        {
            status.GoodUpdateCount++;
            status.LastError = null;
            if (status.State != GalaxyRuntimeState.Running)
            {
                // Only fire the host-running callback on a true Stopped → Running
                // recovery. Unknown → Running happens once at startup for every host
                // and is not a recovery — firing ClearHostVariablesBadQuality there
                // would wipe Bad status set by the concurrently-stopping other host
                // on variables that span both lists.
                var wasStopped = status.State == GalaxyRuntimeState.Stopped;
                status.State = GalaxyRuntimeState.Running;
                status.LastStateChangeTime = now;
                if (wasStopped)
                {
                    transitionTo = GalaxyRuntimeState.Running;
                    gobjectId = status.GobjectId;
                }
            }
        }
        else
        {
            var detail = BuildErrorDetail(vtq);
            status.FailureCount++;
            status.LastError = detail;
            if (status.State != GalaxyRuntimeState.Stopped)
            {
                // Reached from Running and from Unknown alike; both notify the owner.
                status.State = GalaxyRuntimeState.Stopped;
                status.LastStateChangeTime = now;
                transitionTo = GalaxyRuntimeState.Stopped;
                gobjectId = status.GobjectId;

                // Capture the detail while still holding the lock: the status object is
                // shared and another callback may overwrite LastError once we release it.
                stopDetail = detail;
            }
        }
    }

    // Invoke transition callbacks outside the lock to avoid inverting the node manager's
    // lock order when it subsequently takes its own Lock to flip hosted variables.
    if (transitionTo == GalaxyRuntimeState.Stopped)
    {
        Log.Information("Galaxy runtime {Probe} transitioned Running → Stopped ({Err})",
            tagRef, stopDetail ?? "(no detail)");
        try { _onHostStopped?.Invoke(gobjectId); }
        catch (Exception ex) { Log.Warning(ex, "onHostStopped callback threw for {Probe}", tagRef); }
    }
    else if (transitionTo == GalaxyRuntimeState.Running)
    {
        Log.Information("Galaxy runtime {Probe} transitioned → Running", tagRef);
        try { _onHostRunning?.Invoke(gobjectId); }
        catch (Exception ex) { Log.Warning(ex, "onHostRunning callback threw for {Probe}", tagRef); }
    }

    return true;
}
|
|
|
|
/// <summary>
/// Periodic housekeeping: any host still in the Unknown state after more than the
/// configured timeout is flipped to Stopped and reported through the host-stopped callback.
/// Running and Stopped entries are left untouched.
/// </summary>
/// <remarks>
/// Callbacks run after the internal lock is released; exceptions they throw are logged and
/// swallowed.
/// </remarks>
public void Tick()
{
    if (_disposed)
        return;

    List<int> expiredHosts;
    lock (_lock)
    {
        var now = _clock();
        expiredHosts = new List<int>();

        foreach (var host in _byProbe.Values)
        {
            var stillUnknown = host.State == GalaxyRuntimeState.Unknown;
            if (!stillUnknown || !host.LastStateChangeTime.HasValue)
                continue;

            // LastStateChangeTime is stamped when the entry is created, so the wait is
            // measured from the moment the host was registered for probing.
            var waited = now - host.LastStateChangeTime.Value;
            if (waited <= _unknownTimeout)
                continue;

            host.State = GalaxyRuntimeState.Stopped;
            host.LastStateChangeTime = now;
            host.FailureCount++;
            host.LastError = "Probe never received an initial callback within the unknown-resolution timeout";
            expiredHosts.Add(host.GobjectId);
        }
    }

    foreach (var gobjectId in expiredHosts)
    {
        Log.Warning("Galaxy runtime gobject {GobjectId} timed out in Unknown state → Stopped", gobjectId);
        try
        {
            _onHostStopped?.Invoke(gobjectId);
        }
        catch (Exception ex)
        {
            Log.Warning(ex, "onHostStopped callback threw during tick for {GobjectId}", gobjectId);
        }
    }
}
|
|
|
|
/// <summary>
/// Produces a detached, name-ordered copy of every tracked host's status for display.
/// </summary>
/// <returns>
/// Clones of all tracked entries sorted ordinally by object name. While the MxAccess
/// transport is not connected, every clone reports Unknown with no error text so operators
/// aren't misled by cached per-host state — the Connection panel is the primary signal in
/// that case.
/// </returns>
/// <remarks>The underlying <c>_byProbe</c> map is never modified by this call.</remarks>
public IReadOnlyList<GalaxyRuntimeStatus> GetSnapshot()
{
    var maskAsUnknown = _client.State != ConnectionState.Connected;

    List<GalaxyRuntimeStatus> rows;
    lock (_lock)
    {
        rows = _byProbe.Values
            .Select(live => Clone(live, forceUnknown: maskAsUnknown))
            .ToList();
    }

    // The clones are private to this call, so ordering them needs no lock. A stable order
    // by name keeps dashboard rows from jittering between refreshes.
    rows.Sort((left, right) => string.CompareOrdinal(left.ObjectName, right.ObjectName));
    return rows;
}
|
|
|
|
/// <summary>
/// Marks the manager disposed, drops all tracked state and unadvises every probe that was
/// tracked at that moment. Safe to call more than once; only the first call does any work.
/// </summary>
/// <remarks>
/// Each unadvise is waited on synchronously, and a failure is logged at debug level without
/// stopping the remaining probes from being released.
/// </remarks>
public void Dispose()
{
    string[] advised;
    lock (_lock)
    {
        if (_disposed)
            return;

        _disposed = true;
        advised = _byProbe.Keys.ToArray();
        _probeByGobjectId.Clear();
        _byProbe.Clear();
    }

    // SDK calls happen outside the lock, mirroring the sync path.
    foreach (var probe in advised)
    {
        try
        {
            var pending = _client.UnsubscribeAsync(probe);
            pending.GetAwaiter().GetResult();
        }
        catch (Exception ex)
        {
            Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during Dispose", probe);
        }
    }
}
|
|
|
|
// Subscription callback handed to the MxAccess client for every probe; forwards to the
// state machine and ignores its "was this a probe" result.
private void OnProbeValueChanged(string tagRef, Vtq vtq) => HandleProbeUpdate(tagRef, vtq);
|
|
|
|
// Builds the status entry for a newly tracked host: state Unknown, with LastStateChangeTime
// stamped from the injected clock so Tick can measure how long the host has been waiting
// for its first callback.
private GalaxyRuntimeStatus MakeInitialStatus(GalaxyObjectInfo obj, string kind) =>
    new GalaxyRuntimeStatus
    {
        ObjectName = obj.TagName,
        GobjectId = obj.GobjectId,
        Kind = kind,
        State = GalaxyRuntimeState.Unknown,
        LastStateChangeTime = _clock()
    };
|
|
|
|
// Copies a status entry so callers never see (or mutate) the live object. With forceUnknown
// set, the copy reports Unknown and carries no error text; every other field is copied
// verbatim.
private static GalaxyRuntimeStatus Clone(GalaxyRuntimeStatus src, bool forceUnknown)
{
    var reportedState = forceUnknown ? GalaxyRuntimeState.Unknown : src.State;
    var reportedError = forceUnknown ? null : src.LastError;

    return new GalaxyRuntimeStatus
    {
        ObjectName = src.ObjectName,
        GobjectId = src.GobjectId,
        Kind = src.Kind,
        State = reportedState,
        LastStateCallbackTime = src.LastStateCallbackTime,
        LastStateChangeTime = src.LastStateChangeTime,
        LastScanState = src.LastScanState,
        LastError = reportedError,
        GoodUpdateCount = src.GoodUpdateCount,
        FailureCount = src.FailureCount
    };
}
|
|
|
|
// Produces the human-readable reason stored in LastError for a non-running probe update.
// Quality problems take precedence over the value; a good-quality boolean false is reported
// as OffScan; anything else is echoed as an unexpected value.
private static string BuildErrorDetail(Vtq vtq)
{
    var quality = vtq.Quality;

    if (quality.IsBad())
    {
        return $"bad quality ({quality})";
    }

    if (quality.IsUncertain())
    {
        return $"uncertain quality ({quality})";
    }

    if (vtq.Value is false)
    {
        return "ScanState = false (OffScan)";
    }

    return $"unexpected value: {vtq.Value ?? "(null)"}";
}
|
|
}
|
|
}
|