Track Galaxy Platform and AppEngine runtime state via ScanState probes and proactively invalidate descendant variable quality on Stopped transitions so operators can detect a stopped runtime host before downstream clients read stale data and so the bridge delivers a uniform bad-quality signal instead of relying on MxAccess per-tag fan-out

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-13 15:40:44 -04:00
parent 8f340553d9
commit 9d49cdcc58
18 changed files with 1831 additions and 14 deletions

View File

@@ -73,6 +73,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
"MxAccess.MonitorInterval={MonitorInterval}s, AutoReconnect={AutoReconnect}, ProbeTag={ProbeTag}, ProbeStaleThreshold={ProbeStale}s",
config.MxAccess.MonitorIntervalSeconds, config.MxAccess.AutoReconnect,
config.MxAccess.ProbeTag ?? "(none)", config.MxAccess.ProbeStaleThresholdSeconds);
Log.Information(
"MxAccess.RuntimeStatusProbesEnabled={Enabled}, RuntimeStatusUnknownTimeoutSeconds={Timeout}s",
config.MxAccess.RuntimeStatusProbesEnabled, config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
if (string.IsNullOrWhiteSpace(config.MxAccess.ClientName))
{
@@ -80,6 +83,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
valid = false;
}
if (config.MxAccess.RuntimeStatusUnknownTimeoutSeconds < 5)
Log.Warning(
"MxAccess.RuntimeStatusUnknownTimeoutSeconds={Timeout} is below the recommended floor of 5s; initial probe resolution may time out before MxAccess has delivered the first callback",
config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
// Galaxy Repository
Log.Information(
"GalaxyRepository.ConnectionString={ConnectionString}, ChangeDetectionInterval={ChangeInterval}s, CommandTimeout={CmdTimeout}s, ExtendedAttributes={ExtendedAttributes}",

View File

@@ -55,5 +55,22 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
/// Gets or sets the number of seconds a probe value may remain unchanged before the connection is considered stale.
/// </summary>
public int ProbeStaleThresholdSeconds { get; set; } = 60;
/// <summary>
/// Gets or sets a value indicating whether the bridge advises <c>&lt;ObjectName&gt;.ScanState</c> for every
/// deployed <c>$WinPlatform</c> and <c>$AppEngine</c>, reporting per-host runtime state on the status
/// dashboard and proactively invalidating OPC UA variable quality when a host transitions to Stopped.
/// Enabled by default. Disable to return to legacy behavior where host runtime state is invisible and
/// MxAccess's per-tag bad-quality fan-out is the only stop signal.
/// </summary>
public bool RuntimeStatusProbesEnabled { get; set; } = true;
/// <summary>
/// Gets or sets the maximum seconds to wait for the initial probe callback before marking a host as
/// Stopped. Only applies to the Unknown → Stopped transition. Because <c>ScanState</c> is delivered
/// on-change only, a stably Running host does not time out — no starvation check runs on Running
/// entries. Default 15s.
/// </summary>
public int RuntimeStatusUnknownTimeoutSeconds { get; set; } = 15;
}
}

View File

@@ -44,5 +44,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Domain
/// <see cref="AlarmObjectFilter"/> to decide whether an object's alarms should be monitored.
/// </summary>
public List<string> TemplateChain { get; set; } = new();
/// <summary>
/// Gets or sets the Galaxy template category id for this object. Category 1 is $WinPlatform,
/// 3 is $AppEngine, 13 is $Area, 10 is $UserDefined, and so on. Populated from
/// <c>template_definition.category_id</c> by <c>hierarchy.sql</c> and consumed by the runtime
/// status probe manager to identify hosts that should receive a <c>ScanState</c> probe.
/// </summary>
public int CategoryId { get; set; }
/// <summary>
/// Gets or sets the Galaxy object id of this object's runtime host, populated from
/// <c>gobject.hosted_by_gobject_id</c>. Walk this chain upward to find the nearest
/// <c>$WinPlatform</c> or <c>$AppEngine</c> ancestor for subtree quality invalidation when
/// a runtime host is reported Stopped. Zero for root objects that have no host.
/// </summary>
public int HostedByGobjectId { get; set; }
}
}

View File

@@ -0,0 +1,29 @@
namespace ZB.MOM.WW.LmxOpcUa.Host.Domain
{
/// <summary>
/// Runtime state of a deployed Galaxy runtime host ($WinPlatform or $AppEngine) as
/// observed by the bridge via its <c>ScanState</c> probe.
/// </summary>
public enum GalaxyRuntimeState
{
/// <summary>
/// Probe advised but no callback received yet. Transitions to <see cref="Running"/>
/// on the first successful <c>ScanState = true</c> callback, or to <see cref="Stopped"/>
/// once the unknown-resolution timeout elapses.
/// </summary>
Unknown,
/// <summary>
/// Last probe callback reported <c>ScanState = true</c> with a successful item status.
/// The host is on scan and executing.
/// </summary>
Running,
/// <summary>
/// Last probe callback reported <c>ScanState != true</c>, or a failed item status, or
/// the initial probe never resolved before the unknown timeout elapsed. The host is
/// off scan or unreachable.
/// </summary>
Stopped
}
}

View File

@@ -0,0 +1,72 @@
using System;
namespace ZB.MOM.WW.LmxOpcUa.Host.Domain
{
/// <summary>
/// Point-in-time runtime state of a single Galaxy runtime host ($WinPlatform or $AppEngine)
/// as tracked by the <c>GalaxyRuntimeProbeManager</c>. Surfaced on the status dashboard and
/// consumed by <c>HealthCheckService</c> so operators can detect a stopped host before
/// downstream clients notice the stale data.
/// </summary>
public sealed class GalaxyRuntimeStatus
{
/// <summary>
/// Gets or sets the Galaxy tag_name of the host (e.g., <c>DevPlatform</c> or
/// <c>DevAppEngine</c>).
/// </summary>
public string ObjectName { get; set; } = "";
/// <summary>
/// Gets or sets the Galaxy gobject_id of the host.
/// </summary>
public int GobjectId { get; set; }
/// <summary>
/// Gets or sets the Galaxy template category name — <c>$WinPlatform</c> or
/// <c>$AppEngine</c>. Used by the dashboard to group hosts by kind.
/// </summary>
public string Kind { get; set; } = "";
/// <summary>
/// Gets or sets the current runtime state.
/// </summary>
public GalaxyRuntimeState State { get; set; }
/// <summary>
/// Gets or sets the UTC timestamp of the most recent probe callback, whether it
/// reported success or failure. <see langword="null"/> before the first callback.
/// </summary>
public DateTime? LastStateCallbackTime { get; set; }
/// <summary>
/// Gets or sets the UTC timestamp of the most recent <see cref="State"/> transition.
/// Backs the dashboard "Since" column. <see langword="null"/> in the initial Unknown
/// state before any transition.
/// </summary>
public DateTime? LastStateChangeTime { get; set; }
/// <summary>
/// Gets or sets the last <c>ScanState</c> value received from the probe, or
/// <see langword="null"/> before the first update or when the last callback carried
/// a non-success item status (no value delivered).
/// </summary>
public bool? LastScanState { get; set; }
/// <summary>
/// Gets or sets the detail message from the most recent failure callback, cleared on
/// the next successful <c>ScanState = true</c> delivery.
/// </summary>
public string? LastError { get; set; }
/// <summary>
/// Gets or sets the cumulative number of callbacks where <c>ScanState = true</c>.
/// </summary>
public long GoodUpdateCount { get; set; }
/// <summary>
/// Gets or sets the cumulative number of callbacks where <c>ScanState != true</c>
/// or the item status reported failure.
/// </summary>
public long FailureCount { get; set; }
}
}

View File

@@ -50,7 +50,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.GalaxyRepository
while (await reader.ReadAsync(ct))
{
var templateChainRaw = reader.IsDBNull(6) ? "" : reader.GetString(6);
var templateChainRaw = reader.IsDBNull(8) ? "" : reader.GetString(8);
var templateChain = string.IsNullOrEmpty(templateChainRaw)
? new List<string>()
: templateChainRaw.Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries)
@@ -66,6 +66,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.GalaxyRepository
BrowseName = reader.GetString(3),
ParentGobjectId = Convert.ToInt32(reader.GetValue(4)),
IsArea = Convert.ToInt32(reader.GetValue(5)) == 1,
CategoryId = Convert.ToInt32(reader.GetValue(6)),
HostedByGobjectId = Convert.ToInt32(reader.GetValue(7)),
TemplateChain = templateChain
});
}
@@ -234,6 +236,8 @@ SELECT DISTINCT
THEN 1
ELSE 0
END AS is_area,
td.category_id AS category_id,
g.hosted_by_gobject_id AS hosted_by_gobject_id,
ISNULL(
STUFF((
SELECT '|' + tc.template_tag_name

View File

@@ -0,0 +1,404 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Serilog;
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
{
/// <summary>
/// Advises <c>&lt;ObjectName&gt;.ScanState</c> on every deployed <c>$WinPlatform</c> and
/// <c>$AppEngine</c>, tracks their runtime state (Unknown / Running / Stopped), and notifies
/// the owning node manager on Running↔Stopped transitions so it can proactively flip every
/// OPC UA variable hosted by that object to <c>BadOutOfService</c> (and clear on recovery).
/// </summary>
/// <remarks>
/// State machine semantics are documented in <c>runtimestatus.md</c>. Key facts:
/// <list type="bullet">
/// <item><c>ScanState</c> is delivered on-change only — no periodic heartbeat. A stably
/// Running host may go hours without a callback.</item>
/// <item>Running → Stopped is driven by explicit error callbacks or <c>ScanState = false</c>,
/// NEVER by starvation. The only starvation check applies to the initial Unknown state.</item>
/// <item>When the MxAccess transport is disconnected, <see cref="GetSnapshot"/> returns every
/// entry with <see cref="GalaxyRuntimeState.Unknown"/> regardless of the underlying state,
/// because we can't observe anything through a dead transport.</item>
/// <item>The stop/start callbacks fire synchronously from whichever thread delivered the
/// probe update. The manager releases its own lock before invoking them to avoid
/// lock-inversion deadlocks with the node manager's <c>Lock</c>.</item>
/// </list>
/// </remarks>
public sealed class GalaxyRuntimeProbeManager : IDisposable
{
private static readonly ILogger Log = Serilog.Log.ForContext<GalaxyRuntimeProbeManager>();
private const int CategoryWinPlatform = 1;
private const int CategoryAppEngine = 3;
private const string KindWinPlatform = "$WinPlatform";
private const string KindAppEngine = "$AppEngine";
private const string ProbeAttribute = ".ScanState";
private readonly IMxAccessClient _client;
private readonly TimeSpan _unknownTimeout;
private readonly Action<int>? _onHostStopped;
private readonly Action<int>? _onHostRunning;
private readonly Func<DateTime> _clock;
// Key: probe tag reference (e.g. "DevAppEngine.ScanState").
// Value: the current runtime status for that host, kept in sync on every probe callback
// and queried via GetSnapshot for dashboard rendering.
private readonly Dictionary<string, GalaxyRuntimeStatus> _byProbe =
new Dictionary<string, GalaxyRuntimeStatus>(StringComparer.OrdinalIgnoreCase);
// Reverse index: gobject_id -> probe tag, so Sync() can diff new/removed hosts efficiently.
private readonly Dictionary<int, string> _probeByGobjectId = new Dictionary<int, string>();
private readonly object _lock = new object();
private bool _disposed;
/// <summary>
/// Initializes a new probe manager. <paramref name="onHostStopped"/> and
/// <paramref name="onHostRunning"/> are invoked synchronously on Running↔Stopped
/// transitions so the owning node manager can invalidate / restore the hosted subtree.
/// </summary>
public GalaxyRuntimeProbeManager(
IMxAccessClient client,
int unknownTimeoutSeconds,
Action<int>? onHostStopped = null,
Action<int>? onHostRunning = null)
: this(client, unknownTimeoutSeconds, onHostStopped, onHostRunning, () => DateTime.UtcNow)
{
}
internal GalaxyRuntimeProbeManager(
IMxAccessClient client,
int unknownTimeoutSeconds,
Action<int>? onHostStopped,
Action<int>? onHostRunning,
Func<DateTime> clock)
{
_client = client ?? throw new ArgumentNullException(nameof(client));
_unknownTimeout = TimeSpan.FromSeconds(Math.Max(1, unknownTimeoutSeconds));
_onHostStopped = onHostStopped;
_onHostRunning = onHostRunning;
_clock = clock ?? throw new ArgumentNullException(nameof(clock));
}
/// <summary>
/// Gets the number of active probe subscriptions. Surfaced on the dashboard Subscriptions
/// panel so operators can see bridge-owned probe count separately from the total.
/// </summary>
public int ActiveProbeCount
{
get
{
lock (_lock)
return _byProbe.Count;
}
}
/// <summary>
/// Diffs the supplied hierarchy against the active probe set, advising new hosts and
/// unadvising removed ones. The hierarchy is filtered to runtime host categories
/// ($WinPlatform, $AppEngine) — non-host rows are ignored. Idempotent: a second call
/// with the same hierarchy performs no Advise / Unadvise work.
/// </summary>
/// <remarks>
/// Sync is synchronous on MxAccess: <see cref="IMxAccessClient.SubscribeAsync"/> is
/// awaited for each new host, so for a galaxy with N runtime hosts the call blocks for
/// ~N round-trips. This is acceptable because it only runs during address-space build
/// and rebuild, not on the hot path.
/// </remarks>
public async Task SyncAsync(IReadOnlyList<GalaxyObjectInfo> hierarchy)
{
if (_disposed || hierarchy == null)
return;
// Filter to runtime hosts and project to the expected probe tag name.
var desired = new Dictionary<int, (string Probe, string Kind, GalaxyObjectInfo Obj)>();
foreach (var obj in hierarchy)
{
if (obj.CategoryId != CategoryWinPlatform && obj.CategoryId != CategoryAppEngine)
continue;
if (string.IsNullOrWhiteSpace(obj.TagName))
continue;
var probe = obj.TagName + ProbeAttribute;
var kind = obj.CategoryId == CategoryWinPlatform ? KindWinPlatform : KindAppEngine;
desired[obj.GobjectId] = (probe, kind, obj);
}
// Compute diffs under lock, release lock before issuing SDK calls (which can block).
List<string> toSubscribe;
List<string> toUnsubscribe;
lock (_lock)
{
toSubscribe = new List<string>();
toUnsubscribe = new List<string>();
foreach (var kvp in desired)
{
if (_probeByGobjectId.TryGetValue(kvp.Key, out var existingProbe))
{
// Already tracked: ensure the status entry is aligned (tag rename path is
// intentionally not supported — if the probe changed, treat it as remove+add).
if (!string.Equals(existingProbe, kvp.Value.Probe, StringComparison.OrdinalIgnoreCase))
{
toUnsubscribe.Add(existingProbe);
_byProbe.Remove(existingProbe);
_probeByGobjectId.Remove(kvp.Key);
toSubscribe.Add(kvp.Value.Probe);
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
}
}
else
{
toSubscribe.Add(kvp.Value.Probe);
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
}
}
// Remove hosts that are no longer in the desired set.
var toRemove = _probeByGobjectId.Keys.Where(id => !desired.ContainsKey(id)).ToList();
foreach (var id in toRemove)
{
var probe = _probeByGobjectId[id];
toUnsubscribe.Add(probe);
_byProbe.Remove(probe);
_probeByGobjectId.Remove(id);
}
}
// Apply the diff outside the lock.
foreach (var probe in toSubscribe)
{
try
{
await _client.SubscribeAsync(probe, OnProbeValueChanged);
Log.Information("Galaxy runtime probe advised: {Probe}", probe);
}
catch (Exception ex)
{
Log.Warning(ex, "Failed to advise galaxy runtime probe {Probe}", probe);
}
}
foreach (var probe in toUnsubscribe)
{
try
{
await _client.UnsubscribeAsync(probe);
}
catch (Exception ex)
{
Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during sync", probe);
}
}
}
/// <summary>
/// Routes an <c>OnTagValueChanged</c> callback to the probe state machine. Returns
/// <see langword="true"/> when <paramref name="tagRef"/> matches a bridge-owned probe
/// (in which case the owning node manager should skip its normal variable-update path).
/// </summary>
public bool HandleProbeUpdate(string tagRef, Vtq vtq)
{
if (_disposed || string.IsNullOrEmpty(tagRef))
return false;
GalaxyRuntimeStatus? status;
int fromToGobjectId = 0;
GalaxyRuntimeState? transitionTo = null;
lock (_lock)
{
if (!_byProbe.TryGetValue(tagRef, out status))
return false; // not a probe — let the caller handle it normally
var now = _clock();
var isRunning = vtq.Quality.IsGood() && vtq.Value is bool b && b;
status.LastStateCallbackTime = now;
status.LastScanState = vtq.Value as bool?;
if (isRunning)
{
status.GoodUpdateCount++;
status.LastError = null;
if (status.State != GalaxyRuntimeState.Running)
{
status.State = GalaxyRuntimeState.Running;
status.LastStateChangeTime = now;
transitionTo = GalaxyRuntimeState.Running;
fromToGobjectId = status.GobjectId;
}
}
else
{
status.FailureCount++;
status.LastError = BuildErrorDetail(vtq);
if (status.State != GalaxyRuntimeState.Stopped)
{
status.State = GalaxyRuntimeState.Stopped;
status.LastStateChangeTime = now;
transitionTo = GalaxyRuntimeState.Stopped;
fromToGobjectId = status.GobjectId;
}
}
}
// Invoke transition callbacks outside the lock to avoid inverting the node manager's
// lock order when it subsequently takes its own Lock to flip hosted variables.
if (transitionTo == GalaxyRuntimeState.Stopped)
{
Log.Information("Galaxy runtime {Probe} transitioned Running → Stopped ({Err})",
tagRef, status?.LastError ?? "(no detail)");
try { _onHostStopped?.Invoke(fromToGobjectId); }
catch (Exception ex) { Log.Warning(ex, "onHostStopped callback threw for {Probe}", tagRef); }
}
else if (transitionTo == GalaxyRuntimeState.Running)
{
Log.Information("Galaxy runtime {Probe} transitioned → Running", tagRef);
try { _onHostRunning?.Invoke(fromToGobjectId); }
catch (Exception ex) { Log.Warning(ex, "onHostRunning callback threw for {Probe}", tagRef); }
}
return true;
}
/// <summary>
/// Periodic tick — flips Unknown entries to Stopped once their registration has been
/// outstanding for longer than the configured timeout without ever receiving a first
/// callback. Does nothing to Running or Stopped entries.
/// </summary>
public void Tick()
{
if (_disposed)
return;
var transitions = new List<int>();
lock (_lock)
{
var now = _clock();
foreach (var entry in _byProbe.Values)
{
if (entry.State != GalaxyRuntimeState.Unknown)
continue;
// LastStateChangeTime is set at creation to "now" so the timeout is measured
// from when the probe was advised.
if (entry.LastStateChangeTime.HasValue
&& now - entry.LastStateChangeTime.Value > _unknownTimeout)
{
entry.State = GalaxyRuntimeState.Stopped;
entry.LastStateChangeTime = now;
entry.FailureCount++;
entry.LastError = "Probe never received an initial callback within the unknown-resolution timeout";
transitions.Add(entry.GobjectId);
}
}
}
foreach (var gobjectId in transitions)
{
Log.Warning("Galaxy runtime gobject {GobjectId} timed out in Unknown state → Stopped", gobjectId);
try { _onHostStopped?.Invoke(gobjectId); }
catch (Exception ex) { Log.Warning(ex, "onHostStopped callback threw during tick for {GobjectId}", gobjectId); }
}
}
/// <summary>
/// Returns a read-only snapshot of every tracked host. When the MxAccess transport is
/// disconnected, every entry is rewritten to Unknown on the way out so operators aren't
/// misled by cached per-host state — the Connection panel is the primary signal in that
/// case. The underlying <c>_byProbe</c> map is not modified.
/// </summary>
public IReadOnlyList<GalaxyRuntimeStatus> GetSnapshot()
{
var transportDown = _client.State != ConnectionState.Connected;
lock (_lock)
{
var result = new List<GalaxyRuntimeStatus>(_byProbe.Count);
foreach (var entry in _byProbe.Values)
result.Add(Clone(entry, forceUnknown: transportDown));
// Stable ordering by name so dashboard rows don't jitter between refreshes.
result.Sort((a, b) => string.CompareOrdinal(a.ObjectName, b.ObjectName));
return result;
}
}
/// <inheritdoc />
public void Dispose()
{
List<string> probes;
lock (_lock)
{
if (_disposed)
return;
_disposed = true;
probes = _byProbe.Keys.ToList();
_byProbe.Clear();
_probeByGobjectId.Clear();
}
foreach (var probe in probes)
{
try
{
_client.UnsubscribeAsync(probe).GetAwaiter().GetResult();
}
catch (Exception ex)
{
Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during Dispose", probe);
}
}
}
private void OnProbeValueChanged(string tagRef, Vtq vtq)
{
HandleProbeUpdate(tagRef, vtq);
}
private GalaxyRuntimeStatus MakeInitialStatus(GalaxyObjectInfo obj, string kind)
{
return new GalaxyRuntimeStatus
{
ObjectName = obj.TagName,
GobjectId = obj.GobjectId,
Kind = kind,
State = GalaxyRuntimeState.Unknown,
LastStateChangeTime = _clock()
};
}
private static GalaxyRuntimeStatus Clone(GalaxyRuntimeStatus src, bool forceUnknown)
{
return new GalaxyRuntimeStatus
{
ObjectName = src.ObjectName,
GobjectId = src.GobjectId,
Kind = src.Kind,
State = forceUnknown ? GalaxyRuntimeState.Unknown : src.State,
LastStateCallbackTime = src.LastStateCallbackTime,
LastStateChangeTime = src.LastStateChangeTime,
LastScanState = src.LastScanState,
LastError = forceUnknown ? null : src.LastError,
GoodUpdateCount = src.GoodUpdateCount,
FailureCount = src.FailureCount
};
}
private static string BuildErrorDetail(Vtq vtq)
{
if (vtq.Quality.IsBad())
return $"bad quality ({vtq.Quality})";
if (vtq.Quality.IsUncertain())
return $"uncertain quality ({vtq.Quality})";
if (vtq.Value is bool b && !b)
return "ScanState = false (OffScan)";
return $"unexpected value: {vtq.Value ?? "(null)"}";
}
}
}

View File

@@ -10,6 +10,7 @@ using Serilog;
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
using ZB.MOM.WW.LmxOpcUa.Host.Historian;
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
using ZB.MOM.WW.LmxOpcUa.Host.MxAccess;
namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
{
@@ -32,6 +33,19 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
private readonly AlarmObjectFilter? _alarmObjectFilter;
private int _alarmFilterIncludedObjectCount;
private readonly bool _anonymousCanWrite;
// Host → list of OPC UA variable nodes transitively hosted by that host. Populated during
// BuildAddressSpace by walking each variable's owning object's hosted_by_gobject_id chain
// up to the nearest $WinPlatform or $AppEngine. A variable that lives under a nested host
// (e.g. a user object under an Engine under a Platform) appears in BOTH the Engine's and
// the Platform's list. Used by MarkHostVariablesBadQuality / ClearHostVariablesBadQuality
// when the galaxy runtime probe reports a host transition.
private readonly Dictionary<int, List<BaseDataVariableState>> _hostedVariables =
new Dictionary<int, List<BaseDataVariableState>>();
// Runtime status probe manager — null when MxAccessConfiguration.RuntimeStatusProbesEnabled
// is false. Built at construction time and synced to the hierarchy on every BuildAddressSpace.
private readonly GalaxyRuntimeProbeManager? _galaxyRuntimeProbeManager;
private readonly AutoResetEvent _dataChangeSignal = new(false);
private readonly Dictionary<int, List<string>> _gobjectToTagRefs = new();
private readonly HistoryContinuationPointManager _historyContinuations = new();
@@ -106,7 +120,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
NodeId? writeTuneRoleId = null,
NodeId? writeConfigureRoleId = null,
NodeId? alarmAckRoleId = null,
AlarmObjectFilter? alarmObjectFilter = null)
AlarmObjectFilter? alarmObjectFilter = null,
bool runtimeStatusProbesEnabled = false,
int runtimeStatusUnknownTimeoutSeconds = 15)
: base(server, configuration, namespaceUri)
{
_namespaceUri = namespaceUri;
@@ -121,6 +137,15 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_writeConfigureRoleId = writeConfigureRoleId;
_alarmAckRoleId = alarmAckRoleId;
if (runtimeStatusProbesEnabled)
{
_galaxyRuntimeProbeManager = new GalaxyRuntimeProbeManager(
_mxAccessClient,
runtimeStatusUnknownTimeoutSeconds,
MarkHostVariablesBadQuality,
ClearHostVariablesBadQuality);
}
// Wire up data change delivery
_mxAccessClient.OnTagValueChanged += OnMxAccessDataChange;
@@ -190,6 +215,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
public IReadOnlyList<string> AlarmFilterPatterns =>
_alarmObjectFilter?.RawPatterns ?? Array.Empty<string>();
/// <summary>
/// Gets a snapshot of the runtime host states (Platforms + AppEngines). Returns an empty
/// list when runtime status probing is disabled. The snapshot respects MxAccess transport
/// state — when the client is disconnected, every entry is returned as
/// <see cref="GalaxyRuntimeState.Unknown"/>.
/// </summary>
public IReadOnlyList<GalaxyRuntimeStatus> RuntimeStatuses =>
_galaxyRuntimeProbeManager?.GetSnapshot() ?? (IReadOnlyList<GalaxyRuntimeStatus>)Array.Empty<GalaxyRuntimeStatus>();
/// <summary>
/// Gets the number of bridge-owned runtime status probe subscriptions. Surfaced on the
/// dashboard Subscriptions panel to distinguish probe overhead from client subscriptions.
/// </summary>
public int ActiveRuntimeProbeCount => _galaxyRuntimeProbeManager?.ActiveProbeCount ?? 0;
/// <summary>
/// Gets the runtime historian health snapshot, or <see langword="null"/> when the historian
/// plugin is not loaded. Surfaced on the status dashboard so operators can detect query
@@ -261,6 +301,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_alarmDescTags.Clear();
_nodeMap.Clear();
_gobjectToTagRefs.Clear();
_hostedVariables.Clear();
VariableNodeCount = 0;
ObjectNodeCount = 0;
@@ -464,12 +505,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
if (_alarmTrackingEnabled)
SubscribeAlarmTags();
BuildHostedVariablesMap(hierarchy);
// Sync the galaxy runtime probe set against the rebuilt hierarchy. This runs
// synchronously on the calling thread and issues AdviseSupervisory per host —
// expected 500ms-1s additional startup latency for a large multi-host galaxy.
_galaxyRuntimeProbeManager?.SyncAsync(hierarchy).GetAwaiter().GetResult();
_lastHierarchy = new List<GalaxyObjectInfo>(hierarchy);
_lastAttributes = new List<GalaxyAttributeInfo>(attributes);
Log.Information(
"Address space built: {Objects} objects, {Variables} variables, {Mappings} tag references, {Alarms} alarm tags",
ObjectNodeCount, VariableNodeCount, _nodeIdToTagReference.Count, _alarmInAlarmTags.Count);
"Address space built: {Objects} objects, {Variables} variables, {Mappings} tag references, {Alarms} alarm tags, {Hosts} runtime hosts",
ObjectNodeCount, VariableNodeCount, _nodeIdToTagReference.Count, _alarmInAlarmTags.Count,
_hostedVariables.Count);
}
}
@@ -499,6 +548,120 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
return includedIds;
}
/// <summary>
/// Builds the <c>_hostedVariables</c> dictionary from the completed address space. For each
/// Galaxy object, walks its <c>HostedByGobjectId</c> chain up to the nearest <c>$WinPlatform</c>
/// or <c>$AppEngine</c> and appends every variable the object owns to that host's list. An
/// object under an Engine under a Platform appears in BOTH lists so stopping the Platform
/// invalidates every descendant Engine's variables as well.
/// </summary>
private void BuildHostedVariablesMap(List<GalaxyObjectInfo> hierarchy)
{
_hostedVariables.Clear();
if (hierarchy == null || hierarchy.Count == 0)
return;
var byId = new Dictionary<int, GalaxyObjectInfo>(hierarchy.Count);
foreach (var obj in hierarchy)
byId[obj.GobjectId] = obj;
foreach (var obj in hierarchy)
{
if (!_gobjectToTagRefs.TryGetValue(obj.GobjectId, out var tagRefs) || tagRefs.Count == 0)
continue;
// Collect every variable node owned by this object from the tag→variable map.
var ownedVariables = new List<BaseDataVariableState>(tagRefs.Count);
foreach (var tagRef in tagRefs)
if (_tagToVariableNode.TryGetValue(tagRef, out var v))
ownedVariables.Add(v);
if (ownedVariables.Count == 0)
continue;
// Walk HostedByGobjectId up the chain, appending to every Platform/Engine encountered.
// Visited set defends against cycles in misconfigured galaxies.
var visited = new HashSet<int>();
var cursor = obj;
var depth = 0;
while (cursor != null && depth < 32 && visited.Add(cursor.GobjectId))
{
if (cursor.CategoryId == 1 || cursor.CategoryId == 3)
{
if (!_hostedVariables.TryGetValue(cursor.GobjectId, out var list))
{
list = new List<BaseDataVariableState>();
_hostedVariables[cursor.GobjectId] = list;
}
list.AddRange(ownedVariables);
}
if (cursor.HostedByGobjectId == 0 ||
!byId.TryGetValue(cursor.HostedByGobjectId, out var next))
break;
cursor = next;
depth++;
}
}
}
/// <summary>
/// Flips every OPC UA variable hosted by the given Galaxy runtime object (Platform or
/// AppEngine) to <see cref="StatusCodes.BadOutOfService"/>. Invoked by the runtime probe
/// manager's Running → Stopped callback. Safe to call with an unknown gobject id — no-op.
/// </summary>
/// <param name="gobjectId">The runtime host's gobject_id.</param>
public void MarkHostVariablesBadQuality(int gobjectId)
{
List<BaseDataVariableState>? variables;
lock (Lock)
{
if (!_hostedVariables.TryGetValue(gobjectId, out variables))
return;
var now = DateTime.UtcNow;
foreach (var variable in variables)
{
variable.StatusCode = StatusCodes.BadOutOfService;
variable.Timestamp = now;
variable.ClearChangeMasks(SystemContext, false);
}
}
Log.Information(
"Marked {Count} variable(s) BadOutOfService for stopped host gobject_id={GobjectId}",
variables.Count, gobjectId);
}
/// <summary>
/// Resets every OPC UA variable hosted by the given Galaxy runtime object to
/// <see cref="StatusCodes.Good"/>. Invoked by the runtime probe manager's Stopped → Running
/// callback. Values are left as-is; subsequent MxAccess on-change updates will refresh them
/// as tags change naturally.
/// </summary>
/// <param name="gobjectId">The runtime host's gobject_id.</param>
public void ClearHostVariablesBadQuality(int gobjectId)
{
List<BaseDataVariableState>? variables;
lock (Lock)
{
if (!_hostedVariables.TryGetValue(gobjectId, out variables))
return;
var now = DateTime.UtcNow;
foreach (var variable in variables)
{
variable.StatusCode = StatusCodes.Good;
variable.Timestamp = now;
variable.ClearChangeMasks(SystemContext, false);
}
}
Log.Information(
"Cleared bad-quality override on {Count} variable(s) for recovered host gobject_id={GobjectId}",
variables.Count, gobjectId);
}
private void SubscribeAlarmTags()
{
foreach (var kvp in _alarmInAlarmTags)
@@ -2116,6 +2279,14 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
if (_dispatchDisposed)
return;
// Runtime status probes are bridge-owned subscriptions whose only job is to drive the
// host state machine; they are NOT in _tagToVariableNode, so the normal dispatch path
// would drop them anyway. Route probe addresses directly to the probe manager and skip
// the dispatch queue entirely.
if (_galaxyRuntimeProbeManager != null
&& _galaxyRuntimeProbeManager.HandleProbeUpdate(address, vtq))
return;
Interlocked.Increment(ref _totalMxChangeEvents);
_pendingDataChanges[address] = vtq;
try
@@ -2162,6 +2333,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
if (!_dispatchRunning)
break;
// Drive time-based probe state transitions on every dispatch tick. The dispatch
// loop already wakes every 100ms via the WaitOne timeout, so this gives us a
// ~10Hz cadence for the Unknown → Stopped timeout without introducing a new
// thread or timer. No-op when the probe manager is disabled.
_galaxyRuntimeProbeManager?.Tick();
var keys = _pendingDataChanges.Keys.ToList();
if (keys.Count == 0)
{
@@ -2376,6 +2553,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
{
_dispatchDisposed = true;
_mxAccessClient.OnTagValueChanged -= OnMxAccessDataChange;
// Dispose the runtime probe manager before the MxAccess client teardown so its
// Unadvise calls reach a live client. Disposing the node manager normally runs
// BEFORE the node manager's containing OpcUaServerHost releases the MxAccess
// client, so the probes close cleanly.
_galaxyRuntimeProbeManager?.Dispose();
StopDispatchThread();
_dataChangeSignal.Dispose();
}

View File

@@ -37,11 +37,16 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
private NodeId? _writeOperateRoleId;
private NodeId? _writeTuneRoleId;
private readonly bool _runtimeStatusProbesEnabled;
private readonly int _runtimeStatusUnknownTimeoutSeconds;
public LmxOpcUaServer(string galaxyName, IMxAccessClient mxAccessClient, PerformanceMetrics metrics,
IHistorianDataSource? historianDataSource = null, bool alarmTrackingEnabled = false,
AuthenticationConfiguration? authConfig = null, IUserAuthenticationProvider? authProvider = null,
RedundancyConfiguration? redundancyConfig = null, string? applicationUri = null,
AlarmObjectFilter? alarmObjectFilter = null)
AlarmObjectFilter? alarmObjectFilter = null,
bool runtimeStatusProbesEnabled = false,
int runtimeStatusUnknownTimeoutSeconds = 15)
{
_galaxyName = galaxyName;
_mxAccessClient = mxAccessClient;
@@ -53,6 +58,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_authProvider = authProvider;
_redundancyConfig = redundancyConfig ?? new RedundancyConfiguration();
_applicationUri = applicationUri;
_runtimeStatusProbesEnabled = runtimeStatusProbesEnabled;
_runtimeStatusUnknownTimeoutSeconds = runtimeStatusUnknownTimeoutSeconds;
}
/// <summary>
@@ -89,7 +96,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
NodeManager = new LmxNodeManager(server, configuration, namespaceUri, _mxAccessClient, _metrics,
_historianDataSource, _alarmTrackingEnabled, _authConfig.AnonymousCanWrite,
_writeOperateRoleId, _writeTuneRoleId, _writeConfigureRoleId, _alarmAckRoleId,
_alarmObjectFilter);
_alarmObjectFilter,
_runtimeStatusProbesEnabled, _runtimeStatusUnknownTimeoutSeconds);
var nodeManagers = new List<INodeManager> { NodeManager };
return new MasterNodeManager(server, configuration, null, nodeManagers.ToArray());

View File

@@ -45,7 +45,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
IUserAuthenticationProvider? authProvider = null,
SecurityProfileConfiguration? securityConfig = null,
RedundancyConfiguration? redundancyConfig = null,
AlarmObjectFilter? alarmObjectFilter = null)
AlarmObjectFilter? alarmObjectFilter = null,
MxAccessConfiguration? mxAccessConfig = null)
{
_config = config;
_mxAccessClient = mxAccessClient;
@@ -56,8 +57,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_securityConfig = securityConfig ?? new SecurityProfileConfiguration();
_redundancyConfig = redundancyConfig ?? new RedundancyConfiguration();
_alarmObjectFilter = alarmObjectFilter;
_mxAccessConfig = mxAccessConfig ?? new MxAccessConfiguration();
}
private readonly MxAccessConfiguration _mxAccessConfig;
/// <summary>
/// Gets the active node manager that holds the published Galaxy namespace.
/// </summary>
@@ -239,7 +243,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_server = new LmxOpcUaServer(_config.GalaxyName, _mxAccessClient, _metrics, _historianDataSource,
_config.AlarmTrackingEnabled, _authConfig, _authProvider, _redundancyConfig, applicationUri,
_alarmObjectFilter);
_alarmObjectFilter,
_mxAccessConfig.RuntimeStatusProbesEnabled,
_mxAccessConfig.RuntimeStatusUnknownTimeoutSeconds);
await _application.Start(_server);
Log.Information(

View File

@@ -245,7 +245,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
string.Join(", ", _config.OpcUa.AlarmFilter.ObjectFilters));
ServerHost = new OpcUaServerHost(_config.OpcUa, effectiveMxClient, Metrics, _historianDataSource,
_config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter);
_config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter,
_config.MxAccess);
// Step 9-10: Query hierarchy, start server, build address space
DateTime? initialDeployTime = null;

View File

@@ -1,3 +1,4 @@
using System.Linq;
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
@@ -21,7 +22,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
ConnectionState connectionState,
PerformanceMetrics? metrics,
HistorianStatusInfo? historian = null,
AlarmStatusInfo? alarms = null)
AlarmStatusInfo? alarms = null,
RuntimeStatusInfo? runtime = null)
{
// Rule 1: Not connected → Unhealthy
if (connectionState != ConnectionState.Connected)
@@ -98,6 +100,23 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
Color = "yellow"
};
// Rule 2e: Any Galaxy runtime host (Platform/AppEngine) is Stopped → Degraded.
// Runs after the transport check so that MxAccess-disconnected remains Unhealthy via
// Rule 1 without also firing the runtime rule — avoids a double-message when the
// transport is the root cause of every host going Unknown/Stopped.
if (runtime != null && runtime.StoppedCount > 0)
{
var stoppedNames = string.Join(", ",
runtime.Hosts.Where(h => h.State == Domain.GalaxyRuntimeState.Stopped).Select(h => h.ObjectName));
return new HealthInfo
{
Status = "Degraded",
Message =
$"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}",
Color = "yellow"
};
}
// Rule 3: All good
return new HealthInfo
{

View File

@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
namespace ZB.MOM.WW.LmxOpcUa.Host.Status
@@ -59,12 +60,49 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
/// </summary>
public EndpointsInfo Endpoints { get; set; } = new();
/// <summary>
/// Gets or sets the Galaxy runtime host state (Platforms + AppEngines).
/// </summary>
public RuntimeStatusInfo RuntimeStatus { get; set; } = new();
/// <summary>
/// Gets or sets footer details such as the snapshot timestamp and service version.
/// </summary>
public FooterInfo Footer { get; set; } = new();
}
/// <summary>
/// Dashboard model summarizing per-host Galaxy runtime state.
/// </summary>
public class RuntimeStatusInfo
{
/// <summary>
/// Gets or sets the total number of tracked runtime hosts ($WinPlatform + $AppEngine).
/// </summary>
public int Total { get; set; }
/// <summary>
/// Gets or sets the count of hosts currently reported Running.
/// </summary>
public int RunningCount { get; set; }
/// <summary>
/// Gets or sets the count of hosts currently reported Stopped.
/// </summary>
public int StoppedCount { get; set; }
/// <summary>
/// Gets or sets the count of hosts whose state is still Unknown (either awaiting initial
/// probe resolution or transported-through-disconnected).
/// </summary>
public int UnknownCount { get; set; }
/// <summary>
/// Gets or sets the per-host state in stable alphabetical order.
/// </summary>
public List<GalaxyRuntimeStatus> Hosts { get; set; } = new();
}
/// <summary>
/// Dashboard model describing the OPC UA server's listening endpoints and active security profiles.
/// </summary>
@@ -156,8 +194,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
{
/// <summary>
/// Gets or sets the number of active tag subscriptions mirrored from MXAccess into OPC UA.
/// This total includes bridge-owned runtime status probes; see <see cref="ProbeCount"/> for the
/// subset attributable to probes.
/// </summary>
public int ActiveCount { get; set; }
/// <summary>
/// Gets or sets the count of bridge-owned runtime status probes included in
/// <see cref="ActiveCount"/>. Surfaced on the dashboard so operators can distinguish probe
/// overhead from client-driven subscription load.
/// </summary>
public int ProbeCount { get; set; }
}
/// <summary>

View File

@@ -88,10 +88,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
ReconnectCount = _mxAccessClient?.ReconnectCount ?? 0,
ActiveSessions = _serverHost?.ActiveSessionCount ?? 0
},
Health = _healthCheck.CheckHealth(connectionState, _metrics, historianInfo, alarmInfo),
Health = _healthCheck.CheckHealth(connectionState, _metrics, historianInfo, alarmInfo, BuildRuntimeStatusInfo()),
Subscriptions = new SubscriptionInfo
{
ActiveCount = _mxAccessClient?.ActiveSubscriptionCount ?? 0
ActiveCount = _mxAccessClient?.ActiveSubscriptionCount ?? 0,
ProbeCount = _nodeManager?.ActiveRuntimeProbeCount ?? 0
},
Galaxy = new GalaxyInfo
{
@@ -114,6 +115,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
Alarms = alarmInfo,
Redundancy = BuildRedundancyInfo(),
Endpoints = BuildEndpointsInfo(),
RuntimeStatus = BuildRuntimeStatusInfo(),
Footer = new FooterInfo
{
Timestamp = DateTime.UtcNow,
@@ -192,6 +194,26 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
return info;
}
private RuntimeStatusInfo BuildRuntimeStatusInfo()
{
var hosts = _nodeManager?.RuntimeStatuses?.ToList() ?? new List<GalaxyRuntimeStatus>();
var info = new RuntimeStatusInfo
{
Total = hosts.Count,
Hosts = hosts
};
foreach (var host in hosts)
{
switch (host.State)
{
case GalaxyRuntimeState.Running: info.RunningCount++; break;
case GalaxyRuntimeState.Stopped: info.StoppedCount++; break;
default: info.UnknownCount++; break;
}
}
return info;
}
private RedundancyInfo? BuildRedundancyInfo()
{
if (_redundancyConfig == null || !_redundancyConfig.Enabled)
@@ -300,7 +322,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
// Subscriptions panel
sb.AppendLine("<div class='panel gray'><h2>Subscriptions</h2>");
sb.AppendLine($"<p>Active: {data.Subscriptions.ActiveCount}</p>");
sb.AppendLine($"<p>Active: <b>{data.Subscriptions.ActiveCount}</b></p>");
if (data.Subscriptions.ProbeCount > 0)
sb.AppendLine(
$"<p>Probes: {data.Subscriptions.ProbeCount} (bridge-owned runtime status)</p>");
sb.AppendLine("</div>");
// Data Change Dispatch panel
@@ -318,6 +343,32 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
sb.AppendLine($"<p>Last Rebuild: {data.Galaxy.LastRebuildTime:O}</p>");
sb.AppendLine("</div>");
// Galaxy Runtime panel — per-host Platform + AppEngine state
if (data.RuntimeStatus.Total > 0)
{
var rtColor = data.RuntimeStatus.StoppedCount > 0 ? "red"
: data.RuntimeStatus.UnknownCount > 0 ? "yellow"
: "green";
sb.AppendLine($"<div class='panel {rtColor}'><h2>Galaxy Runtime</h2>");
sb.AppendLine(
$"<p>{data.RuntimeStatus.RunningCount} of {data.RuntimeStatus.Total} hosts running" +
$" ({data.RuntimeStatus.StoppedCount} stopped, {data.RuntimeStatus.UnknownCount} unknown)</p>");
sb.AppendLine("<table><tr><th>Name</th><th>Kind</th><th>State</th><th>Since</th><th>Last Error</th></tr>");
foreach (var host in data.RuntimeStatus.Hosts)
{
var since = host.LastStateChangeTime?.ToString("O") ?? "-";
var err = WebUtility.HtmlEncode(host.LastError ?? "");
sb.AppendLine(
$"<tr><td>{WebUtility.HtmlEncode(host.ObjectName)}</td>" +
$"<td>{WebUtility.HtmlEncode(host.Kind)}</td>" +
$"<td>{host.State}</td>" +
$"<td>{since}</td>" +
$"<td><code>{err}</code></td></tr>");
}
sb.AppendLine("</table>");
sb.AppendLine("</div>");
}
// Historian panel
var anyClusterNodeFailed =
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount < data.Historian.NodeCount;

View File

@@ -23,7 +23,9 @@
"MonitorIntervalSeconds": 5,
"AutoReconnect": true,
"ProbeTag": null,
"ProbeStaleThresholdSeconds": 60
"ProbeStaleThresholdSeconds": 60,
"RuntimeStatusProbesEnabled": true,
"RuntimeStatusUnknownTimeoutSeconds": 15
},
"GalaxyRepository": {
"ConnectionString": "Server=localhost;Database=ZB;Integrated Security=true;",