Track Galaxy Platform and AppEngine runtime state via ScanState probes and proactively invalidate descendant variable quality on Stopped transitions so operators can detect a stopped runtime host before downstream clients read stale data and so the bridge delivers a uniform bad-quality signal instead of relying on MxAccess per-tag fan-out
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -73,6 +73,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
"MxAccess.MonitorInterval={MonitorInterval}s, AutoReconnect={AutoReconnect}, ProbeTag={ProbeTag}, ProbeStaleThreshold={ProbeStale}s",
|
||||
config.MxAccess.MonitorIntervalSeconds, config.MxAccess.AutoReconnect,
|
||||
config.MxAccess.ProbeTag ?? "(none)", config.MxAccess.ProbeStaleThresholdSeconds);
|
||||
Log.Information(
|
||||
"MxAccess.RuntimeStatusProbesEnabled={Enabled}, RuntimeStatusUnknownTimeoutSeconds={Timeout}s",
|
||||
config.MxAccess.RuntimeStatusProbesEnabled, config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(config.MxAccess.ClientName))
|
||||
{
|
||||
@@ -80,6 +83,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
valid = false;
|
||||
}
|
||||
|
||||
if (config.MxAccess.RuntimeStatusUnknownTimeoutSeconds < 5)
|
||||
Log.Warning(
|
||||
"MxAccess.RuntimeStatusUnknownTimeoutSeconds={Timeout} is below the recommended floor of 5s; initial probe resolution may time out before MxAccess has delivered the first callback",
|
||||
config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
|
||||
|
||||
// Galaxy Repository
|
||||
Log.Information(
|
||||
"GalaxyRepository.ConnectionString={ConnectionString}, ChangeDetectionInterval={ChangeInterval}s, CommandTimeout={CmdTimeout}s, ExtendedAttributes={ExtendedAttributes}",
|
||||
|
||||
@@ -55,5 +55,22 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
/// Gets or sets the number of seconds a probe value may remain unchanged before the connection is considered stale.
|
||||
/// </summary>
|
||||
public int ProbeStaleThresholdSeconds { get; set; } = 60;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets a value indicating whether the bridge advises <c><ObjectName>.ScanState</c> for every
|
||||
/// deployed <c>$WinPlatform</c> and <c>$AppEngine</c>, reporting per-host runtime state on the status
|
||||
/// dashboard and proactively invalidating OPC UA variable quality when a host transitions to Stopped.
|
||||
/// Enabled by default. Disable to return to legacy behavior where host runtime state is invisible and
|
||||
/// MxAccess's per-tag bad-quality fan-out is the only stop signal.
|
||||
/// </summary>
|
||||
public bool RuntimeStatusProbesEnabled { get; set; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the maximum seconds to wait for the initial probe callback before marking a host as
|
||||
/// Stopped. Only applies to the Unknown → Stopped transition. Because <c>ScanState</c> is delivered
|
||||
/// on-change only, a stably Running host does not time out — no starvation check runs on Running
|
||||
/// entries. Default 15s.
|
||||
/// </summary>
|
||||
public int RuntimeStatusUnknownTimeoutSeconds { get; set; } = 15;
|
||||
}
|
||||
}
|
||||
@@ -44,5 +44,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Domain
|
||||
/// <see cref="AlarmObjectFilter"/> to decide whether an object's alarms should be monitored.
|
||||
/// </summary>
|
||||
public List<string> TemplateChain { get; set; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the Galaxy template category id for this object. Category 1 is $WinPlatform,
|
||||
/// 3 is $AppEngine, 13 is $Area, 10 is $UserDefined, and so on. Populated from
|
||||
/// <c>template_definition.category_id</c> by <c>hierarchy.sql</c> and consumed by the runtime
|
||||
/// status probe manager to identify hosts that should receive a <c>ScanState</c> probe.
|
||||
/// </summary>
|
||||
public int CategoryId { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the Galaxy object id of this object's runtime host, populated from
|
||||
/// <c>gobject.hosted_by_gobject_id</c>. Walk this chain upward to find the nearest
|
||||
/// <c>$WinPlatform</c> or <c>$AppEngine</c> ancestor for subtree quality invalidation when
|
||||
/// a runtime host is reported Stopped. Zero for root objects that have no host.
|
||||
/// </summary>
|
||||
public int HostedByGobjectId { get; set; }
|
||||
}
|
||||
}
|
||||
29
src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeState.cs
Normal file
29
src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeState.cs
Normal file
@@ -0,0 +1,29 @@
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.Domain
|
||||
{
|
||||
/// <summary>
|
||||
/// Runtime state of a deployed Galaxy runtime host ($WinPlatform or $AppEngine) as
|
||||
/// observed by the bridge via its <c>ScanState</c> probe.
|
||||
/// </summary>
|
||||
public enum GalaxyRuntimeState
|
||||
{
|
||||
/// <summary>
|
||||
/// Probe advised but no callback received yet. Transitions to <see cref="Running"/>
|
||||
/// on the first successful <c>ScanState = true</c> callback, or to <see cref="Stopped"/>
|
||||
/// once the unknown-resolution timeout elapses.
|
||||
/// </summary>
|
||||
Unknown,
|
||||
|
||||
/// <summary>
|
||||
/// Last probe callback reported <c>ScanState = true</c> with a successful item status.
|
||||
/// The host is on scan and executing.
|
||||
/// </summary>
|
||||
Running,
|
||||
|
||||
/// <summary>
|
||||
/// Last probe callback reported <c>ScanState != true</c>, or a failed item status, or
|
||||
/// the initial probe never resolved before the unknown timeout elapsed. The host is
|
||||
/// off scan or unreachable.
|
||||
/// </summary>
|
||||
Stopped
|
||||
}
|
||||
}
|
||||
72
src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeStatus.cs
Normal file
72
src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeStatus.cs
Normal file
@@ -0,0 +1,72 @@
|
||||
using System;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.Domain
|
||||
{
|
||||
/// <summary>
|
||||
/// Point-in-time runtime state of a single Galaxy runtime host ($WinPlatform or $AppEngine)
|
||||
/// as tracked by the <c>GalaxyRuntimeProbeManager</c>. Surfaced on the status dashboard and
|
||||
/// consumed by <c>HealthCheckService</c> so operators can detect a stopped host before
|
||||
/// downstream clients notice the stale data.
|
||||
/// </summary>
|
||||
public sealed class GalaxyRuntimeStatus
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets or sets the Galaxy tag_name of the host (e.g., <c>DevPlatform</c> or
|
||||
/// <c>DevAppEngine</c>).
|
||||
/// </summary>
|
||||
public string ObjectName { get; set; } = "";
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the Galaxy gobject_id of the host.
|
||||
/// </summary>
|
||||
public int GobjectId { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the Galaxy template category name — <c>$WinPlatform</c> or
|
||||
/// <c>$AppEngine</c>. Used by the dashboard to group hosts by kind.
|
||||
/// </summary>
|
||||
public string Kind { get; set; } = "";
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the current runtime state.
|
||||
/// </summary>
|
||||
public GalaxyRuntimeState State { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the UTC timestamp of the most recent probe callback, whether it
|
||||
/// reported success or failure. <see langword="null"/> before the first callback.
|
||||
/// </summary>
|
||||
public DateTime? LastStateCallbackTime { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the UTC timestamp of the most recent <see cref="State"/> transition.
|
||||
/// Backs the dashboard "Since" column. <see langword="null"/> in the initial Unknown
|
||||
/// state before any transition.
|
||||
/// </summary>
|
||||
public DateTime? LastStateChangeTime { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the last <c>ScanState</c> value received from the probe, or
|
||||
/// <see langword="null"/> before the first update or when the last callback carried
|
||||
/// a non-success item status (no value delivered).
|
||||
/// </summary>
|
||||
public bool? LastScanState { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the detail message from the most recent failure callback, cleared on
|
||||
/// the next successful <c>ScanState = true</c> delivery.
|
||||
/// </summary>
|
||||
public string? LastError { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the cumulative number of callbacks where <c>ScanState = true</c>.
|
||||
/// </summary>
|
||||
public long GoodUpdateCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the cumulative number of callbacks where <c>ScanState != true</c>
|
||||
/// or the item status reported failure.
|
||||
/// </summary>
|
||||
public long FailureCount { get; set; }
|
||||
}
|
||||
}
|
||||
@@ -50,7 +50,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.GalaxyRepository
|
||||
|
||||
while (await reader.ReadAsync(ct))
|
||||
{
|
||||
var templateChainRaw = reader.IsDBNull(6) ? "" : reader.GetString(6);
|
||||
var templateChainRaw = reader.IsDBNull(8) ? "" : reader.GetString(8);
|
||||
var templateChain = string.IsNullOrEmpty(templateChainRaw)
|
||||
? new List<string>()
|
||||
: templateChainRaw.Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries)
|
||||
@@ -66,6 +66,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.GalaxyRepository
|
||||
BrowseName = reader.GetString(3),
|
||||
ParentGobjectId = Convert.ToInt32(reader.GetValue(4)),
|
||||
IsArea = Convert.ToInt32(reader.GetValue(5)) == 1,
|
||||
CategoryId = Convert.ToInt32(reader.GetValue(6)),
|
||||
HostedByGobjectId = Convert.ToInt32(reader.GetValue(7)),
|
||||
TemplateChain = templateChain
|
||||
});
|
||||
}
|
||||
@@ -234,6 +236,8 @@ SELECT DISTINCT
|
||||
THEN 1
|
||||
ELSE 0
|
||||
END AS is_area,
|
||||
td.category_id AS category_id,
|
||||
g.hosted_by_gobject_id AS hosted_by_gobject_id,
|
||||
ISNULL(
|
||||
STUFF((
|
||||
SELECT '|' + tc.template_tag_name
|
||||
|
||||
@@ -0,0 +1,404 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using Serilog;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
|
||||
{
|
||||
/// <summary>
|
||||
/// Advises <c><ObjectName>.ScanState</c> on every deployed <c>$WinPlatform</c> and
|
||||
/// <c>$AppEngine</c>, tracks their runtime state (Unknown / Running / Stopped), and notifies
|
||||
/// the owning node manager on Running↔Stopped transitions so it can proactively flip every
|
||||
/// OPC UA variable hosted by that object to <c>BadOutOfService</c> (and clear on recovery).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// State machine semantics are documented in <c>runtimestatus.md</c>. Key facts:
|
||||
/// <list type="bullet">
|
||||
/// <item><c>ScanState</c> is delivered on-change only — no periodic heartbeat. A stably
|
||||
/// Running host may go hours without a callback.</item>
|
||||
/// <item>Running → Stopped is driven by explicit error callbacks or <c>ScanState = false</c>,
|
||||
/// NEVER by starvation. The only starvation check applies to the initial Unknown state.</item>
|
||||
/// <item>When the MxAccess transport is disconnected, <see cref="GetSnapshot"/> returns every
|
||||
/// entry with <see cref="GalaxyRuntimeState.Unknown"/> regardless of the underlying state,
|
||||
/// because we can't observe anything through a dead transport.</item>
|
||||
/// <item>The stop/start callbacks fire synchronously from whichever thread delivered the
|
||||
/// probe update. The manager releases its own lock before invoking them to avoid
|
||||
/// lock-inversion deadlocks with the node manager's <c>Lock</c>.</item>
|
||||
/// </list>
|
||||
/// </remarks>
|
||||
public sealed class GalaxyRuntimeProbeManager : IDisposable
|
||||
{
|
||||
private static readonly ILogger Log = Serilog.Log.ForContext<GalaxyRuntimeProbeManager>();
|
||||
|
||||
private const int CategoryWinPlatform = 1;
|
||||
private const int CategoryAppEngine = 3;
|
||||
private const string KindWinPlatform = "$WinPlatform";
|
||||
private const string KindAppEngine = "$AppEngine";
|
||||
private const string ProbeAttribute = ".ScanState";
|
||||
|
||||
private readonly IMxAccessClient _client;
|
||||
private readonly TimeSpan _unknownTimeout;
|
||||
private readonly Action<int>? _onHostStopped;
|
||||
private readonly Action<int>? _onHostRunning;
|
||||
private readonly Func<DateTime> _clock;
|
||||
|
||||
// Key: probe tag reference (e.g. "DevAppEngine.ScanState").
|
||||
// Value: the current runtime status for that host, kept in sync on every probe callback
|
||||
// and queried via GetSnapshot for dashboard rendering.
|
||||
private readonly Dictionary<string, GalaxyRuntimeStatus> _byProbe =
|
||||
new Dictionary<string, GalaxyRuntimeStatus>(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
// Reverse index: gobject_id -> probe tag, so Sync() can diff new/removed hosts efficiently.
|
||||
private readonly Dictionary<int, string> _probeByGobjectId = new Dictionary<int, string>();
|
||||
|
||||
private readonly object _lock = new object();
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new probe manager. <paramref name="onHostStopped"/> and
|
||||
/// <paramref name="onHostRunning"/> are invoked synchronously on Running↔Stopped
|
||||
/// transitions so the owning node manager can invalidate / restore the hosted subtree.
|
||||
/// </summary>
|
||||
public GalaxyRuntimeProbeManager(
|
||||
IMxAccessClient client,
|
||||
int unknownTimeoutSeconds,
|
||||
Action<int>? onHostStopped = null,
|
||||
Action<int>? onHostRunning = null)
|
||||
: this(client, unknownTimeoutSeconds, onHostStopped, onHostRunning, () => DateTime.UtcNow)
|
||||
{
|
||||
}
|
||||
|
||||
internal GalaxyRuntimeProbeManager(
|
||||
IMxAccessClient client,
|
||||
int unknownTimeoutSeconds,
|
||||
Action<int>? onHostStopped,
|
||||
Action<int>? onHostRunning,
|
||||
Func<DateTime> clock)
|
||||
{
|
||||
_client = client ?? throw new ArgumentNullException(nameof(client));
|
||||
_unknownTimeout = TimeSpan.FromSeconds(Math.Max(1, unknownTimeoutSeconds));
|
||||
_onHostStopped = onHostStopped;
|
||||
_onHostRunning = onHostRunning;
|
||||
_clock = clock ?? throw new ArgumentNullException(nameof(clock));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of active probe subscriptions. Surfaced on the dashboard Subscriptions
|
||||
/// panel so operators can see bridge-owned probe count separately from the total.
|
||||
/// </summary>
|
||||
public int ActiveProbeCount
|
||||
{
|
||||
get
|
||||
{
|
||||
lock (_lock)
|
||||
return _byProbe.Count;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Diffs the supplied hierarchy against the active probe set, advising new hosts and
|
||||
/// unadvising removed ones. The hierarchy is filtered to runtime host categories
|
||||
/// ($WinPlatform, $AppEngine) — non-host rows are ignored. Idempotent: a second call
|
||||
/// with the same hierarchy performs no Advise / Unadvise work.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Sync is synchronous on MxAccess: <see cref="IMxAccessClient.SubscribeAsync"/> is
|
||||
/// awaited for each new host, so for a galaxy with N runtime hosts the call blocks for
|
||||
/// ~N round-trips. This is acceptable because it only runs during address-space build
|
||||
/// and rebuild, not on the hot path.
|
||||
/// </remarks>
|
||||
public async Task SyncAsync(IReadOnlyList<GalaxyObjectInfo> hierarchy)
|
||||
{
|
||||
if (_disposed || hierarchy == null)
|
||||
return;
|
||||
|
||||
// Filter to runtime hosts and project to the expected probe tag name.
|
||||
var desired = new Dictionary<int, (string Probe, string Kind, GalaxyObjectInfo Obj)>();
|
||||
foreach (var obj in hierarchy)
|
||||
{
|
||||
if (obj.CategoryId != CategoryWinPlatform && obj.CategoryId != CategoryAppEngine)
|
||||
continue;
|
||||
if (string.IsNullOrWhiteSpace(obj.TagName))
|
||||
continue;
|
||||
var probe = obj.TagName + ProbeAttribute;
|
||||
var kind = obj.CategoryId == CategoryWinPlatform ? KindWinPlatform : KindAppEngine;
|
||||
desired[obj.GobjectId] = (probe, kind, obj);
|
||||
}
|
||||
|
||||
// Compute diffs under lock, release lock before issuing SDK calls (which can block).
|
||||
List<string> toSubscribe;
|
||||
List<string> toUnsubscribe;
|
||||
lock (_lock)
|
||||
{
|
||||
toSubscribe = new List<string>();
|
||||
toUnsubscribe = new List<string>();
|
||||
|
||||
foreach (var kvp in desired)
|
||||
{
|
||||
if (_probeByGobjectId.TryGetValue(kvp.Key, out var existingProbe))
|
||||
{
|
||||
// Already tracked: ensure the status entry is aligned (tag rename path is
|
||||
// intentionally not supported — if the probe changed, treat it as remove+add).
|
||||
if (!string.Equals(existingProbe, kvp.Value.Probe, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
toUnsubscribe.Add(existingProbe);
|
||||
_byProbe.Remove(existingProbe);
|
||||
_probeByGobjectId.Remove(kvp.Key);
|
||||
|
||||
toSubscribe.Add(kvp.Value.Probe);
|
||||
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
|
||||
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
toSubscribe.Add(kvp.Value.Probe);
|
||||
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
|
||||
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
|
||||
}
|
||||
}
|
||||
|
||||
// Remove hosts that are no longer in the desired set.
|
||||
var toRemove = _probeByGobjectId.Keys.Where(id => !desired.ContainsKey(id)).ToList();
|
||||
foreach (var id in toRemove)
|
||||
{
|
||||
var probe = _probeByGobjectId[id];
|
||||
toUnsubscribe.Add(probe);
|
||||
_byProbe.Remove(probe);
|
||||
_probeByGobjectId.Remove(id);
|
||||
}
|
||||
}
|
||||
|
||||
// Apply the diff outside the lock.
|
||||
foreach (var probe in toSubscribe)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _client.SubscribeAsync(probe, OnProbeValueChanged);
|
||||
Log.Information("Galaxy runtime probe advised: {Probe}", probe);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "Failed to advise galaxy runtime probe {Probe}", probe);
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var probe in toUnsubscribe)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _client.UnsubscribeAsync(probe);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during sync", probe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Routes an <c>OnTagValueChanged</c> callback to the probe state machine. Returns
|
||||
/// <see langword="true"/> when <paramref name="tagRef"/> matches a bridge-owned probe
|
||||
/// (in which case the owning node manager should skip its normal variable-update path).
|
||||
/// </summary>
|
||||
public bool HandleProbeUpdate(string tagRef, Vtq vtq)
|
||||
{
|
||||
if (_disposed || string.IsNullOrEmpty(tagRef))
|
||||
return false;
|
||||
|
||||
GalaxyRuntimeStatus? status;
|
||||
int fromToGobjectId = 0;
|
||||
GalaxyRuntimeState? transitionTo = null;
|
||||
|
||||
lock (_lock)
|
||||
{
|
||||
if (!_byProbe.TryGetValue(tagRef, out status))
|
||||
return false; // not a probe — let the caller handle it normally
|
||||
|
||||
var now = _clock();
|
||||
var isRunning = vtq.Quality.IsGood() && vtq.Value is bool b && b;
|
||||
status.LastStateCallbackTime = now;
|
||||
status.LastScanState = vtq.Value as bool?;
|
||||
|
||||
if (isRunning)
|
||||
{
|
||||
status.GoodUpdateCount++;
|
||||
status.LastError = null;
|
||||
if (status.State != GalaxyRuntimeState.Running)
|
||||
{
|
||||
status.State = GalaxyRuntimeState.Running;
|
||||
status.LastStateChangeTime = now;
|
||||
transitionTo = GalaxyRuntimeState.Running;
|
||||
fromToGobjectId = status.GobjectId;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
status.FailureCount++;
|
||||
status.LastError = BuildErrorDetail(vtq);
|
||||
if (status.State != GalaxyRuntimeState.Stopped)
|
||||
{
|
||||
status.State = GalaxyRuntimeState.Stopped;
|
||||
status.LastStateChangeTime = now;
|
||||
transitionTo = GalaxyRuntimeState.Stopped;
|
||||
fromToGobjectId = status.GobjectId;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Invoke transition callbacks outside the lock to avoid inverting the node manager's
|
||||
// lock order when it subsequently takes its own Lock to flip hosted variables.
|
||||
if (transitionTo == GalaxyRuntimeState.Stopped)
|
||||
{
|
||||
Log.Information("Galaxy runtime {Probe} transitioned Running → Stopped ({Err})",
|
||||
tagRef, status?.LastError ?? "(no detail)");
|
||||
try { _onHostStopped?.Invoke(fromToGobjectId); }
|
||||
catch (Exception ex) { Log.Warning(ex, "onHostStopped callback threw for {Probe}", tagRef); }
|
||||
}
|
||||
else if (transitionTo == GalaxyRuntimeState.Running)
|
||||
{
|
||||
Log.Information("Galaxy runtime {Probe} transitioned → Running", tagRef);
|
||||
try { _onHostRunning?.Invoke(fromToGobjectId); }
|
||||
catch (Exception ex) { Log.Warning(ex, "onHostRunning callback threw for {Probe}", tagRef); }
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Periodic tick — flips Unknown entries to Stopped once their registration has been
|
||||
/// outstanding for longer than the configured timeout without ever receiving a first
|
||||
/// callback. Does nothing to Running or Stopped entries.
|
||||
/// </summary>
|
||||
public void Tick()
|
||||
{
|
||||
if (_disposed)
|
||||
return;
|
||||
|
||||
var transitions = new List<int>();
|
||||
lock (_lock)
|
||||
{
|
||||
var now = _clock();
|
||||
foreach (var entry in _byProbe.Values)
|
||||
{
|
||||
if (entry.State != GalaxyRuntimeState.Unknown)
|
||||
continue;
|
||||
|
||||
// LastStateChangeTime is set at creation to "now" so the timeout is measured
|
||||
// from when the probe was advised.
|
||||
if (entry.LastStateChangeTime.HasValue
|
||||
&& now - entry.LastStateChangeTime.Value > _unknownTimeout)
|
||||
{
|
||||
entry.State = GalaxyRuntimeState.Stopped;
|
||||
entry.LastStateChangeTime = now;
|
||||
entry.FailureCount++;
|
||||
entry.LastError = "Probe never received an initial callback within the unknown-resolution timeout";
|
||||
transitions.Add(entry.GobjectId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var gobjectId in transitions)
|
||||
{
|
||||
Log.Warning("Galaxy runtime gobject {GobjectId} timed out in Unknown state → Stopped", gobjectId);
|
||||
try { _onHostStopped?.Invoke(gobjectId); }
|
||||
catch (Exception ex) { Log.Warning(ex, "onHostStopped callback threw during tick for {GobjectId}", gobjectId); }
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a read-only snapshot of every tracked host. When the MxAccess transport is
|
||||
/// disconnected, every entry is rewritten to Unknown on the way out so operators aren't
|
||||
/// misled by cached per-host state — the Connection panel is the primary signal in that
|
||||
/// case. The underlying <c>_byProbe</c> map is not modified.
|
||||
/// </summary>
|
||||
public IReadOnlyList<GalaxyRuntimeStatus> GetSnapshot()
|
||||
{
|
||||
var transportDown = _client.State != ConnectionState.Connected;
|
||||
|
||||
lock (_lock)
|
||||
{
|
||||
var result = new List<GalaxyRuntimeStatus>(_byProbe.Count);
|
||||
foreach (var entry in _byProbe.Values)
|
||||
result.Add(Clone(entry, forceUnknown: transportDown));
|
||||
// Stable ordering by name so dashboard rows don't jitter between refreshes.
|
||||
result.Sort((a, b) => string.CompareOrdinal(a.ObjectName, b.ObjectName));
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public void Dispose()
|
||||
{
|
||||
List<string> probes;
|
||||
lock (_lock)
|
||||
{
|
||||
if (_disposed)
|
||||
return;
|
||||
_disposed = true;
|
||||
probes = _byProbe.Keys.ToList();
|
||||
_byProbe.Clear();
|
||||
_probeByGobjectId.Clear();
|
||||
}
|
||||
|
||||
foreach (var probe in probes)
|
||||
{
|
||||
try
|
||||
{
|
||||
_client.UnsubscribeAsync(probe).GetAwaiter().GetResult();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during Dispose", probe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void OnProbeValueChanged(string tagRef, Vtq vtq)
|
||||
{
|
||||
HandleProbeUpdate(tagRef, vtq);
|
||||
}
|
||||
|
||||
private GalaxyRuntimeStatus MakeInitialStatus(GalaxyObjectInfo obj, string kind)
|
||||
{
|
||||
return new GalaxyRuntimeStatus
|
||||
{
|
||||
ObjectName = obj.TagName,
|
||||
GobjectId = obj.GobjectId,
|
||||
Kind = kind,
|
||||
State = GalaxyRuntimeState.Unknown,
|
||||
LastStateChangeTime = _clock()
|
||||
};
|
||||
}
|
||||
|
||||
private static GalaxyRuntimeStatus Clone(GalaxyRuntimeStatus src, bool forceUnknown)
|
||||
{
|
||||
return new GalaxyRuntimeStatus
|
||||
{
|
||||
ObjectName = src.ObjectName,
|
||||
GobjectId = src.GobjectId,
|
||||
Kind = src.Kind,
|
||||
State = forceUnknown ? GalaxyRuntimeState.Unknown : src.State,
|
||||
LastStateCallbackTime = src.LastStateCallbackTime,
|
||||
LastStateChangeTime = src.LastStateChangeTime,
|
||||
LastScanState = src.LastScanState,
|
||||
LastError = forceUnknown ? null : src.LastError,
|
||||
GoodUpdateCount = src.GoodUpdateCount,
|
||||
FailureCount = src.FailureCount
|
||||
};
|
||||
}
|
||||
|
||||
private static string BuildErrorDetail(Vtq vtq)
|
||||
{
|
||||
if (vtq.Quality.IsBad())
|
||||
return $"bad quality ({vtq.Quality})";
|
||||
if (vtq.Quality.IsUncertain())
|
||||
return $"uncertain quality ({vtq.Quality})";
|
||||
if (vtq.Value is bool b && !b)
|
||||
return "ScanState = false (OffScan)";
|
||||
return $"unexpected value: {vtq.Value ?? "(null)"}";
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,7 @@ using Serilog;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Historian;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.MxAccess;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
{
|
||||
@@ -32,6 +33,19 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
private readonly AlarmObjectFilter? _alarmObjectFilter;
|
||||
private int _alarmFilterIncludedObjectCount;
|
||||
private readonly bool _anonymousCanWrite;
|
||||
|
||||
// Host → list of OPC UA variable nodes transitively hosted by that host. Populated during
|
||||
// BuildAddressSpace by walking each variable's owning object's hosted_by_gobject_id chain
|
||||
// up to the nearest $WinPlatform or $AppEngine. A variable that lives under a nested host
|
||||
// (e.g. a user object under an Engine under a Platform) appears in BOTH the Engine's and
|
||||
// the Platform's list. Used by MarkHostVariablesBadQuality / ClearHostVariablesBadQuality
|
||||
// when the galaxy runtime probe reports a host transition.
|
||||
private readonly Dictionary<int, List<BaseDataVariableState>> _hostedVariables =
|
||||
new Dictionary<int, List<BaseDataVariableState>>();
|
||||
|
||||
// Runtime status probe manager — null when MxAccessConfiguration.RuntimeStatusProbesEnabled
|
||||
// is false. Built at construction time and synced to the hierarchy on every BuildAddressSpace.
|
||||
private readonly GalaxyRuntimeProbeManager? _galaxyRuntimeProbeManager;
|
||||
private readonly AutoResetEvent _dataChangeSignal = new(false);
|
||||
private readonly Dictionary<int, List<string>> _gobjectToTagRefs = new();
|
||||
private readonly HistoryContinuationPointManager _historyContinuations = new();
|
||||
@@ -106,7 +120,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
NodeId? writeTuneRoleId = null,
|
||||
NodeId? writeConfigureRoleId = null,
|
||||
NodeId? alarmAckRoleId = null,
|
||||
AlarmObjectFilter? alarmObjectFilter = null)
|
||||
AlarmObjectFilter? alarmObjectFilter = null,
|
||||
bool runtimeStatusProbesEnabled = false,
|
||||
int runtimeStatusUnknownTimeoutSeconds = 15)
|
||||
: base(server, configuration, namespaceUri)
|
||||
{
|
||||
_namespaceUri = namespaceUri;
|
||||
@@ -121,6 +137,15 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_writeConfigureRoleId = writeConfigureRoleId;
|
||||
_alarmAckRoleId = alarmAckRoleId;
|
||||
|
||||
if (runtimeStatusProbesEnabled)
|
||||
{
|
||||
_galaxyRuntimeProbeManager = new GalaxyRuntimeProbeManager(
|
||||
_mxAccessClient,
|
||||
runtimeStatusUnknownTimeoutSeconds,
|
||||
MarkHostVariablesBadQuality,
|
||||
ClearHostVariablesBadQuality);
|
||||
}
|
||||
|
||||
// Wire up data change delivery
|
||||
_mxAccessClient.OnTagValueChanged += OnMxAccessDataChange;
|
||||
|
||||
@@ -190,6 +215,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
public IReadOnlyList<string> AlarmFilterPatterns =>
|
||||
_alarmObjectFilter?.RawPatterns ?? Array.Empty<string>();
|
||||
|
||||
/// <summary>
|
||||
/// Gets a snapshot of the runtime host states (Platforms + AppEngines). Returns an empty
|
||||
/// list when runtime status probing is disabled. The snapshot respects MxAccess transport
|
||||
/// state — when the client is disconnected, every entry is returned as
|
||||
/// <see cref="GalaxyRuntimeState.Unknown"/>.
|
||||
/// </summary>
|
||||
public IReadOnlyList<GalaxyRuntimeStatus> RuntimeStatuses =>
|
||||
_galaxyRuntimeProbeManager?.GetSnapshot() ?? (IReadOnlyList<GalaxyRuntimeStatus>)Array.Empty<GalaxyRuntimeStatus>();
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of bridge-owned runtime status probe subscriptions. Surfaced on the
|
||||
/// dashboard Subscriptions panel to distinguish probe overhead from client subscriptions.
|
||||
/// </summary>
|
||||
public int ActiveRuntimeProbeCount => _galaxyRuntimeProbeManager?.ActiveProbeCount ?? 0;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the runtime historian health snapshot, or <see langword="null"/> when the historian
|
||||
/// plugin is not loaded. Surfaced on the status dashboard so operators can detect query
|
||||
@@ -261,6 +301,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_alarmDescTags.Clear();
|
||||
_nodeMap.Clear();
|
||||
_gobjectToTagRefs.Clear();
|
||||
_hostedVariables.Clear();
|
||||
VariableNodeCount = 0;
|
||||
ObjectNodeCount = 0;
|
||||
|
||||
@@ -464,12 +505,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
if (_alarmTrackingEnabled)
|
||||
SubscribeAlarmTags();
|
||||
|
||||
BuildHostedVariablesMap(hierarchy);
|
||||
|
||||
// Sync the galaxy runtime probe set against the rebuilt hierarchy. This runs
|
||||
// synchronously on the calling thread and issues AdviseSupervisory per host —
|
||||
// expected 500ms-1s additional startup latency for a large multi-host galaxy.
|
||||
_galaxyRuntimeProbeManager?.SyncAsync(hierarchy).GetAwaiter().GetResult();
|
||||
|
||||
_lastHierarchy = new List<GalaxyObjectInfo>(hierarchy);
|
||||
_lastAttributes = new List<GalaxyAttributeInfo>(attributes);
|
||||
|
||||
Log.Information(
|
||||
"Address space built: {Objects} objects, {Variables} variables, {Mappings} tag references, {Alarms} alarm tags",
|
||||
ObjectNodeCount, VariableNodeCount, _nodeIdToTagReference.Count, _alarmInAlarmTags.Count);
|
||||
"Address space built: {Objects} objects, {Variables} variables, {Mappings} tag references, {Alarms} alarm tags, {Hosts} runtime hosts",
|
||||
ObjectNodeCount, VariableNodeCount, _nodeIdToTagReference.Count, _alarmInAlarmTags.Count,
|
||||
_hostedVariables.Count);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -499,6 +548,120 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
return includedIds;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Builds the <c>_hostedVariables</c> dictionary from the completed address space. For each
|
||||
/// Galaxy object, walks its <c>HostedByGobjectId</c> chain up to the nearest <c>$WinPlatform</c>
|
||||
/// or <c>$AppEngine</c> and appends every variable the object owns to that host's list. An
|
||||
/// object under an Engine under a Platform appears in BOTH lists so stopping the Platform
|
||||
/// invalidates every descendant Engine's variables as well.
|
||||
/// </summary>
|
||||
private void BuildHostedVariablesMap(List<GalaxyObjectInfo> hierarchy)
|
||||
{
|
||||
_hostedVariables.Clear();
|
||||
if (hierarchy == null || hierarchy.Count == 0)
|
||||
return;
|
||||
|
||||
var byId = new Dictionary<int, GalaxyObjectInfo>(hierarchy.Count);
|
||||
foreach (var obj in hierarchy)
|
||||
byId[obj.GobjectId] = obj;
|
||||
|
||||
foreach (var obj in hierarchy)
|
||||
{
|
||||
if (!_gobjectToTagRefs.TryGetValue(obj.GobjectId, out var tagRefs) || tagRefs.Count == 0)
|
||||
continue;
|
||||
|
||||
// Collect every variable node owned by this object from the tag→variable map.
|
||||
var ownedVariables = new List<BaseDataVariableState>(tagRefs.Count);
|
||||
foreach (var tagRef in tagRefs)
|
||||
if (_tagToVariableNode.TryGetValue(tagRef, out var v))
|
||||
ownedVariables.Add(v);
|
||||
|
||||
if (ownedVariables.Count == 0)
|
||||
continue;
|
||||
|
||||
// Walk HostedByGobjectId up the chain, appending to every Platform/Engine encountered.
|
||||
// Visited set defends against cycles in misconfigured galaxies.
|
||||
var visited = new HashSet<int>();
|
||||
var cursor = obj;
|
||||
var depth = 0;
|
||||
while (cursor != null && depth < 32 && visited.Add(cursor.GobjectId))
|
||||
{
|
||||
if (cursor.CategoryId == 1 || cursor.CategoryId == 3)
|
||||
{
|
||||
if (!_hostedVariables.TryGetValue(cursor.GobjectId, out var list))
|
||||
{
|
||||
list = new List<BaseDataVariableState>();
|
||||
_hostedVariables[cursor.GobjectId] = list;
|
||||
}
|
||||
list.AddRange(ownedVariables);
|
||||
}
|
||||
|
||||
if (cursor.HostedByGobjectId == 0 ||
|
||||
!byId.TryGetValue(cursor.HostedByGobjectId, out var next))
|
||||
break;
|
||||
cursor = next;
|
||||
depth++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Flips every OPC UA variable hosted by the given Galaxy runtime object (Platform or
|
||||
/// AppEngine) to <see cref="StatusCodes.BadOutOfService"/>. Invoked by the runtime probe
|
||||
/// manager's Running → Stopped callback. Safe to call with an unknown gobject id — no-op.
|
||||
/// </summary>
|
||||
/// <param name="gobjectId">The runtime host's gobject_id.</param>
|
||||
public void MarkHostVariablesBadQuality(int gobjectId)
|
||||
{
|
||||
List<BaseDataVariableState>? variables;
|
||||
lock (Lock)
|
||||
{
|
||||
if (!_hostedVariables.TryGetValue(gobjectId, out variables))
|
||||
return;
|
||||
|
||||
var now = DateTime.UtcNow;
|
||||
foreach (var variable in variables)
|
||||
{
|
||||
variable.StatusCode = StatusCodes.BadOutOfService;
|
||||
variable.Timestamp = now;
|
||||
variable.ClearChangeMasks(SystemContext, false);
|
||||
}
|
||||
}
|
||||
|
||||
Log.Information(
|
||||
"Marked {Count} variable(s) BadOutOfService for stopped host gobject_id={GobjectId}",
|
||||
variables.Count, gobjectId);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resets every OPC UA variable hosted by the given Galaxy runtime object to
|
||||
/// <see cref="StatusCodes.Good"/>. Invoked by the runtime probe manager's Stopped → Running
|
||||
/// callback. Values are left as-is; subsequent MxAccess on-change updates will refresh them
|
||||
/// as tags change naturally.
|
||||
/// </summary>
|
||||
/// <param name="gobjectId">The runtime host's gobject_id.</param>
|
||||
public void ClearHostVariablesBadQuality(int gobjectId)
|
||||
{
|
||||
List<BaseDataVariableState>? variables;
|
||||
lock (Lock)
|
||||
{
|
||||
if (!_hostedVariables.TryGetValue(gobjectId, out variables))
|
||||
return;
|
||||
|
||||
var now = DateTime.UtcNow;
|
||||
foreach (var variable in variables)
|
||||
{
|
||||
variable.StatusCode = StatusCodes.Good;
|
||||
variable.Timestamp = now;
|
||||
variable.ClearChangeMasks(SystemContext, false);
|
||||
}
|
||||
}
|
||||
|
||||
Log.Information(
|
||||
"Cleared bad-quality override on {Count} variable(s) for recovered host gobject_id={GobjectId}",
|
||||
variables.Count, gobjectId);
|
||||
}
|
||||
|
||||
private void SubscribeAlarmTags()
|
||||
{
|
||||
foreach (var kvp in _alarmInAlarmTags)
|
||||
@@ -2116,6 +2279,14 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
if (_dispatchDisposed)
|
||||
return;
|
||||
|
||||
// Runtime status probes are bridge-owned subscriptions whose only job is to drive the
|
||||
// host state machine; they are NOT in _tagToVariableNode, so the normal dispatch path
|
||||
// would drop them anyway. Route probe addresses directly to the probe manager and skip
|
||||
// the dispatch queue entirely.
|
||||
if (_galaxyRuntimeProbeManager != null
|
||||
&& _galaxyRuntimeProbeManager.HandleProbeUpdate(address, vtq))
|
||||
return;
|
||||
|
||||
Interlocked.Increment(ref _totalMxChangeEvents);
|
||||
_pendingDataChanges[address] = vtq;
|
||||
try
|
||||
@@ -2162,6 +2333,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
if (!_dispatchRunning)
|
||||
break;
|
||||
|
||||
// Drive time-based probe state transitions on every dispatch tick. The dispatch
|
||||
// loop already wakes every 100ms via the WaitOne timeout, so this gives us a
|
||||
// ~10Hz cadence for the Unknown → Stopped timeout without introducing a new
|
||||
// thread or timer. No-op when the probe manager is disabled.
|
||||
_galaxyRuntimeProbeManager?.Tick();
|
||||
|
||||
var keys = _pendingDataChanges.Keys.ToList();
|
||||
if (keys.Count == 0)
|
||||
{
|
||||
@@ -2376,6 +2553,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
{
|
||||
_dispatchDisposed = true;
|
||||
_mxAccessClient.OnTagValueChanged -= OnMxAccessDataChange;
|
||||
// Dispose the runtime probe manager before the MxAccess client teardown so its
|
||||
// Unadvise calls reach a live client. Disposing the node manager normally runs
|
||||
// BEFORE the node manager's containing OpcUaServerHost releases the MxAccess
|
||||
// client, so the probes close cleanly.
|
||||
_galaxyRuntimeProbeManager?.Dispose();
|
||||
StopDispatchThread();
|
||||
_dataChangeSignal.Dispose();
|
||||
}
|
||||
|
||||
@@ -37,11 +37,16 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
private NodeId? _writeOperateRoleId;
|
||||
private NodeId? _writeTuneRoleId;
|
||||
|
||||
private readonly bool _runtimeStatusProbesEnabled;
|
||||
private readonly int _runtimeStatusUnknownTimeoutSeconds;
|
||||
|
||||
public LmxOpcUaServer(string galaxyName, IMxAccessClient mxAccessClient, PerformanceMetrics metrics,
|
||||
IHistorianDataSource? historianDataSource = null, bool alarmTrackingEnabled = false,
|
||||
AuthenticationConfiguration? authConfig = null, IUserAuthenticationProvider? authProvider = null,
|
||||
RedundancyConfiguration? redundancyConfig = null, string? applicationUri = null,
|
||||
AlarmObjectFilter? alarmObjectFilter = null)
|
||||
AlarmObjectFilter? alarmObjectFilter = null,
|
||||
bool runtimeStatusProbesEnabled = false,
|
||||
int runtimeStatusUnknownTimeoutSeconds = 15)
|
||||
{
|
||||
_galaxyName = galaxyName;
|
||||
_mxAccessClient = mxAccessClient;
|
||||
@@ -53,6 +58,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_authProvider = authProvider;
|
||||
_redundancyConfig = redundancyConfig ?? new RedundancyConfiguration();
|
||||
_applicationUri = applicationUri;
|
||||
_runtimeStatusProbesEnabled = runtimeStatusProbesEnabled;
|
||||
_runtimeStatusUnknownTimeoutSeconds = runtimeStatusUnknownTimeoutSeconds;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -89,7 +96,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
NodeManager = new LmxNodeManager(server, configuration, namespaceUri, _mxAccessClient, _metrics,
|
||||
_historianDataSource, _alarmTrackingEnabled, _authConfig.AnonymousCanWrite,
|
||||
_writeOperateRoleId, _writeTuneRoleId, _writeConfigureRoleId, _alarmAckRoleId,
|
||||
_alarmObjectFilter);
|
||||
_alarmObjectFilter,
|
||||
_runtimeStatusProbesEnabled, _runtimeStatusUnknownTimeoutSeconds);
|
||||
|
||||
var nodeManagers = new List<INodeManager> { NodeManager };
|
||||
return new MasterNodeManager(server, configuration, null, nodeManagers.ToArray());
|
||||
|
||||
@@ -45,7 +45,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
IUserAuthenticationProvider? authProvider = null,
|
||||
SecurityProfileConfiguration? securityConfig = null,
|
||||
RedundancyConfiguration? redundancyConfig = null,
|
||||
AlarmObjectFilter? alarmObjectFilter = null)
|
||||
AlarmObjectFilter? alarmObjectFilter = null,
|
||||
MxAccessConfiguration? mxAccessConfig = null)
|
||||
{
|
||||
_config = config;
|
||||
_mxAccessClient = mxAccessClient;
|
||||
@@ -56,8 +57,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_securityConfig = securityConfig ?? new SecurityProfileConfiguration();
|
||||
_redundancyConfig = redundancyConfig ?? new RedundancyConfiguration();
|
||||
_alarmObjectFilter = alarmObjectFilter;
|
||||
_mxAccessConfig = mxAccessConfig ?? new MxAccessConfiguration();
|
||||
}
|
||||
|
||||
private readonly MxAccessConfiguration _mxAccessConfig;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the active node manager that holds the published Galaxy namespace.
|
||||
/// </summary>
|
||||
@@ -239,7 +243,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
|
||||
_server = new LmxOpcUaServer(_config.GalaxyName, _mxAccessClient, _metrics, _historianDataSource,
|
||||
_config.AlarmTrackingEnabled, _authConfig, _authProvider, _redundancyConfig, applicationUri,
|
||||
_alarmObjectFilter);
|
||||
_alarmObjectFilter,
|
||||
_mxAccessConfig.RuntimeStatusProbesEnabled,
|
||||
_mxAccessConfig.RuntimeStatusUnknownTimeoutSeconds);
|
||||
await _application.Start(_server);
|
||||
|
||||
Log.Information(
|
||||
|
||||
@@ -245,7 +245,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
|
||||
string.Join(", ", _config.OpcUa.AlarmFilter.ObjectFilters));
|
||||
|
||||
ServerHost = new OpcUaServerHost(_config.OpcUa, effectiveMxClient, Metrics, _historianDataSource,
|
||||
_config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter);
|
||||
_config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter,
|
||||
_config.MxAccess);
|
||||
|
||||
// Step 9-10: Query hierarchy, start server, build address space
|
||||
DateTime? initialDeployTime = null;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using System.Linq;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
|
||||
|
||||
@@ -21,7 +22,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
ConnectionState connectionState,
|
||||
PerformanceMetrics? metrics,
|
||||
HistorianStatusInfo? historian = null,
|
||||
AlarmStatusInfo? alarms = null)
|
||||
AlarmStatusInfo? alarms = null,
|
||||
RuntimeStatusInfo? runtime = null)
|
||||
{
|
||||
// Rule 1: Not connected → Unhealthy
|
||||
if (connectionState != ConnectionState.Connected)
|
||||
@@ -98,6 +100,23 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
Color = "yellow"
|
||||
};
|
||||
|
||||
// Rule 2e: Any Galaxy runtime host (Platform/AppEngine) is Stopped → Degraded.
|
||||
// Runs after the transport check so that MxAccess-disconnected remains Unhealthy via
|
||||
// Rule 1 without also firing the runtime rule — avoids a double-message when the
|
||||
// transport is the root cause of every host going Unknown/Stopped.
|
||||
if (runtime != null && runtime.StoppedCount > 0)
|
||||
{
|
||||
var stoppedNames = string.Join(", ",
|
||||
runtime.Hosts.Where(h => h.State == Domain.GalaxyRuntimeState.Stopped).Select(h => h.ObjectName));
|
||||
return new HealthInfo
|
||||
{
|
||||
Status = "Degraded",
|
||||
Message =
|
||||
$"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}",
|
||||
Color = "yellow"
|
||||
};
|
||||
}
|
||||
|
||||
// Rule 3: All good
|
||||
return new HealthInfo
|
||||
{
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Domain;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
@@ -59,12 +60,49 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
/// </summary>
|
||||
public EndpointsInfo Endpoints { get; set; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the Galaxy runtime host state (Platforms + AppEngines).
|
||||
/// </summary>
|
||||
public RuntimeStatusInfo RuntimeStatus { get; set; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets footer details such as the snapshot timestamp and service version.
|
||||
/// </summary>
|
||||
public FooterInfo Footer { get; set; } = new();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dashboard model summarizing per-host Galaxy runtime state.
|
||||
/// </summary>
|
||||
public class RuntimeStatusInfo
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets or sets the total number of tracked runtime hosts ($WinPlatform + $AppEngine).
|
||||
/// </summary>
|
||||
public int Total { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the count of hosts currently reported Running.
|
||||
/// </summary>
|
||||
public int RunningCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the count of hosts currently reported Stopped.
|
||||
/// </summary>
|
||||
public int StoppedCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the count of hosts whose state is still Unknown (either awaiting initial
|
||||
/// probe resolution or transported-through-disconnected).
|
||||
/// </summary>
|
||||
public int UnknownCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the per-host state in stable alphabetical order.
|
||||
/// </summary>
|
||||
public List<GalaxyRuntimeStatus> Hosts { get; set; } = new();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dashboard model describing the OPC UA server's listening endpoints and active security profiles.
|
||||
/// </summary>
|
||||
@@ -156,8 +194,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets or sets the number of active tag subscriptions mirrored from MXAccess into OPC UA.
|
||||
/// This total includes bridge-owned runtime status probes; see <see cref="ProbeCount"/> for the
|
||||
/// subset attributable to probes.
|
||||
/// </summary>
|
||||
public int ActiveCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the count of bridge-owned runtime status probes included in
|
||||
/// <see cref="ActiveCount"/>. Surfaced on the dashboard so operators can distinguish probe
|
||||
/// overhead from client-driven subscription load.
|
||||
/// </summary>
|
||||
public int ProbeCount { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -88,10 +88,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
ReconnectCount = _mxAccessClient?.ReconnectCount ?? 0,
|
||||
ActiveSessions = _serverHost?.ActiveSessionCount ?? 0
|
||||
},
|
||||
Health = _healthCheck.CheckHealth(connectionState, _metrics, historianInfo, alarmInfo),
|
||||
Health = _healthCheck.CheckHealth(connectionState, _metrics, historianInfo, alarmInfo, BuildRuntimeStatusInfo()),
|
||||
Subscriptions = new SubscriptionInfo
|
||||
{
|
||||
ActiveCount = _mxAccessClient?.ActiveSubscriptionCount ?? 0
|
||||
ActiveCount = _mxAccessClient?.ActiveSubscriptionCount ?? 0,
|
||||
ProbeCount = _nodeManager?.ActiveRuntimeProbeCount ?? 0
|
||||
},
|
||||
Galaxy = new GalaxyInfo
|
||||
{
|
||||
@@ -114,6 +115,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
Alarms = alarmInfo,
|
||||
Redundancy = BuildRedundancyInfo(),
|
||||
Endpoints = BuildEndpointsInfo(),
|
||||
RuntimeStatus = BuildRuntimeStatusInfo(),
|
||||
Footer = new FooterInfo
|
||||
{
|
||||
Timestamp = DateTime.UtcNow,
|
||||
@@ -192,6 +194,26 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
return info;
|
||||
}
|
||||
|
||||
private RuntimeStatusInfo BuildRuntimeStatusInfo()
|
||||
{
|
||||
var hosts = _nodeManager?.RuntimeStatuses?.ToList() ?? new List<GalaxyRuntimeStatus>();
|
||||
var info = new RuntimeStatusInfo
|
||||
{
|
||||
Total = hosts.Count,
|
||||
Hosts = hosts
|
||||
};
|
||||
foreach (var host in hosts)
|
||||
{
|
||||
switch (host.State)
|
||||
{
|
||||
case GalaxyRuntimeState.Running: info.RunningCount++; break;
|
||||
case GalaxyRuntimeState.Stopped: info.StoppedCount++; break;
|
||||
default: info.UnknownCount++; break;
|
||||
}
|
||||
}
|
||||
return info;
|
||||
}
|
||||
|
||||
private RedundancyInfo? BuildRedundancyInfo()
|
||||
{
|
||||
if (_redundancyConfig == null || !_redundancyConfig.Enabled)
|
||||
@@ -300,7 +322,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
|
||||
// Subscriptions panel
|
||||
sb.AppendLine("<div class='panel gray'><h2>Subscriptions</h2>");
|
||||
sb.AppendLine($"<p>Active: {data.Subscriptions.ActiveCount}</p>");
|
||||
sb.AppendLine($"<p>Active: <b>{data.Subscriptions.ActiveCount}</b></p>");
|
||||
if (data.Subscriptions.ProbeCount > 0)
|
||||
sb.AppendLine(
|
||||
$"<p>Probes: {data.Subscriptions.ProbeCount} (bridge-owned runtime status)</p>");
|
||||
sb.AppendLine("</div>");
|
||||
|
||||
// Data Change Dispatch panel
|
||||
@@ -318,6 +343,32 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
sb.AppendLine($"<p>Last Rebuild: {data.Galaxy.LastRebuildTime:O}</p>");
|
||||
sb.AppendLine("</div>");
|
||||
|
||||
// Galaxy Runtime panel — per-host Platform + AppEngine state
|
||||
if (data.RuntimeStatus.Total > 0)
|
||||
{
|
||||
var rtColor = data.RuntimeStatus.StoppedCount > 0 ? "red"
|
||||
: data.RuntimeStatus.UnknownCount > 0 ? "yellow"
|
||||
: "green";
|
||||
sb.AppendLine($"<div class='panel {rtColor}'><h2>Galaxy Runtime</h2>");
|
||||
sb.AppendLine(
|
||||
$"<p>{data.RuntimeStatus.RunningCount} of {data.RuntimeStatus.Total} hosts running" +
|
||||
$" ({data.RuntimeStatus.StoppedCount} stopped, {data.RuntimeStatus.UnknownCount} unknown)</p>");
|
||||
sb.AppendLine("<table><tr><th>Name</th><th>Kind</th><th>State</th><th>Since</th><th>Last Error</th></tr>");
|
||||
foreach (var host in data.RuntimeStatus.Hosts)
|
||||
{
|
||||
var since = host.LastStateChangeTime?.ToString("O") ?? "-";
|
||||
var err = WebUtility.HtmlEncode(host.LastError ?? "");
|
||||
sb.AppendLine(
|
||||
$"<tr><td>{WebUtility.HtmlEncode(host.ObjectName)}</td>" +
|
||||
$"<td>{WebUtility.HtmlEncode(host.Kind)}</td>" +
|
||||
$"<td>{host.State}</td>" +
|
||||
$"<td>{since}</td>" +
|
||||
$"<td><code>{err}</code></td></tr>");
|
||||
}
|
||||
sb.AppendLine("</table>");
|
||||
sb.AppendLine("</div>");
|
||||
}
|
||||
|
||||
// Historian panel
|
||||
var anyClusterNodeFailed =
|
||||
data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount < data.Historian.NodeCount;
|
||||
|
||||
@@ -23,7 +23,9 @@
|
||||
"MonitorIntervalSeconds": 5,
|
||||
"AutoReconnect": true,
|
||||
"ProbeTag": null,
|
||||
"ProbeStaleThresholdSeconds": 60
|
||||
"ProbeStaleThresholdSeconds": 60,
|
||||
"RuntimeStatusProbesEnabled": true,
|
||||
"RuntimeStatusUnknownTimeoutSeconds": 15
|
||||
},
|
||||
"GalaxyRepository": {
|
||||
"ConnectionString": "Server=localhost;Database=ZB;Integrated Security=true;",
|
||||
|
||||
Reference in New Issue
Block a user