Close all four stability-review 2026-04-13 findings so a failed runtime probe subscription can no longer leave a phantom entry that Tick() flips to Stopped and fans out false BadOutOfService quality across a host's subtree, a silently-failed dashboard bind no longer lets the service advertise a successful start while an operator-visible endpoint is dead, the seven sync-over-async sites in LmxNodeManager (rebuild probe sync, Read, Write, four HistoryRead overrides) can no longer park the OPC UA stack thread indefinitely on a hung backend, and alarm auto-subscribe + transferred-subscription restore no longer race shutdown as untracked fire-and-forget tasks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-14 00:48:07 -04:00
parent 731092595f
commit c76ab8fdee
21 changed files with 869 additions and 53 deletions

View File

@@ -74,8 +74,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
config.MxAccess.MonitorIntervalSeconds, config.MxAccess.AutoReconnect,
config.MxAccess.ProbeTag ?? "(none)", config.MxAccess.ProbeStaleThresholdSeconds);
Log.Information(
"MxAccess.RuntimeStatusProbesEnabled={Enabled}, RuntimeStatusUnknownTimeoutSeconds={Timeout}s",
config.MxAccess.RuntimeStatusProbesEnabled, config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
"MxAccess.RuntimeStatusProbesEnabled={Enabled}, RuntimeStatusUnknownTimeoutSeconds={Timeout}s, RequestTimeoutSeconds={RequestTimeout}s",
config.MxAccess.RuntimeStatusProbesEnabled, config.MxAccess.RuntimeStatusUnknownTimeoutSeconds,
config.MxAccess.RequestTimeoutSeconds);
if (string.IsNullOrWhiteSpace(config.MxAccess.ClientName))
{
@@ -88,6 +89,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
"MxAccess.RuntimeStatusUnknownTimeoutSeconds={Timeout} is below the recommended floor of 5s; initial probe resolution may time out before MxAccess has delivered the first callback",
config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
if (config.MxAccess.RequestTimeoutSeconds < 1)
{
Log.Error("MxAccess.RequestTimeoutSeconds must be at least 1");
valid = false;
}
else if (config.MxAccess.RequestTimeoutSeconds <
Math.Max(config.MxAccess.ReadTimeoutSeconds, config.MxAccess.WriteTimeoutSeconds))
{
Log.Warning(
"MxAccess.RequestTimeoutSeconds={RequestTimeout} is below Read/Write inner timeouts ({Read}s/{Write}s); outer safety bound may fire before the inner client completes its own error path",
config.MxAccess.RequestTimeoutSeconds,
config.MxAccess.ReadTimeoutSeconds, config.MxAccess.WriteTimeoutSeconds);
}
// Galaxy Repository
Log.Information(
"GalaxyRepository.ConnectionString={ConnectionString}, ChangeDetectionInterval={ChangeInterval}s, CommandTimeout={CmdTimeout}s, ExtendedAttributes={ExtendedAttributes}",
@@ -145,9 +160,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
config.Historian.Enabled, effectiveNodes, config.Historian.IntegratedSecurity,
config.Historian.Port);
Log.Information(
"Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}, FailureCooldownSeconds={Cooldown}",
"Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}, FailureCooldownSeconds={Cooldown}, RequestTimeoutSeconds={RequestTimeout}",
config.Historian.CommandTimeoutSeconds, config.Historian.MaxValuesPerRead,
config.Historian.FailureCooldownSeconds);
config.Historian.FailureCooldownSeconds, config.Historian.RequestTimeoutSeconds);
if (config.Historian.Enabled)
{
@@ -163,6 +178,18 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
valid = false;
}
if (config.Historian.RequestTimeoutSeconds < 1)
{
Log.Error("Historian.RequestTimeoutSeconds must be at least 1");
valid = false;
}
else if (config.Historian.RequestTimeoutSeconds < config.Historian.CommandTimeoutSeconds)
{
Log.Warning(
"Historian.RequestTimeoutSeconds={RequestTimeout} is below CommandTimeoutSeconds={CmdTimeout}; outer safety bound may fire before the inner SDK completes its own error path",
config.Historian.RequestTimeoutSeconds, config.Historian.CommandTimeoutSeconds);
}
if (clusterNodes.Count > 0 && !string.IsNullOrWhiteSpace(config.Historian.ServerName)
&& config.Historian.ServerName != "localhost")
Log.Warning(

View File

@@ -63,5 +63,14 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
/// </summary>
public int MaxValuesPerRead { get; set; } = 10000;
/// <summary>
/// Gets or sets an outer safety timeout, in seconds, applied to sync-over-async Historian
/// operations invoked from the OPC UA stack thread (HistoryReadRaw, HistoryReadProcessed,
/// HistoryReadAtTime, HistoryReadEvents). This is a backstop for the case where a
/// historian query hangs outside <see cref="CommandTimeoutSeconds"/> — e.g., a slow SDK
/// reconnect or mid-failover cluster node. Must be comfortably larger than
/// <see cref="CommandTimeoutSeconds"/> so normal operation is never affected. Default 60s.
/// </summary>
/// <remarks>
/// The configuration validator logs an error and marks the configuration invalid when this
/// value is below 1, and logs a warning when it is below <see cref="CommandTimeoutSeconds"/>.
/// The node manager additionally clamps the value it receives to a minimum of 1 second.
/// NOTE(review): the validator check may only run when the historian is enabled — confirm.
/// </remarks>
public int RequestTimeoutSeconds { get; set; } = 60;
}
}

View File

@@ -30,6 +30,16 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
/// </summary>
public int WriteTimeoutSeconds { get; set; } = 5;
/// <summary>
/// Gets or sets an outer safety timeout, in seconds, applied to sync-over-async MxAccess
/// operations invoked from the OPC UA stack thread (Read, Write, address-space rebuild probe
/// sync). This is a backstop for the case where an async path hangs outside the inner
/// <see cref="ReadTimeoutSeconds"/> / <see cref="WriteTimeoutSeconds"/> bounds — e.g., a
/// slow reconnect or a scheduler stall. Must be comfortably larger than the inner timeouts
/// so normal operation is never affected. Default 30s.
/// </summary>
/// <remarks>
/// The configuration validator logs an error and marks the configuration invalid when this
/// value is below 1, and logs a warning when it is below the larger of
/// <see cref="ReadTimeoutSeconds"/> and <see cref="WriteTimeoutSeconds"/>. The node manager
/// additionally clamps the value it receives to a minimum of 1 second.
/// </remarks>
public int RequestTimeoutSeconds { get; set; } = 30;
/// <summary>
/// Gets or sets the cap on concurrent MXAccess operations so the bridge does not overload the runtime.
/// </summary>

View File

@@ -171,11 +171,13 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
}
// Compute diffs under lock, release lock before issuing SDK calls (which can block).
List<string> toSubscribe;
// toSubscribe carries the gobject id alongside the probe name so the rollback path on
// subscribe failure can unwind both dictionaries without a reverse lookup.
List<(int GobjectId, string Probe)> toSubscribe;
List<string> toUnsubscribe;
lock (_lock)
{
toSubscribe = new List<string>();
toSubscribe = new List<(int, string)>();
toUnsubscribe = new List<string>();
foreach (var kvp in desired)
@@ -190,14 +192,14 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
_byProbe.Remove(existingProbe);
_probeByGobjectId.Remove(kvp.Key);
toSubscribe.Add(kvp.Value.Probe);
toSubscribe.Add((kvp.Key, kvp.Value.Probe));
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
}
}
else
{
toSubscribe.Add(kvp.Value.Probe);
toSubscribe.Add((kvp.Key, kvp.Value.Probe));
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
}
@@ -215,7 +217,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
}
// Apply the diff outside the lock.
foreach (var probe in toSubscribe)
foreach (var (gobjectId, probe) in toSubscribe)
{
try
{
@@ -225,6 +227,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
catch (Exception ex)
{
Log.Warning(ex, "Failed to advise galaxy runtime probe {Probe}", probe);
// Roll back the pending entry so Tick() can't later transition a never-advised
// probe from Unknown to Stopped and fan out a false-negative host-down signal.
// A concurrent SyncAsync may have re-added the same gobject under a new probe
// name, so compare against the captured probe string before removing.
lock (_lock)
{
if (_probeByGobjectId.TryGetValue(gobjectId, out var current)
&& string.Equals(current, probe, StringComparison.OrdinalIgnoreCase))
{
_probeByGobjectId.Remove(gobjectId);
}
_byProbe.Remove(probe);
}
}
}

View File

@@ -11,6 +11,7 @@ using ZB.MOM.WW.LmxOpcUa.Host.Domain;
using ZB.MOM.WW.LmxOpcUa.Host.Historian;
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
using ZB.MOM.WW.LmxOpcUa.Host.MxAccess;
using ZB.MOM.WW.LmxOpcUa.Host.Utilities;
namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
{
@@ -107,6 +108,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
private readonly NodeId? _writeConfigureRoleId;
private readonly NodeId? _writeOperateRoleId;
private readonly NodeId? _writeTuneRoleId;
private readonly TimeSpan _mxAccessRequestTimeout;
private readonly TimeSpan _historianRequestTimeout;
private long _dispatchCycleCount;
private long _suppressedUpdatesCount;
private volatile bool _dispatchDisposed;
@@ -128,6 +131,13 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
private long _alarmAckEventCount;
private long _alarmAckWriteFailures;
// Background subscribe tracking: every fire-and-forget SubscribeAsync for alarm auto-subscribe
// and transferred-subscription restore is registered here so shutdown can drain pending work
// with a bounded timeout, and so tests can observe pending count without races.
private readonly ConcurrentDictionary<long, Task> _pendingBackgroundSubscribes =
new ConcurrentDictionary<long, Task>();
private long _backgroundSubscribeCounter;
/// <summary>
/// Initializes a new node manager for the Galaxy-backed OPC UA namespace.
/// </summary>
@@ -156,7 +166,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
NodeId? alarmAckRoleId = null,
AlarmObjectFilter? alarmObjectFilter = null,
bool runtimeStatusProbesEnabled = false,
int runtimeStatusUnknownTimeoutSeconds = 15)
int runtimeStatusUnknownTimeoutSeconds = 15,
int mxAccessRequestTimeoutSeconds = 30,
int historianRequestTimeoutSeconds = 60)
: base(server, configuration, namespaceUri)
{
_namespaceUri = namespaceUri;
@@ -170,6 +182,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_writeTuneRoleId = writeTuneRoleId;
_writeConfigureRoleId = writeConfigureRoleId;
_alarmAckRoleId = alarmAckRoleId;
_mxAccessRequestTimeout = TimeSpan.FromSeconds(Math.Max(1, mxAccessRequestTimeoutSeconds));
_historianRequestTimeout = TimeSpan.FromSeconds(Math.Max(1, historianRequestTimeoutSeconds));
if (runtimeStatusProbesEnabled)
{
@@ -569,7 +583,24 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
// Sync the galaxy runtime probe set against the rebuilt hierarchy. This runs
// synchronously on the calling thread and issues AdviseSupervisory per host —
// expected 500ms-1s additional startup latency for a large multi-host galaxy.
_galaxyRuntimeProbeManager?.SyncAsync(hierarchy).GetAwaiter().GetResult();
// Bounded by _mxAccessRequestTimeout so a hung probe sync cannot park the address
// space rebuild indefinitely; on timeout we log a warning and continue with the
// partial probe set (probe sync is advisory, not required for address space correctness).
if (_galaxyRuntimeProbeManager != null)
{
try
{
SyncOverAsync.WaitSync(
_galaxyRuntimeProbeManager.SyncAsync(hierarchy),
_mxAccessRequestTimeout,
"GalaxyRuntimeProbeManager.SyncAsync");
}
catch (TimeoutException ex)
{
Log.Warning(ex, "Runtime probe sync exceeded {Timeout}s; continuing with partial probe set",
_mxAccessRequestTimeout.TotalSeconds);
}
}
_lastHierarchy = new List<GalaxyObjectInfo>(hierarchy);
_lastAttributes = new List<GalaxyAttributeInfo>(attributes);
@@ -854,15 +885,40 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
{
if (string.IsNullOrEmpty(tag) || !_tagToVariableNode.ContainsKey(tag))
continue;
var alarmTag = tag;
_mxAccessClient.SubscribeAsync(alarmTag, (_, _) => { })
.ContinueWith(t => Log.Warning(t.Exception?.InnerException,
"Failed to auto-subscribe to alarm tag {Tag}", alarmTag),
TaskContinuationOptions.OnlyOnFaulted);
TrackBackgroundSubscribe(tag, "alarm auto-subscribe");
}
}
}
/// <summary>
/// Starts <c>SubscribeAsync</c> for <paramref name="tag"/> without awaiting it and records the
/// returned task in <c>_pendingBackgroundSubscribes</c> so shutdown can wait for in-flight
/// work with a bounded timeout. A continuation removes the entry once the task finishes and
/// logs a warning if it faulted.
/// </summary>
/// <param name="tag">Tag reference to subscribe to. The data-change callback passed to the client is a no-op.</param>
/// <param name="context">Short label included in the warning logged when the subscribe faults.</param>
/// <remarks>
/// Does nothing once <c>_dispatchDisposed</c> is set. Cancelled tasks are removed from the
/// pending set but are not logged; only faults are.
/// </remarks>
private void TrackBackgroundSubscribe(string tag, string context)
{
    if (_dispatchDisposed)
        return;

    var id = Interlocked.Increment(ref _backgroundSubscribeCounter);
    var task = _mxAccessClient.SubscribeAsync(tag, (_, _) => { });

    // Register before attaching the continuation: even if the task has already completed,
    // the continuation's TryRemove can then never run ahead of the add and leak an entry.
    _pendingBackgroundSubscribes[id] = task;

    // Pin the continuation to the default scheduler instead of the ambient
    // TaskScheduler.Current so the bookkeeping never depends on the caller's scheduler.
    task.ContinueWith(
        t =>
        {
            _pendingBackgroundSubscribes.TryRemove(id, out _);
            if (t.IsFaulted)
                Log.Warning(t.Exception?.InnerException, "Background subscribe failed ({Context}) for {Tag}",
                    context, tag);
        },
        CancellationToken.None,
        TaskContinuationOptions.ExecuteSynchronously,
        TaskScheduler.Default);
}
/// <summary>
/// Gets how many tracked background subscribe tasks are still registered, i.e. have not yet
/// been removed by their completion continuation.
/// </summary>
internal int PendingBackgroundSubscribeCount
{
    get { return _pendingBackgroundSubscribes.Count; }
}
private ServiceResult OnAlarmAcknowledge(
ISystemContext context, ConditionState condition, byte[] eventId, LocalizedText comment)
{
@@ -1358,11 +1414,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
{
if (string.IsNullOrEmpty(tag) || !_tagToVariableNode.ContainsKey(tag))
continue;
var subtreeAlarmTag = tag;
_mxAccessClient.SubscribeAsync(subtreeAlarmTag, (_, _) => { })
.ContinueWith(t => Log.Warning(t.Exception?.InnerException,
"Failed to subscribe alarm tag in subtree {Tag}", subtreeAlarmTag),
TaskContinuationOptions.OnlyOnFaulted);
TrackBackgroundSubscribe(tag, "subtree alarm auto-subscribe");
}
}
}
@@ -1705,10 +1757,18 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
try
{
var vtq = _mxAccessClient.ReadAsync(tagRef).GetAwaiter().GetResult();
var vtq = SyncOverAsync.WaitSync(
_mxAccessClient.ReadAsync(tagRef),
_mxAccessRequestTimeout,
"MxAccessClient.ReadAsync");
results[i] = CreatePublishedDataValue(tagRef, vtq);
errors[i] = ServiceResult.Good;
}
catch (TimeoutException ex)
{
Log.Warning(ex, "Read timed out for {TagRef}", tagRef);
errors[i] = new ServiceResult(StatusCodes.BadTimeout);
}
catch (Exception ex)
{
Log.Warning(ex, "Read failed for {TagRef}", tagRef);
@@ -1779,7 +1839,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
value = updatedArray;
}
var success = _mxAccessClient.WriteAsync(tagRef, value).GetAwaiter().GetResult();
var success = SyncOverAsync.WaitSync(
_mxAccessClient.WriteAsync(tagRef, value),
_mxAccessRequestTimeout,
"MxAccessClient.WriteAsync");
if (success)
{
PublishLocalWrite(tagRef, value);
@@ -1790,6 +1853,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
errors[i] = new ServiceResult(StatusCodes.BadInternalError);
}
}
catch (TimeoutException ex)
{
Log.Warning(ex, "Write timed out for {TagRef}", tagRef);
errors[i] = new ServiceResult(StatusCodes.BadTimeout);
}
catch (Exception ex)
{
Log.Warning(ex, "Write failed for {TagRef}", tagRef);
@@ -2017,15 +2085,23 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
try
{
var maxValues = details.NumValuesPerNode > 0 ? (int)details.NumValuesPerNode : 0;
var dataValues = _historianDataSource.ReadRawAsync(
tagRef, details.StartTime, details.EndTime, maxValues)
.GetAwaiter().GetResult();
var dataValues = SyncOverAsync.WaitSync(
_historianDataSource.ReadRawAsync(
tagRef, details.StartTime, details.EndTime, maxValues),
_historianRequestTimeout,
"HistorianDataSource.ReadRawAsync");
if (details.ReturnBounds)
AddBoundingValues(dataValues, details.StartTime, details.EndTime);
ReturnHistoryPage(dataValues, details.NumValuesPerNode, results, errors, idx);
}
catch (TimeoutException ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead raw timed out for {TagRef}", tagRef);
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
@@ -2094,13 +2170,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
using var historyScope = _metrics.BeginOperation("HistoryReadProcessed");
try
{
var dataValues = _historianDataSource.ReadAggregateAsync(
var dataValues = SyncOverAsync.WaitSync(
_historianDataSource.ReadAggregateAsync(
tagRef, details.StartTime, details.EndTime,
details.ProcessingInterval, column)
.GetAwaiter().GetResult();
details.ProcessingInterval, column),
_historianRequestTimeout,
"HistorianDataSource.ReadAggregateAsync");
ReturnHistoryPage(dataValues, 0, results, errors, idx);
}
catch (TimeoutException ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead processed timed out for {TagRef}", tagRef);
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
@@ -2150,8 +2234,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
for (var i = 0; i < details.ReqTimes.Count; i++)
timestamps[i] = details.ReqTimes[i];
var dataValues = _historianDataSource.ReadAtTimeAsync(tagRef, timestamps)
.GetAwaiter().GetResult();
var dataValues = SyncOverAsync.WaitSync(
_historianDataSource.ReadAtTimeAsync(tagRef, timestamps),
_historianRequestTimeout,
"HistorianDataSource.ReadAtTimeAsync");
var historyData = new HistoryData();
historyData.DataValues.AddRange(dataValues);
@@ -2163,6 +2249,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
};
errors[idx] = ServiceResult.Good;
}
catch (TimeoutException ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead at-time timed out for {TagRef}", tagRef);
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
@@ -2215,9 +2307,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
try
{
var maxEvents = details.NumValuesPerNode > 0 ? (int)details.NumValuesPerNode : 0;
var events = _historianDataSource.ReadEventsAsync(
sourceName, details.StartTime, details.EndTime, maxEvents)
.GetAwaiter().GetResult();
var events = SyncOverAsync.WaitSync(
_historianDataSource.ReadEventsAsync(
sourceName, details.StartTime, details.EndTime, maxEvents),
_historianRequestTimeout,
"HistorianDataSource.ReadEventsAsync");
var historyEvent = new HistoryEvent();
foreach (var evt in events)
@@ -2247,6 +2341,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
};
errors[idx] = ServiceResult.Good;
}
catch (TimeoutException ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead events timed out for {NodeId}", nodeIdStr);
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
@@ -2476,13 +2576,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
foreach (var tagRef in tagsToSubscribe)
{
var transferTag = tagRef;
_mxAccessClient.SubscribeAsync(transferTag, (_, _) => { })
.ContinueWith(t => Log.Warning(t.Exception?.InnerException,
"Failed to restore subscription for transferred tag {Tag}", transferTag),
TaskContinuationOptions.OnlyOnFaulted);
}
TrackBackgroundSubscribe(tagRef, "transferred subscription restore");
}
private void OnMxAccessDataChange(string address, Vtq vtq)
@@ -2798,12 +2892,33 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
// client, so the probes close cleanly.
_galaxyRuntimeProbeManager?.Dispose();
StopDispatchThread();
DrainPendingBackgroundSubscribes();
_dataChangeSignal.Dispose();
}
base.Dispose(disposing);
}
/// <summary>
/// Waits up to five seconds for the background subscribe tasks that are registered at the
/// time of the call, so shutdown does not race them. Tasks registered after the snapshot is
/// taken are not waited on.
/// </summary>
/// <remarks>
/// Never throws for task faults: an <see cref="AggregateException"/> from the wait is logged
/// at debug level only, because each fault is already logged by the tracking continuation.
/// A wait that times out is reported as a warning rather than as a successful drain.
/// </remarks>
private void DrainPendingBackgroundSubscribes()
{
    var snapshot = _pendingBackgroundSubscribes.Values.ToArray();
    if (snapshot.Length == 0)
        return;

    var drainTimeout = TimeSpan.FromSeconds(5);
    try
    {
        // WaitAll returns false when the timeout elapses first; only claim a drain when
        // every task in the snapshot actually finished.
        if (Task.WaitAll(snapshot, drainTimeout))
        {
            Log.Information("Drained {Count} pending background subscribe(s) on shutdown", snapshot.Length);
        }
        else
        {
            var stillPending = snapshot.Count(t => !t.IsCompleted);
            Log.Warning(
                "Timed out after {Timeout}s draining background subscribes on shutdown; {Pending} of {Count} still pending",
                drainTimeout.TotalSeconds, stillPending, snapshot.Length);
        }
    }
    catch (AggregateException ex)
    {
        // Individual faults were already logged by the tracked continuation; record the
        // aggregate at debug level to aid diagnosis without double-logging each failure.
        Log.Debug(ex, "Background subscribe drain completed with {FaultCount} fault(s)",
            ex.InnerExceptions.Count);
    }
}
#endregion
}
}

View File

@@ -39,6 +39,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
private readonly bool _runtimeStatusProbesEnabled;
private readonly int _runtimeStatusUnknownTimeoutSeconds;
private readonly int _mxAccessRequestTimeoutSeconds;
private readonly int _historianRequestTimeoutSeconds;
public LmxOpcUaServer(string galaxyName, IMxAccessClient mxAccessClient, PerformanceMetrics metrics,
IHistorianDataSource? historianDataSource = null, bool alarmTrackingEnabled = false,
@@ -46,7 +48,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
RedundancyConfiguration? redundancyConfig = null, string? applicationUri = null,
AlarmObjectFilter? alarmObjectFilter = null,
bool runtimeStatusProbesEnabled = false,
int runtimeStatusUnknownTimeoutSeconds = 15)
int runtimeStatusUnknownTimeoutSeconds = 15,
int mxAccessRequestTimeoutSeconds = 30,
int historianRequestTimeoutSeconds = 60)
{
_galaxyName = galaxyName;
_mxAccessClient = mxAccessClient;
@@ -60,6 +64,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_applicationUri = applicationUri;
_runtimeStatusProbesEnabled = runtimeStatusProbesEnabled;
_runtimeStatusUnknownTimeoutSeconds = runtimeStatusUnknownTimeoutSeconds;
_mxAccessRequestTimeoutSeconds = mxAccessRequestTimeoutSeconds;
_historianRequestTimeoutSeconds = historianRequestTimeoutSeconds;
}
/// <summary>
@@ -97,7 +103,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_historianDataSource, _alarmTrackingEnabled, _authConfig.AnonymousCanWrite,
_writeOperateRoleId, _writeTuneRoleId, _writeConfigureRoleId, _alarmAckRoleId,
_alarmObjectFilter,
_runtimeStatusProbesEnabled, _runtimeStatusUnknownTimeoutSeconds);
_runtimeStatusProbesEnabled, _runtimeStatusUnknownTimeoutSeconds,
_mxAccessRequestTimeoutSeconds, _historianRequestTimeoutSeconds);
var nodeManagers = new List<INodeManager> { NodeManager };
return new MasterNodeManager(server, configuration, null, nodeManagers.ToArray());

View File

@@ -46,7 +46,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
SecurityProfileConfiguration? securityConfig = null,
RedundancyConfiguration? redundancyConfig = null,
AlarmObjectFilter? alarmObjectFilter = null,
MxAccessConfiguration? mxAccessConfig = null)
MxAccessConfiguration? mxAccessConfig = null,
HistorianConfiguration? historianConfig = null)
{
_config = config;
_mxAccessClient = mxAccessClient;
@@ -58,9 +59,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_redundancyConfig = redundancyConfig ?? new RedundancyConfiguration();
_alarmObjectFilter = alarmObjectFilter;
_mxAccessConfig = mxAccessConfig ?? new MxAccessConfiguration();
_historianConfig = historianConfig ?? new HistorianConfiguration();
}
private readonly MxAccessConfiguration _mxAccessConfig;
private readonly HistorianConfiguration _historianConfig;
/// <summary>
/// Gets the active node manager that holds the published Galaxy namespace.
@@ -245,7 +248,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
_config.AlarmTrackingEnabled, _authConfig, _authProvider, _redundancyConfig, applicationUri,
_alarmObjectFilter,
_mxAccessConfig.RuntimeStatusProbesEnabled,
_mxAccessConfig.RuntimeStatusUnknownTimeoutSeconds);
_mxAccessConfig.RuntimeStatusUnknownTimeoutSeconds,
_mxAccessConfig.RequestTimeoutSeconds,
_historianConfig.RequestTimeoutSeconds);
await _application.Start(_server);
Log.Information(

View File

@@ -125,10 +125,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
internal ChangeDetectionService? ChangeDetectionInstance { get; private set; }
/// <summary>
/// Gets the hosted status web server when the dashboard is enabled.
/// Gets the hosted status web server when the dashboard is enabled and successfully bound.
/// Null when <c>Dashboard.Enabled</c> is false or when <see cref="DashboardStartFailed"/> is true.
/// </summary>
internal StatusWebServer? StatusWeb { get; private set; }
/// <summary>
/// Gets a flag indicating that the dashboard was enabled in configuration but failed to bind
/// its HTTP port at startup. The service continues in degraded mode (matching the pattern
/// for other optional subsystems: MxAccess connect, Galaxy DB connect, initial address space
/// build). Surfaced for tests and any external health probe that needs to distinguish
/// "dashboard disabled by config" from "dashboard failed to start".
/// </summary>
/// <remarks>
/// Set to <c>true</c> during service startup when <c>StatusWebServer.Start()</c> returns
/// <c>false</c>; in that case the unstarted server instance is disposed and
/// <see cref="StatusWeb"/> is not assigned.
/// </remarks>
internal bool DashboardStartFailed { get; private set; }
/// <summary>
/// Gets the dashboard report generator used to assemble operator-facing status snapshots.
/// </summary>
@@ -246,7 +256,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
ServerHost = new OpcUaServerHost(_config.OpcUa, effectiveMxClient, Metrics, _historianDataSource,
_config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter,
_config.MxAccess);
_config.MxAccess, _config.Historian);
// Step 9-10: Query hierarchy, start server, build address space
DateTime? initialDeployTime = null;
@@ -304,8 +314,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
if (_config.Dashboard.Enabled)
{
StatusWeb = new StatusWebServer(StatusReportInstance, _config.Dashboard.Port);
StatusWeb.Start();
var dashboardServer = new StatusWebServer(StatusReportInstance, _config.Dashboard.Port);
if (dashboardServer.Start())
{
StatusWeb = dashboardServer;
}
else
{
// Degraded mode: StatusWebServer.Start() already logged the underlying exception.
// Dispose the unstarted instance, null out the reference, and flag the failure so
// tests and health probes can observe it. Service startup continues.
Log.Warning("Status dashboard failed to bind on port {Port}; service continues without dashboard",
_config.Dashboard.Port);
dashboardServer.Dispose();
DashboardStartFailed = true;
}
}
// Wire ServiceLevel updates from MXAccess health changes

View File

@@ -0,0 +1,53 @@
using System;
using System.Threading.Tasks;
namespace ZB.MOM.WW.LmxOpcUa.Host.Utilities
{
    /// <summary>
    ///     Bounded safety wrappers for blocking on async tasks from synchronous OPC UA stack
    ///     callbacks (Read, Write, HistoryRead*, BuildAddressSpace). These are backstops: the
    ///     underlying MxAccess / Historian clients already enforce inner timeouts on the async
    ///     path, but an outer bound is still required so the stack thread cannot be parked
    ///     indefinitely by a hung scheduler, a slow reconnect, or any other non-returning
    ///     async path.
    /// </summary>
    /// <remarks>
    ///     On timeout, the underlying task is NOT cancelled — it keeps running and is abandoned.
    ///     Callers must be comfortable with the fire-and-forget semantics of that abandoned work.
    ///     This is acceptable for the current call sites because MxAccess and Historian clients
    ///     are shared singletons whose background work does not capture request-scoped state.
    ///     If an abandoned task later faults, its exception is observed here so it does not
    ///     surface through <c>TaskScheduler.UnobservedTaskException</c>.
    /// </remarks>
    internal static class SyncOverAsync
    {
        /// <summary>
        ///     Blocks the calling thread until <paramref name="task"/> completes or
        ///     <paramref name="timeout"/> elapses.
        /// </summary>
        /// <param name="task">The task to wait on.</param>
        /// <param name="timeout">Maximum time to block.</param>
        /// <param name="operation">Operation name used in the timeout message.</param>
        /// <exception cref="ArgumentNullException"><paramref name="task"/> is null.</exception>
        /// <exception cref="TimeoutException">The task did not complete within <paramref name="timeout"/>.</exception>
        /// <remarks>
        ///     When the task fails with exactly one inner exception, that exception is rethrown
        ///     directly (with its original stack trace preserved) instead of the wrapping
        ///     <see cref="AggregateException"/>, so callers can write natural catch blocks.
        ///     Aggregates carrying more than one inner exception propagate unchanged.
        /// </remarks>
        public static void WaitSync(Task task, TimeSpan timeout, string operation)
        {
            if (task == null) throw new ArgumentNullException(nameof(task));

            bool completed;
            try
            {
                completed = task.Wait(timeout);
            }
            catch (AggregateException ae) when (ae.InnerExceptions.Count == 1)
            {
                // Rethrow the single inner exception without resetting its stack trace;
                // a plain "throw inner;" would discard the frames where it was raised.
                System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(ae.InnerExceptions[0]).Throw();
                throw; // Unreachable: Throw() never returns. Keeps definite assignment happy.
            }

            if (!completed)
            {
                ObserveAbandonedFault(task);
                throw new TimeoutException($"{operation} exceeded {timeout.TotalSeconds:0.#}s");
            }
        }

        /// <summary>
        ///     Blocks the calling thread until <paramref name="task"/> completes or
        ///     <paramref name="timeout"/> elapses, and returns the task's result.
        /// </summary>
        /// <typeparam name="T">Result type of the task.</typeparam>
        /// <param name="task">The task to wait on.</param>
        /// <param name="timeout">Maximum time to block.</param>
        /// <param name="operation">Operation name used in the timeout message.</param>
        /// <returns>The result produced by <paramref name="task"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="task"/> is null.</exception>
        /// <exception cref="TimeoutException">The task did not complete within <paramref name="timeout"/>.</exception>
        public static T WaitSync<T>(Task<T> task, TimeSpan timeout, string operation)
        {
            // Share the wait/unwrap/timeout logic with the non-generic overload. Once it returns
            // normally the task has run to completion, so reading the result cannot block or throw.
            WaitSync((Task)task, timeout, operation);
            return task.GetAwaiter().GetResult();
        }

        /// <summary>
        ///     Attaches a continuation that reads the exception of a task abandoned after a
        ///     timeout, marking a late fault as observed.
        /// </summary>
        private static void ObserveAbandonedFault(Task task)
        {
            task.ContinueWith(
                t => { var ignored = t.Exception; },
                System.Threading.CancellationToken.None,
                TaskContinuationOptions.OnlyOnFaulted | TaskContinuationOptions.ExecuteSynchronously,
                TaskScheduler.Default);
        }
    }
}