Close all four stability-review 2026-04-13 findings so a failed runtime probe subscription can no longer leave a phantom entry that Tick() flips to Stopped and fans out false BadOutOfService quality across a host's subtree, a silently-failed dashboard bind no longer lets the service advertise a successful start while an operator-visible endpoint is dead, the seven sync-over-async sites in LmxNodeManager (rebuild probe sync, Read, Write, four HistoryRead overrides) can no longer park the OPC UA stack thread indefinitely on a hung backend, and alarm auto-subscribe + transferred-subscription restore no longer race shutdown as untracked fire-and-forget tasks.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -74,8 +74,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
config.MxAccess.MonitorIntervalSeconds, config.MxAccess.AutoReconnect,
|
||||
config.MxAccess.ProbeTag ?? "(none)", config.MxAccess.ProbeStaleThresholdSeconds);
|
||||
Log.Information(
|
||||
"MxAccess.RuntimeStatusProbesEnabled={Enabled}, RuntimeStatusUnknownTimeoutSeconds={Timeout}s",
|
||||
config.MxAccess.RuntimeStatusProbesEnabled, config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
|
||||
"MxAccess.RuntimeStatusProbesEnabled={Enabled}, RuntimeStatusUnknownTimeoutSeconds={Timeout}s, RequestTimeoutSeconds={RequestTimeout}s",
|
||||
config.MxAccess.RuntimeStatusProbesEnabled, config.MxAccess.RuntimeStatusUnknownTimeoutSeconds,
|
||||
config.MxAccess.RequestTimeoutSeconds);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(config.MxAccess.ClientName))
|
||||
{
|
||||
@@ -88,6 +89,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
"MxAccess.RuntimeStatusUnknownTimeoutSeconds={Timeout} is below the recommended floor of 5s; initial probe resolution may time out before MxAccess has delivered the first callback",
|
||||
config.MxAccess.RuntimeStatusUnknownTimeoutSeconds);
|
||||
|
||||
if (config.MxAccess.RequestTimeoutSeconds < 1)
|
||||
{
|
||||
Log.Error("MxAccess.RequestTimeoutSeconds must be at least 1");
|
||||
valid = false;
|
||||
}
|
||||
else if (config.MxAccess.RequestTimeoutSeconds <
|
||||
Math.Max(config.MxAccess.ReadTimeoutSeconds, config.MxAccess.WriteTimeoutSeconds))
|
||||
{
|
||||
Log.Warning(
|
||||
"MxAccess.RequestTimeoutSeconds={RequestTimeout} is below Read/Write inner timeouts ({Read}s/{Write}s); outer safety bound may fire before the inner client completes its own error path",
|
||||
config.MxAccess.RequestTimeoutSeconds,
|
||||
config.MxAccess.ReadTimeoutSeconds, config.MxAccess.WriteTimeoutSeconds);
|
||||
}
|
||||
|
||||
// Galaxy Repository
|
||||
Log.Information(
|
||||
"GalaxyRepository.ConnectionString={ConnectionString}, ChangeDetectionInterval={ChangeInterval}s, CommandTimeout={CmdTimeout}s, ExtendedAttributes={ExtendedAttributes}",
|
||||
@@ -145,9 +160,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
config.Historian.Enabled, effectiveNodes, config.Historian.IntegratedSecurity,
|
||||
config.Historian.Port);
|
||||
Log.Information(
|
||||
"Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}, FailureCooldownSeconds={Cooldown}",
|
||||
"Historian.CommandTimeoutSeconds={Timeout}, MaxValuesPerRead={MaxValues}, FailureCooldownSeconds={Cooldown}, RequestTimeoutSeconds={RequestTimeout}",
|
||||
config.Historian.CommandTimeoutSeconds, config.Historian.MaxValuesPerRead,
|
||||
config.Historian.FailureCooldownSeconds);
|
||||
config.Historian.FailureCooldownSeconds, config.Historian.RequestTimeoutSeconds);
|
||||
|
||||
if (config.Historian.Enabled)
|
||||
{
|
||||
@@ -163,6 +178,18 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
valid = false;
|
||||
}
|
||||
|
||||
if (config.Historian.RequestTimeoutSeconds < 1)
|
||||
{
|
||||
Log.Error("Historian.RequestTimeoutSeconds must be at least 1");
|
||||
valid = false;
|
||||
}
|
||||
else if (config.Historian.RequestTimeoutSeconds < config.Historian.CommandTimeoutSeconds)
|
||||
{
|
||||
Log.Warning(
|
||||
"Historian.RequestTimeoutSeconds={RequestTimeout} is below CommandTimeoutSeconds={CmdTimeout}; outer safety bound may fire before the inner SDK completes its own error path",
|
||||
config.Historian.RequestTimeoutSeconds, config.Historian.CommandTimeoutSeconds);
|
||||
}
|
||||
|
||||
if (clusterNodes.Count > 0 && !string.IsNullOrWhiteSpace(config.Historian.ServerName)
|
||||
&& config.Historian.ServerName != "localhost")
|
||||
Log.Warning(
|
||||
|
||||
@@ -63,5 +63,14 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
/// </summary>
|
||||
public int MaxValuesPerRead { get; set; } = 10000;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets an outer safety timeout, in seconds, applied to sync-over-async Historian
|
||||
/// operations invoked from the OPC UA stack thread (HistoryReadRaw, HistoryReadProcessed,
|
||||
/// HistoryReadAtTime, HistoryReadEvents). This is a backstop for the case where a
|
||||
/// historian query hangs outside <see cref="CommandTimeoutSeconds"/> — e.g., a slow SDK
|
||||
/// reconnect or mid-failover cluster node. Must be comfortably larger than
|
||||
/// <see cref="CommandTimeoutSeconds"/> so normal operation is never affected. Default 60s.
|
||||
/// </summary>
|
||||
public int RequestTimeoutSeconds { get; set; } = 60;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,6 +30,16 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration
|
||||
/// </summary>
|
||||
public int WriteTimeoutSeconds { get; set; } = 5;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets an outer safety timeout, in seconds, applied to sync-over-async MxAccess
|
||||
/// operations invoked from the OPC UA stack thread (Read, Write, address-space rebuild probe
|
||||
/// sync). This is a backstop for the case where an async path hangs outside the inner
|
||||
/// <see cref="ReadTimeoutSeconds"/> / <see cref="WriteTimeoutSeconds"/> bounds — e.g., a
|
||||
/// slow reconnect or a scheduler stall. Must be comfortably larger than the inner timeouts
|
||||
/// so normal operation is never affected. Default 30s.
|
||||
/// </summary>
|
||||
public int RequestTimeoutSeconds { get; set; } = 30;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the cap on concurrent MXAccess operations so the bridge does not overload the runtime.
|
||||
/// </summary>
|
||||
|
||||
@@ -171,11 +171,13 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
|
||||
}
|
||||
|
||||
// Compute diffs under lock, release lock before issuing SDK calls (which can block).
|
||||
List<string> toSubscribe;
|
||||
// toSubscribe carries the gobject id alongside the probe name so the rollback path on
|
||||
// subscribe failure can unwind both dictionaries without a reverse lookup.
|
||||
List<(int GobjectId, string Probe)> toSubscribe;
|
||||
List<string> toUnsubscribe;
|
||||
lock (_lock)
|
||||
{
|
||||
toSubscribe = new List<string>();
|
||||
toSubscribe = new List<(int, string)>();
|
||||
toUnsubscribe = new List<string>();
|
||||
|
||||
foreach (var kvp in desired)
|
||||
@@ -190,14 +192,14 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
|
||||
_byProbe.Remove(existingProbe);
|
||||
_probeByGobjectId.Remove(kvp.Key);
|
||||
|
||||
toSubscribe.Add(kvp.Value.Probe);
|
||||
toSubscribe.Add((kvp.Key, kvp.Value.Probe));
|
||||
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
|
||||
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
toSubscribe.Add(kvp.Value.Probe);
|
||||
toSubscribe.Add((kvp.Key, kvp.Value.Probe));
|
||||
_byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind);
|
||||
_probeByGobjectId[kvp.Key] = kvp.Value.Probe;
|
||||
}
|
||||
@@ -215,7 +217,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
|
||||
}
|
||||
|
||||
// Apply the diff outside the lock.
|
||||
foreach (var probe in toSubscribe)
|
||||
foreach (var (gobjectId, probe) in toSubscribe)
|
||||
{
|
||||
try
|
||||
{
|
||||
@@ -225,6 +227,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "Failed to advise galaxy runtime probe {Probe}", probe);
|
||||
|
||||
// Roll back the pending entry so Tick() can't later transition a never-advised
|
||||
// probe from Unknown to Stopped and fan out a false-negative host-down signal.
|
||||
// A concurrent SyncAsync may have re-added the same gobject under a new probe
|
||||
// name, so compare against the captured probe string before removing.
|
||||
lock (_lock)
|
||||
{
|
||||
if (_probeByGobjectId.TryGetValue(gobjectId, out var current)
|
||||
&& string.Equals(current, probe, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
_probeByGobjectId.Remove(gobjectId);
|
||||
}
|
||||
_byProbe.Remove(probe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ using ZB.MOM.WW.LmxOpcUa.Host.Domain;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Historian;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Metrics;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.MxAccess;
|
||||
using ZB.MOM.WW.LmxOpcUa.Host.Utilities;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
{
|
||||
@@ -107,6 +108,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
private readonly NodeId? _writeConfigureRoleId;
|
||||
private readonly NodeId? _writeOperateRoleId;
|
||||
private readonly NodeId? _writeTuneRoleId;
|
||||
private readonly TimeSpan _mxAccessRequestTimeout;
|
||||
private readonly TimeSpan _historianRequestTimeout;
|
||||
private long _dispatchCycleCount;
|
||||
private long _suppressedUpdatesCount;
|
||||
private volatile bool _dispatchDisposed;
|
||||
@@ -128,6 +131,13 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
private long _alarmAckEventCount;
|
||||
private long _alarmAckWriteFailures;
|
||||
|
||||
// Background subscribe tracking: every fire-and-forget SubscribeAsync for alarm auto-subscribe
|
||||
// and transferred-subscription restore is registered here so shutdown can drain pending work
|
||||
// with a bounded timeout, and so tests can observe pending count without races.
|
||||
private readonly ConcurrentDictionary<long, Task> _pendingBackgroundSubscribes =
|
||||
new ConcurrentDictionary<long, Task>();
|
||||
private long _backgroundSubscribeCounter;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new node manager for the Galaxy-backed OPC UA namespace.
|
||||
/// </summary>
|
||||
@@ -156,7 +166,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
NodeId? alarmAckRoleId = null,
|
||||
AlarmObjectFilter? alarmObjectFilter = null,
|
||||
bool runtimeStatusProbesEnabled = false,
|
||||
int runtimeStatusUnknownTimeoutSeconds = 15)
|
||||
int runtimeStatusUnknownTimeoutSeconds = 15,
|
||||
int mxAccessRequestTimeoutSeconds = 30,
|
||||
int historianRequestTimeoutSeconds = 60)
|
||||
: base(server, configuration, namespaceUri)
|
||||
{
|
||||
_namespaceUri = namespaceUri;
|
||||
@@ -170,6 +182,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_writeTuneRoleId = writeTuneRoleId;
|
||||
_writeConfigureRoleId = writeConfigureRoleId;
|
||||
_alarmAckRoleId = alarmAckRoleId;
|
||||
_mxAccessRequestTimeout = TimeSpan.FromSeconds(Math.Max(1, mxAccessRequestTimeoutSeconds));
|
||||
_historianRequestTimeout = TimeSpan.FromSeconds(Math.Max(1, historianRequestTimeoutSeconds));
|
||||
|
||||
if (runtimeStatusProbesEnabled)
|
||||
{
|
||||
@@ -569,7 +583,24 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
// Sync the galaxy runtime probe set against the rebuilt hierarchy. This runs
|
||||
// synchronously on the calling thread and issues AdviseSupervisory per host —
|
||||
// expected 500ms-1s additional startup latency for a large multi-host galaxy.
|
||||
_galaxyRuntimeProbeManager?.SyncAsync(hierarchy).GetAwaiter().GetResult();
|
||||
// Bounded by _mxAccessRequestTimeout so a hung probe sync cannot park the address
|
||||
// space rebuild indefinitely; on timeout we log a warning and continue with the
|
||||
// partial probe set (probe sync is advisory, not required for address space correctness).
|
||||
if (_galaxyRuntimeProbeManager != null)
|
||||
{
|
||||
try
|
||||
{
|
||||
SyncOverAsync.WaitSync(
|
||||
_galaxyRuntimeProbeManager.SyncAsync(hierarchy),
|
||||
_mxAccessRequestTimeout,
|
||||
"GalaxyRuntimeProbeManager.SyncAsync");
|
||||
}
|
||||
catch (TimeoutException ex)
|
||||
{
|
||||
Log.Warning(ex, "Runtime probe sync exceeded {Timeout}s; continuing with partial probe set",
|
||||
_mxAccessRequestTimeout.TotalSeconds);
|
||||
}
|
||||
}
|
||||
|
||||
_lastHierarchy = new List<GalaxyObjectInfo>(hierarchy);
|
||||
_lastAttributes = new List<GalaxyAttributeInfo>(attributes);
|
||||
@@ -854,15 +885,40 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
{
|
||||
if (string.IsNullOrEmpty(tag) || !_tagToVariableNode.ContainsKey(tag))
|
||||
continue;
|
||||
var alarmTag = tag;
|
||||
_mxAccessClient.SubscribeAsync(alarmTag, (_, _) => { })
|
||||
.ContinueWith(t => Log.Warning(t.Exception?.InnerException,
|
||||
"Failed to auto-subscribe to alarm tag {Tag}", alarmTag),
|
||||
TaskContinuationOptions.OnlyOnFaulted);
|
||||
TrackBackgroundSubscribe(tag, "alarm auto-subscribe");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
///     Starts a <c>SubscribeAsync</c> call for <paramref name="tag"/> without awaiting it and
///     records the returned task in <c>_pendingBackgroundSubscribes</c> so that
///     <c>DrainPendingBackgroundSubscribes</c> can wait for it, with a bounded timeout, on shutdown.
///     A continuation removes the entry once the task finishes and logs a warning if it faulted.
/// </summary>
/// <param name="tag">Tag reference passed to the MxAccess client's <c>SubscribeAsync</c>.</param>
/// <param name="context">Short label included in the warning that is logged when the subscribe faults.</param>
private void TrackBackgroundSubscribe(string tag, string context)
{
    // Do not start new background work once the dispatch side has been flagged as disposed.
    if (_dispatchDisposed)
        return;

    var id = Interlocked.Increment(ref _backgroundSubscribeCounter);
    var task = _mxAccessClient.SubscribeAsync(tag, (_, _) => { });

    // Register BEFORE attaching the continuation: the continuation may run inline for an
    // already-completed task, and its TryRemove must never execute ahead of this add.
    _pendingBackgroundSubscribes[id] = task;

    // Pin the continuation to the default scheduler instead of the ambient TaskScheduler.Current
    // so the bookkeeping never depends on whichever scheduler the caller happens to run under.
    task.ContinueWith(t =>
    {
        _pendingBackgroundSubscribes.TryRemove(id, out _);
        if (t.IsFaulted)
            Log.Warning(t.Exception?.InnerException, "Background subscribe failed ({Context}) for {Tag}",
                context, tag);
    }, CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default);
}
|
||||
|
||||
/// <summary>
///     Gets how many background subscribe tasks are currently held in the pending set, i.e.
///     registered by <c>TrackBackgroundSubscribe</c> and not yet removed by their completion
///     continuation.
/// </summary>
internal int PendingBackgroundSubscribeCount
{
    get { return _pendingBackgroundSubscribes.Count; }
}
|
||||
|
||||
private ServiceResult OnAlarmAcknowledge(
|
||||
ISystemContext context, ConditionState condition, byte[] eventId, LocalizedText comment)
|
||||
{
|
||||
@@ -1358,11 +1414,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
{
|
||||
if (string.IsNullOrEmpty(tag) || !_tagToVariableNode.ContainsKey(tag))
|
||||
continue;
|
||||
var subtreeAlarmTag = tag;
|
||||
_mxAccessClient.SubscribeAsync(subtreeAlarmTag, (_, _) => { })
|
||||
.ContinueWith(t => Log.Warning(t.Exception?.InnerException,
|
||||
"Failed to subscribe alarm tag in subtree {Tag}", subtreeAlarmTag),
|
||||
TaskContinuationOptions.OnlyOnFaulted);
|
||||
TrackBackgroundSubscribe(tag, "subtree alarm auto-subscribe");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1705,10 +1757,18 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
|
||||
try
|
||||
{
|
||||
var vtq = _mxAccessClient.ReadAsync(tagRef).GetAwaiter().GetResult();
|
||||
var vtq = SyncOverAsync.WaitSync(
|
||||
_mxAccessClient.ReadAsync(tagRef),
|
||||
_mxAccessRequestTimeout,
|
||||
"MxAccessClient.ReadAsync");
|
||||
results[i] = CreatePublishedDataValue(tagRef, vtq);
|
||||
errors[i] = ServiceResult.Good;
|
||||
}
|
||||
catch (TimeoutException ex)
|
||||
{
|
||||
Log.Warning(ex, "Read timed out for {TagRef}", tagRef);
|
||||
errors[i] = new ServiceResult(StatusCodes.BadTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "Read failed for {TagRef}", tagRef);
|
||||
@@ -1779,7 +1839,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
value = updatedArray;
|
||||
}
|
||||
|
||||
var success = _mxAccessClient.WriteAsync(tagRef, value).GetAwaiter().GetResult();
|
||||
var success = SyncOverAsync.WaitSync(
|
||||
_mxAccessClient.WriteAsync(tagRef, value),
|
||||
_mxAccessRequestTimeout,
|
||||
"MxAccessClient.WriteAsync");
|
||||
if (success)
|
||||
{
|
||||
PublishLocalWrite(tagRef, value);
|
||||
@@ -1790,6 +1853,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
errors[i] = new ServiceResult(StatusCodes.BadInternalError);
|
||||
}
|
||||
}
|
||||
catch (TimeoutException ex)
|
||||
{
|
||||
Log.Warning(ex, "Write timed out for {TagRef}", tagRef);
|
||||
errors[i] = new ServiceResult(StatusCodes.BadTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Log.Warning(ex, "Write failed for {TagRef}", tagRef);
|
||||
@@ -2017,15 +2085,23 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
try
|
||||
{
|
||||
var maxValues = details.NumValuesPerNode > 0 ? (int)details.NumValuesPerNode : 0;
|
||||
var dataValues = _historianDataSource.ReadRawAsync(
|
||||
tagRef, details.StartTime, details.EndTime, maxValues)
|
||||
.GetAwaiter().GetResult();
|
||||
var dataValues = SyncOverAsync.WaitSync(
|
||||
_historianDataSource.ReadRawAsync(
|
||||
tagRef, details.StartTime, details.EndTime, maxValues),
|
||||
_historianRequestTimeout,
|
||||
"HistorianDataSource.ReadRawAsync");
|
||||
|
||||
if (details.ReturnBounds)
|
||||
AddBoundingValues(dataValues, details.StartTime, details.EndTime);
|
||||
|
||||
ReturnHistoryPage(dataValues, details.NumValuesPerNode, results, errors, idx);
|
||||
}
|
||||
catch (TimeoutException ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
Log.Warning(ex, "HistoryRead raw timed out for {TagRef}", tagRef);
|
||||
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
@@ -2094,13 +2170,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
using var historyScope = _metrics.BeginOperation("HistoryReadProcessed");
|
||||
try
|
||||
{
|
||||
var dataValues = _historianDataSource.ReadAggregateAsync(
|
||||
var dataValues = SyncOverAsync.WaitSync(
|
||||
_historianDataSource.ReadAggregateAsync(
|
||||
tagRef, details.StartTime, details.EndTime,
|
||||
details.ProcessingInterval, column)
|
||||
.GetAwaiter().GetResult();
|
||||
details.ProcessingInterval, column),
|
||||
_historianRequestTimeout,
|
||||
"HistorianDataSource.ReadAggregateAsync");
|
||||
|
||||
ReturnHistoryPage(dataValues, 0, results, errors, idx);
|
||||
}
|
||||
catch (TimeoutException ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
Log.Warning(ex, "HistoryRead processed timed out for {TagRef}", tagRef);
|
||||
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
@@ -2150,8 +2234,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
for (var i = 0; i < details.ReqTimes.Count; i++)
|
||||
timestamps[i] = details.ReqTimes[i];
|
||||
|
||||
var dataValues = _historianDataSource.ReadAtTimeAsync(tagRef, timestamps)
|
||||
.GetAwaiter().GetResult();
|
||||
var dataValues = SyncOverAsync.WaitSync(
|
||||
_historianDataSource.ReadAtTimeAsync(tagRef, timestamps),
|
||||
_historianRequestTimeout,
|
||||
"HistorianDataSource.ReadAtTimeAsync");
|
||||
|
||||
var historyData = new HistoryData();
|
||||
historyData.DataValues.AddRange(dataValues);
|
||||
@@ -2163,6 +2249,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
};
|
||||
errors[idx] = ServiceResult.Good;
|
||||
}
|
||||
catch (TimeoutException ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
Log.Warning(ex, "HistoryRead at-time timed out for {TagRef}", tagRef);
|
||||
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
@@ -2215,9 +2307,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
try
|
||||
{
|
||||
var maxEvents = details.NumValuesPerNode > 0 ? (int)details.NumValuesPerNode : 0;
|
||||
var events = _historianDataSource.ReadEventsAsync(
|
||||
sourceName, details.StartTime, details.EndTime, maxEvents)
|
||||
.GetAwaiter().GetResult();
|
||||
var events = SyncOverAsync.WaitSync(
|
||||
_historianDataSource.ReadEventsAsync(
|
||||
sourceName, details.StartTime, details.EndTime, maxEvents),
|
||||
_historianRequestTimeout,
|
||||
"HistorianDataSource.ReadEventsAsync");
|
||||
|
||||
var historyEvent = new HistoryEvent();
|
||||
foreach (var evt in events)
|
||||
@@ -2247,6 +2341,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
};
|
||||
errors[idx] = ServiceResult.Good;
|
||||
}
|
||||
catch (TimeoutException ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
Log.Warning(ex, "HistoryRead events timed out for {NodeId}", nodeIdStr);
|
||||
errors[idx] = new ServiceResult(StatusCodes.BadTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
historyScope.SetSuccess(false);
|
||||
@@ -2476,13 +2576,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
}
|
||||
|
||||
foreach (var tagRef in tagsToSubscribe)
|
||||
{
|
||||
var transferTag = tagRef;
|
||||
_mxAccessClient.SubscribeAsync(transferTag, (_, _) => { })
|
||||
.ContinueWith(t => Log.Warning(t.Exception?.InnerException,
|
||||
"Failed to restore subscription for transferred tag {Tag}", transferTag),
|
||||
TaskContinuationOptions.OnlyOnFaulted);
|
||||
}
|
||||
TrackBackgroundSubscribe(tagRef, "transferred subscription restore");
|
||||
}
|
||||
|
||||
private void OnMxAccessDataChange(string address, Vtq vtq)
|
||||
@@ -2798,12 +2892,33 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
// client, so the probes close cleanly.
|
||||
_galaxyRuntimeProbeManager?.Dispose();
|
||||
StopDispatchThread();
|
||||
DrainPendingBackgroundSubscribes();
|
||||
_dataChangeSignal.Dispose();
|
||||
}
|
||||
|
||||
base.Dispose(disposing);
|
||||
}
|
||||
|
||||
/// <summary>
///     Waits, for at most five seconds, for the background subscribe tasks that are still
///     registered in <c>_pendingBackgroundSubscribes</c>. Called from <c>Dispose</c> so that
///     in-flight subscribes get a bounded chance to finish before teardown continues.
/// </summary>
private void DrainPendingBackgroundSubscribes()
{
    // Work on a snapshot: completion continuations remove entries concurrently.
    var snapshot = _pendingBackgroundSubscribes.Values.ToArray();
    if (snapshot.Length == 0)
        return;

    var drainTimeout = TimeSpan.FromSeconds(5);

    try
    {
        // WaitAll returns false when the timeout elapses first; only report a successful
        // drain when every task in the snapshot actually completed.
        if (Task.WaitAll(snapshot, drainTimeout))
        {
            Log.Information("Drained {Count} pending background subscribe(s) on shutdown", snapshot.Length);
        }
        else
        {
            var stillPending = snapshot.Count(t => !t.IsCompleted);
            Log.Warning(
                "Timed out after {Timeout}s draining background subscribes on shutdown; {Pending} of {Count} still in flight",
                drainTimeout.TotalSeconds, stillPending, snapshot.Length);
        }
    }
    catch (AggregateException ex)
    {
        // Individual faults were already logged by the tracked continuation; record the
        // aggregate at debug level to aid diagnosis without double-logging each failure.
        Log.Debug(ex, "Background subscribe drain completed with {FaultCount} fault(s)",
            ex.InnerExceptions.Count);
    }
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
@@ -39,6 +39,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
|
||||
private readonly bool _runtimeStatusProbesEnabled;
|
||||
private readonly int _runtimeStatusUnknownTimeoutSeconds;
|
||||
private readonly int _mxAccessRequestTimeoutSeconds;
|
||||
private readonly int _historianRequestTimeoutSeconds;
|
||||
|
||||
public LmxOpcUaServer(string galaxyName, IMxAccessClient mxAccessClient, PerformanceMetrics metrics,
|
||||
IHistorianDataSource? historianDataSource = null, bool alarmTrackingEnabled = false,
|
||||
@@ -46,7 +48,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
RedundancyConfiguration? redundancyConfig = null, string? applicationUri = null,
|
||||
AlarmObjectFilter? alarmObjectFilter = null,
|
||||
bool runtimeStatusProbesEnabled = false,
|
||||
int runtimeStatusUnknownTimeoutSeconds = 15)
|
||||
int runtimeStatusUnknownTimeoutSeconds = 15,
|
||||
int mxAccessRequestTimeoutSeconds = 30,
|
||||
int historianRequestTimeoutSeconds = 60)
|
||||
{
|
||||
_galaxyName = galaxyName;
|
||||
_mxAccessClient = mxAccessClient;
|
||||
@@ -60,6 +64,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_applicationUri = applicationUri;
|
||||
_runtimeStatusProbesEnabled = runtimeStatusProbesEnabled;
|
||||
_runtimeStatusUnknownTimeoutSeconds = runtimeStatusUnknownTimeoutSeconds;
|
||||
_mxAccessRequestTimeoutSeconds = mxAccessRequestTimeoutSeconds;
|
||||
_historianRequestTimeoutSeconds = historianRequestTimeoutSeconds;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -97,7 +103,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_historianDataSource, _alarmTrackingEnabled, _authConfig.AnonymousCanWrite,
|
||||
_writeOperateRoleId, _writeTuneRoleId, _writeConfigureRoleId, _alarmAckRoleId,
|
||||
_alarmObjectFilter,
|
||||
_runtimeStatusProbesEnabled, _runtimeStatusUnknownTimeoutSeconds);
|
||||
_runtimeStatusProbesEnabled, _runtimeStatusUnknownTimeoutSeconds,
|
||||
_mxAccessRequestTimeoutSeconds, _historianRequestTimeoutSeconds);
|
||||
|
||||
var nodeManagers = new List<INodeManager> { NodeManager };
|
||||
return new MasterNodeManager(server, configuration, null, nodeManagers.ToArray());
|
||||
|
||||
@@ -46,7 +46,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
SecurityProfileConfiguration? securityConfig = null,
|
||||
RedundancyConfiguration? redundancyConfig = null,
|
||||
AlarmObjectFilter? alarmObjectFilter = null,
|
||||
MxAccessConfiguration? mxAccessConfig = null)
|
||||
MxAccessConfiguration? mxAccessConfig = null,
|
||||
HistorianConfiguration? historianConfig = null)
|
||||
{
|
||||
_config = config;
|
||||
_mxAccessClient = mxAccessClient;
|
||||
@@ -58,9 +59,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_redundancyConfig = redundancyConfig ?? new RedundancyConfiguration();
|
||||
_alarmObjectFilter = alarmObjectFilter;
|
||||
_mxAccessConfig = mxAccessConfig ?? new MxAccessConfiguration();
|
||||
_historianConfig = historianConfig ?? new HistorianConfiguration();
|
||||
}
|
||||
|
||||
private readonly MxAccessConfiguration _mxAccessConfig;
|
||||
private readonly HistorianConfiguration _historianConfig;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the active node manager that holds the published Galaxy namespace.
|
||||
@@ -245,7 +248,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
|
||||
_config.AlarmTrackingEnabled, _authConfig, _authProvider, _redundancyConfig, applicationUri,
|
||||
_alarmObjectFilter,
|
||||
_mxAccessConfig.RuntimeStatusProbesEnabled,
|
||||
_mxAccessConfig.RuntimeStatusUnknownTimeoutSeconds);
|
||||
_mxAccessConfig.RuntimeStatusUnknownTimeoutSeconds,
|
||||
_mxAccessConfig.RequestTimeoutSeconds,
|
||||
_historianConfig.RequestTimeoutSeconds);
|
||||
await _application.Start(_server);
|
||||
|
||||
Log.Information(
|
||||
|
||||
@@ -125,10 +125,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
|
||||
internal ChangeDetectionService? ChangeDetectionInstance { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the hosted status web server when the dashboard is enabled.
|
||||
/// Gets the hosted status web server when the dashboard is enabled and successfully bound.
|
||||
/// Null when <c>Dashboard.Enabled</c> is false or when <see cref="DashboardStartFailed"/> is true.
|
||||
/// </summary>
|
||||
internal StatusWebServer? StatusWeb { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets a flag indicating that the dashboard was enabled in configuration but failed to bind
|
||||
/// its HTTP port at startup. The service continues in degraded mode (matching the pattern
|
||||
/// for other optional subsystems: MxAccess connect, Galaxy DB connect, initial address space
|
||||
/// build). Surfaced for tests and any external health probe that needs to distinguish
|
||||
/// "dashboard disabled by config" from "dashboard failed to start".
|
||||
/// </summary>
|
||||
internal bool DashboardStartFailed { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the dashboard report generator used to assemble operator-facing status snapshots.
|
||||
/// </summary>
|
||||
@@ -246,7 +256,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
|
||||
|
||||
ServerHost = new OpcUaServerHost(_config.OpcUa, effectiveMxClient, Metrics, _historianDataSource,
|
||||
_config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter,
|
||||
_config.MxAccess);
|
||||
_config.MxAccess, _config.Historian);
|
||||
|
||||
// Step 9-10: Query hierarchy, start server, build address space
|
||||
DateTime? initialDeployTime = null;
|
||||
@@ -304,8 +314,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host
|
||||
|
||||
if (_config.Dashboard.Enabled)
|
||||
{
|
||||
StatusWeb = new StatusWebServer(StatusReportInstance, _config.Dashboard.Port);
|
||||
StatusWeb.Start();
|
||||
var dashboardServer = new StatusWebServer(StatusReportInstance, _config.Dashboard.Port);
|
||||
if (dashboardServer.Start())
|
||||
{
|
||||
StatusWeb = dashboardServer;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Degraded mode: StatusWebServer.Start() already logged the underlying exception.
|
||||
// Dispose the unstarted instance, null out the reference, and flag the failure so
|
||||
// tests and health probes can observe it. Service startup continues.
|
||||
Log.Warning("Status dashboard failed to bind on port {Port}; service continues without dashboard",
|
||||
_config.Dashboard.Port);
|
||||
dashboardServer.Dispose();
|
||||
DashboardStartFailed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Wire ServiceLevel updates from MXAccess health changes
|
||||
|
||||
53
src/ZB.MOM.WW.LmxOpcUa.Host/Utilities/SyncOverAsync.cs
Normal file
53
src/ZB.MOM.WW.LmxOpcUa.Host/Utilities/SyncOverAsync.cs
Normal file
@@ -0,0 +1,53 @@
|
||||
using System;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace ZB.MOM.WW.LmxOpcUa.Host.Utilities
|
||||
{
|
||||
/// <summary>
|
||||
/// Bounded safety wrappers for blocking on async tasks from synchronous OPC UA stack
|
||||
/// callbacks (Read, Write, HistoryRead*, BuildAddressSpace). These are backstops: the
|
||||
/// underlying MxAccess / Historian clients already enforce inner timeouts on the async
|
||||
/// path, but an outer bound is still required so the stack thread cannot be parked
|
||||
/// indefinitely by a hung scheduler, a slow reconnect, or any other non-returning
|
||||
/// async path.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// On timeout, the underlying task is NOT cancelled — it runs to completion on the
|
||||
/// thread pool and is abandoned. Callers must be comfortable with the fire-forget
|
||||
/// semantics of the background continuation. This is acceptable for the current call
|
||||
/// sites because MxAccess and Historian clients are shared singletons whose background
|
||||
/// work does not capture request-scoped state.
|
||||
/// </remarks>
|
||||
internal static class SyncOverAsync
|
||||
{
|
||||
public static void WaitSync(Task task, TimeSpan timeout, string operation)
|
||||
{
|
||||
if (task == null) throw new ArgumentNullException(nameof(task));
|
||||
try
|
||||
{
|
||||
if (!task.Wait(timeout))
|
||||
throw new TimeoutException($"{operation} exceeded {timeout.TotalSeconds:0.#}s");
|
||||
}
|
||||
catch (AggregateException ae) when (ae.InnerExceptions.Count == 1)
|
||||
{
|
||||
// Unwrap the single inner exception so callers can write natural catch blocks.
|
||||
throw ae.InnerExceptions[0];
|
||||
}
|
||||
}
|
||||
|
||||
public static T WaitSync<T>(Task<T> task, TimeSpan timeout, string operation)
|
||||
{
|
||||
if (task == null) throw new ArgumentNullException(nameof(task));
|
||||
try
|
||||
{
|
||||
if (!task.Wait(timeout))
|
||||
throw new TimeoutException($"{operation} exceeded {timeout.TotalSeconds:0.#}s");
|
||||
return task.Result;
|
||||
}
|
||||
catch (AggregateException ae) when (ae.InnerExceptions.Count == 1)
|
||||
{
|
||||
throw ae.InnerExceptions[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user