Stop MxAccess from overwriting Bad quality on stopped-host variables: suppress pending data changes at dispatch, guard cross-host clear from wiping sibling state, and silence the Unknown→Running startup callback so recovering DevPlatform can no longer reset variables that a still-stopped DevAppEngine marked Bad.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-13 23:22:28 -04:00
parent 4b209f64bb
commit 731092595f
4 changed files with 222 additions and 41 deletions

View File

@@ -1,5 +1,7 @@
using System.Collections.Concurrent;
using CliFx.Attributes;
using CliFx.Infrastructure;
using Opc.Ua;
using ZB.MOM.WW.LmxOpcUa.Client.CLI.Helpers;
using ZB.MOM.WW.LmxOpcUa.Client.Shared;
@@ -8,30 +10,31 @@ namespace ZB.MOM.WW.LmxOpcUa.Client.CLI.Commands;
[Command("subscribe", Description = "Monitor a node for value changes")]
public class SubscribeCommand : CommandBase
{
/// <summary>
/// Creates the live-data subscription command used to watch runtime value changes from the terminal.
/// </summary>
/// <param name="factory">The factory that creates the shared client service for the command run.</param>
public SubscribeCommand(IOpcUaClientServiceFactory factory) : base(factory)
{
}
/// <summary>
/// Gets the node whose live value changes should be monitored.
/// </summary>
[CommandOption("node", 'n', Description = "Node ID to monitor", IsRequired = true)]
public string NodeId { get; init; } = default!;
/// <summary>
/// Gets the sampling interval, in milliseconds, for the monitored item.
/// </summary>
[CommandOption("interval", 'i', Description = "Sampling interval in milliseconds")]
public int Interval { get; init; } = 1000;
/// <summary>
/// Connects to the server and streams live data-change notifications for the requested node.
/// </summary>
/// <param name="console">The CLI console used for output and cancellation handling.</param>
[CommandOption("recursive", 'r', Description = "Browse recursively from --node and subscribe to every Variable found")]
public bool Recursive { get; init; }
[CommandOption("max-depth", Description = "Maximum recursion depth when --recursive is set")]
public int MaxDepth { get; init; } = 10;
[CommandOption("quiet", 'q', Description = "Suppress per-update output; only print a final summary on Ctrl+C")]
public bool Quiet { get; init; }
[CommandOption("duration", Description = "Auto-exit after N seconds and print summary (0 = run until Ctrl+C)")]
public int DurationSeconds { get; init; } = 0;
[CommandOption("summary-file", Description = "Write summary to this file path on exit (in addition to stdout)")]
public string? SummaryFile { get; init; }
public override async ValueTask ExecuteAsync(IConsole console)
{
ConfigureLogging();
@@ -41,30 +44,125 @@ public class SubscribeCommand : CommandBase
var ct = console.RegisterCancellationHandler();
(service, _) = await CreateServiceAndConnectAsync(ct);
var nodeId = NodeIdParser.ParseRequired(NodeId);
var rootNodeId = NodeIdParser.ParseRequired(NodeId);
var targets = new List<(NodeId nodeId, string displayPath)>();
if (Recursive)
{
await console.Output.WriteLineAsync($"Browsing subtree of {NodeId} (max depth {MaxDepth})...");
await CollectVariablesAsync(service, rootNodeId, NodeId, MaxDepth, 0, targets, ct);
await console.Output.WriteLineAsync($"Found {targets.Count} variable nodes.");
}
else
{
targets.Add((rootNodeId, NodeId));
}
var lastStatus = new ConcurrentDictionary<string, (StatusCode Status, DateTime LastUpdate, object? Value)>();
var updateCount = new ConcurrentDictionary<string, int>();
var everBad = new ConcurrentDictionary<string, byte>();
var displayNameByNodeId = targets.ToDictionary(t => t.nodeId.ToString(), t => t.displayPath);
service.DataChanged += (_, e) =>
{
var key = e.NodeId.ToString();
lastStatus[key] = (e.Value.StatusCode, DateTime.UtcNow, e.Value.Value);
updateCount.AddOrUpdate(key, 1, (_, v) => v + 1);
if (!StatusCode.IsGood(e.Value.StatusCode))
everBad.TryAdd(key, 0);
if (!Quiet)
{
console.Output.WriteLine(
$"[{e.Value.SourceTimestamp:O}] {e.NodeId} = {e.Value.Value} ({e.Value.StatusCode})");
$"[{e.Value.SourceTimestamp:O}] {displayNameByNodeId.GetValueOrDefault(key, key)} = {e.Value.Value} ({e.Value.StatusCode})");
}
};
await service.SubscribeAsync(nodeId, Interval, ct);
await console.Output.WriteLineAsync(
$"Subscribed to {NodeId} (interval: {Interval}ms). Press Ctrl+C to stop.");
// Wait until cancellation
var subscribed = 0;
foreach (var (nodeId, _) in targets)
{
try
{
await service.SubscribeAsync(nodeId, Interval, ct);
subscribed++;
}
catch (Exception ex)
{
await console.Output.WriteLineAsync($" FAILED to subscribe {nodeId}: {ex.Message}");
}
}
await console.Output.WriteLineAsync(
$"Subscribed to {subscribed}/{targets.Count} nodes (interval: {Interval}ms). Press Ctrl+C to stop and print summary.");
try
{
if (DurationSeconds > 0)
await Task.Delay(TimeSpan.FromSeconds(DurationSeconds), ct);
else
await Task.Delay(Timeout.Infinite, ct);
}
catch (OperationCanceledException)
{
// Expected on Ctrl+C
}
await service.UnsubscribeAsync(nodeId);
await console.Output.WriteLineAsync("Unsubscribed.");
// Summary
var summary = new List<string>();
summary.Add("");
summary.Add("==================== SUMMARY ====================");
var good = new List<string>();
var bad = new List<string>();
var never = new List<string>();
foreach (var (nodeId, display) in targets)
{
var key = nodeId.ToString();
if (!lastStatus.TryGetValue(key, out var entry))
{
never.Add(display);
continue;
}
if (StatusCode.IsGood(entry.Status))
good.Add($"{display} = {entry.Value} ({entry.Status})");
else
bad.Add($"{display} = {entry.Value} ({entry.Status})");
}
var neverWentBad = targets
.Where(t => !everBad.ContainsKey(t.nodeId.ToString()))
.Select(t => t.displayPath)
.ToList();
var didGoBad = targets.Count - neverWentBad.Count;
summary.Add($"Total subscribed: {targets.Count}");
summary.Add($" Ever went BAD during window: {didGoBad}");
summary.Add($" NEVER went bad (suspect): {neverWentBad.Count}");
summary.Add($" Last status GOOD: {good.Count}");
summary.Add($" Last status NOT-GOOD: {bad.Count}");
summary.Add($" No update received at all: {never.Count}");
if (neverWentBad.Count > 0 && neverWentBad.Count < targets.Count)
{
summary.Add("");
summary.Add("--- Nodes that NEVER received a bad-quality update (suspect) ---");
foreach (var line in neverWentBad) summary.Add($" {line}");
}
if (never.Count > 0)
{
summary.Add("");
summary.Add("--- Nodes that never received an update at all ---");
foreach (var line in never) summary.Add($" {line}");
}
foreach (var line in summary) await console.Output.WriteLineAsync(line);
if (!string.IsNullOrEmpty(SummaryFile))
{
try { await File.WriteAllLinesAsync(SummaryFile, summary); }
catch (Exception ex) { await console.Output.WriteLineAsync($"Failed to write summary file: {ex.Message}"); }
}
foreach (var (nodeId, _) in targets)
{
try { await service.UnsubscribeAsync(nodeId); } catch { /* ignore */ }
}
}
finally
{
@@ -75,4 +173,31 @@ public class SubscribeCommand : CommandBase
}
}
}
private static async Task CollectVariablesAsync(
IOpcUaClientService service,
NodeId? parent,
string parentPath,
int maxDepth,
int currentDepth,
List<(NodeId nodeId, string displayPath)> into,
CancellationToken ct)
{
if (currentDepth >= maxDepth) return;
var children = await service.BrowseAsync(parent, ct);
foreach (var child in children)
{
var nodeId = NodeIdParser.Parse(child.NodeId);
if (nodeId is null) continue;
var childPath = $"{parentPath}/{child.DisplayName}";
if (child.NodeClass == "Variable")
{
into.Add((nodeId, childPath));
}
if (child.HasChildren)
{
await CollectVariablesAsync(service, nodeId, childPath, maxDepth, currentDepth + 1, into, ct);
}
}
}
}

View File

@@ -271,12 +271,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess
status.LastError = null;
if (status.State != GalaxyRuntimeState.Running)
{
// Only fire the host-running callback on a true Stopped → Running
// recovery. Unknown → Running happens once at startup for every host
// and is not a recovery — firing ClearHostVariablesBadQuality there
// would wipe Bad status set by the concurrently-stopping other host
// on variables that span both lists.
var wasStopped = status.State == GalaxyRuntimeState.Stopped;
status.State = GalaxyRuntimeState.Running;
status.LastStateChangeTime = now;
if (wasStopped)
{
transitionTo = GalaxyRuntimeState.Running;
fromToGobjectId = status.GobjectId;
}
}
}
else
{
status.FailureCount++;

View File

@@ -108,6 +108,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
private readonly NodeId? _writeOperateRoleId;
private readonly NodeId? _writeTuneRoleId;
private long _dispatchCycleCount;
private long _suppressedUpdatesCount;
private volatile bool _dispatchDisposed;
private volatile bool _dispatchRunning;
private Thread? _dispatchThread;
@@ -790,24 +791,53 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
/// <param name="gobjectId">The runtime host's gobject_id.</param>
public void ClearHostVariablesBadQuality(int gobjectId)
{
List<BaseDataVariableState>? variables;
var clearedCount = 0;
var skippedCount = 0;
lock (Lock)
{
if (!_hostedVariables.TryGetValue(gobjectId, out variables))
return;
var now = DateTime.UtcNow;
foreach (var variable in variables)
// Iterate the full tag → host-list map so we can skip variables whose other
// ancestor hosts are still Stopped. Mass-clearing _hostedVariables[gobjectId]
// would wipe Bad status set by a concurrently-stopped sibling host (e.g.
// recovering DevPlatform must not clear variables that also live under a
// still-stopped DevAppEngine).
foreach (var kv in _hostIdsByTagRef)
{
var hostIds = kv.Value;
if (!hostIds.Contains(gobjectId))
continue;
var anotherStopped = false;
for (var i = 0; i < hostIds.Count; i++)
{
if (hostIds[i] == gobjectId)
continue;
if (_galaxyRuntimeProbeManager != null &&
_galaxyRuntimeProbeManager.IsHostStopped(hostIds[i]))
{
anotherStopped = true;
break;
}
}
if (anotherStopped)
{
skippedCount++;
continue;
}
if (_tagToVariableNode.TryGetValue(kv.Key, out var variable))
{
variable.StatusCode = StatusCodes.Good;
variable.Timestamp = now;
variable.ClearChangeMasks(SystemContext, false);
clearedCount++;
}
}
}
Log.Information(
"Cleared bad-quality override on {Count} variable(s) for recovered host gobject_id={GobjectId}",
variables.Count, gobjectId);
"Cleared bad-quality override on {Count} variable(s) for recovered host gobject_id={GobjectId} (skipped {Skipped} with other stopped ancestors)",
clearedCount, gobjectId, skippedCount);
}
private void SubscribeAlarmTags()
@@ -2554,6 +2584,18 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
if (!_pendingDataChanges.TryRemove(address, out var vtq))
continue;
// Suppress updates for tags whose owning Galaxy runtime host is currently
// Stopped. Without this, MxAccess keeps streaming cached values that would
// overwrite the BadOutOfService set by MarkHostVariablesBadQuality — the
// variables would flicker Bad→Good every dispatch cycle and subscribers
// would see a flood of notifications (the original "client freeze" symptom).
// Dropping at the source also means we do no lock/alarm work for dead data.
if (IsTagUnderStoppedHost(address))
{
Interlocked.Increment(ref _suppressedUpdatesCount);
continue;
}
AlarmInfo? alarmInfo = null;
AlarmInfo? ackedAlarmInfo = null;
var newInAlarm = false;
@@ -2728,6 +2770,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
var batchSize = Interlocked.Read(ref _totalDispatchBatchSize);
var cycles = Interlocked.Read(ref _dispatchCycleCount);
var avgQueueSize = cycles > 0 ? (double)batchSize / cycles : 0;
var suppressed = Interlocked.Exchange(ref _suppressedUpdatesCount, 0);
// Reset rolling counters
Interlocked.Exchange(ref _totalDispatchBatchSize, 0);
@@ -2738,8 +2781,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
AverageDispatchBatchSize = avgQueueSize;
Log.Information(
"DataChange dispatch: EventsPerSec={EventsPerSec:F1}, AvgBatchSize={AvgBatchSize:F1}, PendingItems={Pending}, TotalEvents={Total}",
eventsPerSecond, avgQueueSize, _pendingDataChanges.Count, totalEvents);
"DataChange dispatch: EventsPerSec={EventsPerSec:F1}, AvgBatchSize={AvgBatchSize:F1}, PendingItems={Pending}, TotalEvents={Total}, SuppressedStopped={Suppressed}",
eventsPerSecond, avgQueueSize, _pendingDataChanges.Count, totalEvents, suppressed);
}
/// <inheritdoc />

View File

@@ -58,7 +58,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.MxAccess
entry.GoodUpdateCount.ShouldBe(1);
entry.FailureCount.ShouldBe(0);
entry.LastError.ShouldBeNull();
runSpy.ShouldBe(new[] { 20 });
// Unknown → Running is startup initialization, not a recovery — the onHostRunning
// callback is reserved for Stopped → Running transitions so the node manager does
// not wipe Bad status set by a concurrently-stopping sibling host on the same variable.
runSpy.ShouldBeEmpty();
stopSpy.ShouldBeEmpty();
}
@@ -129,7 +132,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.MxAccess
sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true));
sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true));
runSpy.Count.ShouldBe(1); // only the Unknown → Running call fires the callback
// Unknown → Running is silent; subsequent Running updates are idempotent.
runSpy.ShouldBeEmpty();
sut.GetSnapshot().Single().GoodUpdateCount.ShouldBe(3);
}