Surface historian plugin and alarm-tracking health in the status dashboard so operators can detect misconfiguration and runtime degradation that previously showed as fully healthy

Wraps the 4 HistoryRead overrides and OnAlarmAcknowledge with PerformanceMetrics.BeginOperation, adds alarm counters to LmxNodeManager, publishes a structured HistorianPluginOutcome from HistorianPluginLoader, and extends HealthCheckService with plugin-load, history-read, and alarm-ack-failure degradation rules.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-12 15:52:03 -04:00
parent 9b42b61eb6
commit c5ed5312a9
10 changed files with 647 additions and 26 deletions

View File

@@ -73,6 +73,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
// Dispatch queue metrics
private long _totalMxChangeEvents;
// Alarm instrumentation counters
// Each counter is bumped with Interlocked.Increment and read back with
// Interlocked.Read through the public property of the same name.
// Incremented each time an InAlarm transition is added to pendingAlarmEvents.
private long _alarmTransitionCount;
// Incremented each time an acknowledged-state transition is added to pendingAckedEvents.
private long _alarmAckEventCount;
// Incremented in the catch block that logs "Failed to write AckMsg" during alarm acknowledge.
private long _alarmAckWriteFailures;
/// <summary>
/// Initializes a new node manager for the Galaxy-backed OPC UA namespace.
/// </summary>
@@ -151,6 +156,47 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
/// </summary>
public double AverageDispatchBatchSize { get; private set; }
/// <summary>
/// Reports whether this node manager has alarm condition tracking switched on.
/// </summary>
public bool AlarmTrackingEnabled
{
    get { return _alarmTrackingEnabled; }
}
/// <summary>
/// Gets how many distinct alarm conditions are being tracked right now, which is the
/// size of the tracked alarm tag collection (one entry per alarm attribute).
/// </summary>
public int AlarmConditionCount
{
    get { return _alarmInAlarmTags.Count; }
}
/// <summary>
/// Gets how many tracked alarms are presently in the InAlarm=true state.
/// </summary>
/// <remarks>
/// The value is recomputed on every read by delegating to <c>CountActiveAlarms</c>.
/// </remarks>
public int ActiveAlarmCount
{
    get { return CountActiveAlarms(); }
}
/// <summary>
/// Gets the running total, since startup, of InAlarm transition events seen by the dispatch loop.
/// </summary>
/// <remarks>The backing counter is read atomically with <c>Interlocked.Read</c>.</remarks>
public long AlarmTransitionCount
{
    get { return Interlocked.Read(ref _alarmTransitionCount); }
}
/// <summary>
/// Gets the running total, since startup, of alarm acknowledgement transition events observed.
/// </summary>
/// <remarks>The backing counter is read atomically with <c>Interlocked.Read</c>.</remarks>
public long AlarmAckEventCount
{
    get { return Interlocked.Read(ref _alarmAckEventCount); }
}
/// <summary>
/// Gets the running total of MXAccess AckMsg writes that failed while an alarm acknowledge was being processed.
/// </summary>
/// <remarks>The backing counter is read atomically with <c>Interlocked.Read</c>.</remarks>
public long AlarmAckWriteFailures
{
    get { return Interlocked.Read(ref _alarmAckWriteFailures); }
}
/// <summary>
/// Tallies the tracked alarms whose <c>LastInAlarm</c> flag is set.
/// </summary>
/// <returns>The number of tracked alarm entries currently flagged as in alarm.</returns>
private int CountActiveAlarms()
{
    // The tracked alarm collection is enumerated while holding Lock.
    lock (Lock)
    {
        var active = 0;
        foreach (var tracked in _alarmInAlarmTags.Values)
        {
            if (!tracked.LastInAlarm)
            {
                continue;
            }

            active++;
        }

        return active;
    }
}
/// <inheritdoc />
public override void CreateAddressSpace(IDictionary<NodeId, IList<IReference>> externalReferences)
{
@@ -421,6 +467,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
if (alarmInfo == null)
return new ServiceResult(StatusCodes.BadNodeIdUnknown);
using var scope = _metrics.BeginOperation("AlarmAcknowledge");
try
{
var ackMessage = comment?.Text ?? "";
@@ -432,6 +479,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
catch (Exception ex)
{
scope.SetSuccess(false);
Interlocked.Increment(ref _alarmAckWriteFailures);
Log.Warning(ex, "Failed to write AckMsg for {Source}", alarmInfo.SourceName);
return new ServiceResult(StatusCodes.BadInternalError);
}
@@ -1522,6 +1571,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
continue;
}
using var historyScope = _metrics.BeginOperation("HistoryReadRaw");
try
{
var maxValues = details.NumValuesPerNode > 0 ? (int)details.NumValuesPerNode : 0;
@@ -1536,6 +1586,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead raw failed for {TagRef}", tagRef);
errors[idx] = new ServiceResult(StatusCodes.BadInternalError);
}
@@ -1598,6 +1649,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
continue;
}
using var historyScope = _metrics.BeginOperation("HistoryReadProcessed");
try
{
var dataValues = _historianDataSource.ReadAggregateAsync(
@@ -1609,6 +1661,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead processed failed for {TagRef}", tagRef);
errors[idx] = new ServiceResult(StatusCodes.BadInternalError);
}
@@ -1648,6 +1701,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
continue;
}
using var historyScope = _metrics.BeginOperation("HistoryReadAtTime");
try
{
var timestamps = new DateTime[details.ReqTimes.Count];
@@ -1669,6 +1723,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead at-time failed for {TagRef}", tagRef);
errors[idx] = new ServiceResult(StatusCodes.BadInternalError);
}
@@ -1714,6 +1769,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
}
using var historyScope = _metrics.BeginOperation("HistoryReadEvents");
try
{
var maxEvents = details.NumValuesPerNode > 0 ? (int)details.NumValuesPerNode : 0;
@@ -1751,6 +1807,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
catch (Exception ex)
{
historyScope.SetSuccess(false);
Log.Warning(ex, "HistoryRead events failed for {NodeId}", nodeIdStr);
errors[idx] = new ServiceResult(StatusCodes.BadInternalError);
}
@@ -2107,7 +2164,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
if (ackedAlarmInfo.LastAcked.HasValue && newAcked == ackedAlarmInfo.LastAcked.Value)
ackedAlarmInfo = null; // No transition → skip
else
{
pendingAckedEvents.Add((ackedAlarmInfo, newAcked));
Interlocked.Increment(ref _alarmAckEventCount);
}
}
}
@@ -2127,6 +2187,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa
}
pendingAlarmEvents.Add((address, alarmInfo, newInAlarm, severity, message));
Interlocked.Increment(ref _alarmTransitionCount);
}
// Apply under Lock so ClearChangeMasks propagates to monitored items.