Both VirtualTagEngine and ScriptedAlarmEngine share a pattern: the BuildReadCache helper iterates the script's declared input set, reading from _valueCache with a fallback to _upstream.ReadTag. When an upstream tag hasn't yet delivered its first subscription push, ReadTag returns a DataValueSnapshot with a null Value and BadNotConnected quality. User scripts then cast `(double)ctx.GetTag(path).Value` unconditionally and throw NullReferenceException — once per evaluation tick until the cache fills, spamming the log with identical stack traces. The existing catch block recovered (kept the prior state) but didn't silence the churn. Add AreInputsReady(cache) to both engines: return true only when every entry has a non-null Value and a non-Bad StatusCode (Good + Uncertain are both considered ready). Skip script evaluation when the check returns false — the engine holds the prior state (alarm) or the prior snapshot (virtual tag) until upstream delivers. Eliminates the cold-start NRE spam at root without changing the script-engine contract. Also: fix $changeLines.Count in test-galaxy.ps1 — PowerShell's Set-StrictMode -Version 3.0 errors on .Count when Where-Object returns 0 or 1 items. Wrap in `@(...)` to force an array; same pattern the sibling _common.ps1 already uses in Write-Summary for the same reason. Task #112 — the Galaxy live E2E now passes 3/7 stages (probe + source read + reverse-bridge-ACL). The remaining 4 stages (virtual-tag, subscribe-sees-change, alarm-fires, history-read) are deployment-specific: MoveInBatchID is idle in this Galaxy + its AccessLevel blocks writes + it's not historized. Cold-start behaviour is now correct, so once the seed points at a live attribute those stages should light up. Tests: 36/36 VirtualTags.Tests + 47/47 ScriptedAlarms.Tests green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
460 lines
20 KiB
C#
460 lines
20 KiB
C#
using System.Collections.Concurrent;
|
|
using Serilog;
|
|
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
|
using ZB.MOM.WW.OtOpcUa.Core.Scripting;
|
|
|
|
namespace ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms;
|
|
|
|
/// <summary>
/// Phase 7 scripted-alarm orchestrator. Compiles every configured alarm's predicate
/// against the Stream A sandbox, subscribes to the referenced upstream tags,
/// re-evaluates the predicate on every input change, expires timed shelves on a
/// periodic shelving-check timer, applies the resulting transition through
/// <see cref="Part9StateMachine"/>, persists state via <see cref="IAlarmStateStore"/>,
/// and emits the resulting events through <see cref="ScriptedAlarmSource"/> (which
/// wires into the existing <c>IAlarmSource</c> fan-out).
/// </summary>
/// <remarks>
/// <para>
/// Scripted alarms are leaves in the evaluation DAG — no alarm's state drives
/// another alarm's predicate. The engine maintains only an inverse index from
/// upstream tag path → alarms referencing it; no topological sort needed
/// (unlike the virtual-tag engine).
/// </para>
/// <para>
/// Evaluation errors (script throws, timeout, coercion fail) surface as
/// structured errors in the dedicated scripts-*.log sink plus a WARN companion
/// in the main log. The alarm's ActiveState stays at its prior value — the
/// engine does NOT invent a clear transition just because the predicate broke.
/// Operators investigating a broken predicate shouldn't see a phantom
/// clear-event preceding the failure.
/// </para>
/// <para>
/// Every path that mutates alarm state (load, operator actions, upstream-driven
/// re-evaluation, shelving check) runs under a single <see cref="SemaphoreSlim"/>
/// gate. The read-only accessors (<see cref="GetState"/>, <see cref="GetAllStates"/>,
/// <see cref="LoadedAlarmIds"/>) read the alarm table without taking that gate.
/// </para>
/// </remarks>
public sealed class ScriptedAlarmEngine : IDisposable
{
    /// <summary>
    /// OPC UA StatusCode severity: bit 31 set means Bad. Good and Uncertain leave it clear.
    /// </summary>
    private const uint BadSeverityBit = 0x80000000u;

    /// <summary>Due time and period of the timer that expires timed shelves.</summary>
    private static readonly TimeSpan ShelvingCheckInterval = TimeSpan.FromSeconds(5);

    private readonly ITagUpstreamSource _upstream;
    private readonly IAlarmStateStore _store;
    private readonly ScriptLoggerFactory _loggerFactory;
    private readonly ILogger _engineLogger;
    private readonly Func<DateTime> _clock;
    private readonly TimeSpan _scriptTimeout;

    // alarm id -> compiled predicate + current condition. Mutated only under _evalGate.
    private readonly Dictionary<string, AlarmState> _alarms = new(StringComparer.Ordinal);

    // tag path -> last snapshot seen. Written from upstream callbacks, hence concurrent.
    private readonly ConcurrentDictionary<string, DataValueSnapshot> _valueCache
        = new(StringComparer.Ordinal);

    // tag path -> alarm ids
    private readonly Dictionary<string, HashSet<string>> _alarmsReferencing
        = new(StringComparer.Ordinal);

    private readonly List<IDisposable> _upstreamSubscriptions = [];
    private readonly SemaphoreSlim _evalGate = new(1, 1);
    private Timer? _shelvingTimer;
    private bool _loaded;
    private bool _disposed;

    /// <summary>Creates an engine; nothing is compiled or subscribed until <see cref="LoadAsync"/>.</summary>
    /// <param name="upstream">Source of tag reads and change subscriptions.</param>
    /// <param name="store">Persistence for alarm condition state.</param>
    /// <param name="loggerFactory">Creates the per-alarm script logger.</param>
    /// <param name="engineLogger">Logger for engine-level diagnostics.</param>
    /// <param name="clock">Time source; defaults to <see cref="DateTime.UtcNow"/>.</param>
    /// <param name="scriptTimeout">Per-evaluation timeout; defaults to the evaluator's default.</param>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public ScriptedAlarmEngine(
        ITagUpstreamSource upstream,
        IAlarmStateStore store,
        ScriptLoggerFactory loggerFactory,
        ILogger engineLogger,
        Func<DateTime>? clock = null,
        TimeSpan? scriptTimeout = null)
    {
        _upstream = upstream ?? throw new ArgumentNullException(nameof(upstream));
        _store = store ?? throw new ArgumentNullException(nameof(store));
        _loggerFactory = loggerFactory ?? throw new ArgumentNullException(nameof(loggerFactory));
        _engineLogger = engineLogger ?? throw new ArgumentNullException(nameof(engineLogger));
        _clock = clock ?? (() => DateTime.UtcNow);
        _scriptTimeout = scriptTimeout ?? TimedScriptEvaluator<AlarmPredicateContext, bool>.DefaultTimeout;
    }

    /// <summary>Raised for every emission the Part9StateMachine produces that the engine should publish.</summary>
    public event EventHandler<ScriptedAlarmEvent>? OnEvent;

    /// <summary>Ids of the alarms currently held by the engine.</summary>
    public IReadOnlyCollection<string> LoadedAlarmIds => _alarms.Keys;

    /// <summary>
    /// Load a batch of alarm definitions. Compiles every predicate, aggregates any
    /// compile failures into one <see cref="InvalidOperationException"/>, subscribes
    /// to upstream input tags, seeds the value cache, loads persisted state from
    /// the store (falling back to Fresh for first-load alarms), and recomputes
    /// ActiveState per Phase 7 plan decision #14 (startup recovery).
    /// </summary>
    /// <param name="definitions">Alarm definitions to load; replaces any previously loaded set.</param>
    /// <param name="ct">Cancels the gate wait, store access and predicate evaluation.</param>
    /// <exception cref="ObjectDisposedException">The engine has been disposed.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="definitions"/> is null.</exception>
    /// <exception cref="InvalidOperationException">One or more definitions failed to compile.</exception>
    public async Task LoadAsync(IReadOnlyList<ScriptedAlarmDefinition> definitions, CancellationToken ct)
    {
        if (_disposed) throw new ObjectDisposedException(nameof(ScriptedAlarmEngine));
        if (definitions is null) throw new ArgumentNullException(nameof(definitions));

        await _evalGate.WaitAsync(ct).ConfigureAwait(false);
        try
        {
            UnsubscribeFromUpstream();
            _alarms.Clear();
            _alarmsReferencing.Clear();

            var compileFailures = new List<string>();
            foreach (var def in definitions)
            {
                var failure = CompileAndRegister(def);
                if (failure is not null) compileFailures.Add(failure);
            }

            if (compileFailures.Count > 0)
            {
                throw new InvalidOperationException(
                    $"ScriptedAlarmEngine load failed. {compileFailures.Count} alarm(s) did not compile:\n "
                    + string.Join("\n ", compileFailures));
            }

            // Seed the value cache with current upstream values + subscribe for changes.
            foreach (var path in _alarmsReferencing.Keys)
            {
                _valueCache[path] = _upstream.ReadTag(path);
                _upstreamSubscriptions.Add(_upstream.SubscribeTag(path, OnUpstreamChange));
            }

            // Restore persisted state, falling back to Fresh where nothing was saved,
            // then re-derive ActiveState from the current predicate per decision #14.
            // Iterate a snapshot: the loop body overwrites entries of _alarms.
            foreach (var (alarmId, state) in _alarms.ToArray())
            {
                var persisted = await _store.LoadAsync(alarmId, ct).ConfigureAwait(false);
                var seed = persisted ?? state.Condition;
                var afterPredicate = await EvaluatePredicateToStateAsync(state, seed, nowUtc: _clock(), ct)
                    .ConfigureAwait(false);
                _alarms[alarmId] = state with { Condition = afterPredicate };
                await _store.SaveAsync(afterPredicate, ct).ConfigureAwait(false);
            }

            _loaded = true;
            _engineLogger.Information("ScriptedAlarmEngine loaded {Count} alarm(s)", _alarms.Count);

            // Start the shelving-check timer — ticks every 5s, expires any timed shelves
            // that have passed their UnshelveAtUtc. Dispose the timer of a previous load
            // first so a reload replaces it instead of leaking a second ticking timer.
            _shelvingTimer?.Dispose();
            _shelvingTimer = new Timer(_ => RunShelvingCheck(),
                null, ShelvingCheckInterval, ShelvingCheckInterval);
        }
        finally
        {
            _evalGate.Release();
        }
    }

    /// <summary>
    /// Compiles one definition and registers it in the alarm table and the inverse
    /// tag → alarm index. Must be called with the evaluation gate held.
    /// </summary>
    /// <returns>A failure description for the aggregate load error, or null on success.</returns>
    private string? CompileAndRegister(ScriptedAlarmDefinition def)
    {
        try
        {
            var extraction = DependencyExtractor.Extract(def.PredicateScriptSource);
            if (!extraction.IsValid)
            {
                var joined = string.Join("; ", extraction.Rejections.Select(r => r.Message));
                return $"{def.AlarmId}: dependency extraction rejected — {joined}";
            }

            var evaluator = ScriptEvaluator<AlarmPredicateContext, bool>.Compile(def.PredicateScriptSource);
            var timed = new TimedScriptEvaluator<AlarmPredicateContext, bool>(evaluator, _scriptTimeout);
            var logger = _loggerFactory.Create(def.AlarmId);

            // The alarm depends on the tags its predicate reads plus the tags its
            // message template interpolates; both must be subscribed.
            var templateTokens = MessageTemplate.ExtractTokenPaths(def.MessageTemplate);
            var allInputs = new HashSet<string>(extraction.Reads, StringComparer.Ordinal);
            foreach (var token in templateTokens) allInputs.Add(token);

            _alarms[def.AlarmId] = new AlarmState(def, timed, extraction.Reads, templateTokens, logger,
                AlarmConditionState.Fresh(def.AlarmId, _clock()));

            foreach (var path in allInputs)
            {
                if (!_alarmsReferencing.TryGetValue(path, out var set))
                    _alarmsReferencing[path] = set = new HashSet<string>(StringComparer.Ordinal);
                set.Add(def.AlarmId);
            }

            return null;
        }
        catch (Exception ex)
        {
            return $"{def.AlarmId}: {ex.Message}";
        }
    }

    /// <summary>
    /// Current persisted state for <paramref name="alarmId"/>. Returns null for
    /// unknown alarm. Mainly used for diagnostics + the Admin UI status page.
    /// </summary>
    public AlarmConditionState? GetState(string alarmId)
        => _alarms.TryGetValue(alarmId, out var s) ? s.Condition : null;

    /// <summary>Snapshot of the current condition state of every loaded alarm.</summary>
    public IReadOnlyCollection<AlarmConditionState> GetAllStates()
        => _alarms.Values.Select(a => a.Condition).ToArray();

    /// <summary>Applies an operator Acknowledge to <paramref name="alarmId"/>.</summary>
    public Task AcknowledgeAsync(string alarmId, string user, string? comment, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyAcknowledge(cur, user, comment, _clock()));

    /// <summary>Applies an operator Confirm to <paramref name="alarmId"/>.</summary>
    public Task ConfirmAsync(string alarmId, string user, string? comment, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyConfirm(cur, user, comment, _clock()));

    /// <summary>Applies a one-shot shelve to <paramref name="alarmId"/>.</summary>
    public Task OneShotShelveAsync(string alarmId, string user, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyOneShotShelve(cur, user, _clock()));

    /// <summary>Applies a timed shelve to <paramref name="alarmId"/> that lasts until <paramref name="unshelveAtUtc"/>.</summary>
    public Task TimedShelveAsync(string alarmId, string user, DateTime unshelveAtUtc, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyTimedShelve(cur, user, unshelveAtUtc, _clock()));

    /// <summary>Removes any shelving from <paramref name="alarmId"/>.</summary>
    public Task UnshelveAsync(string alarmId, string user, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyUnshelve(cur, user, _clock()));

    /// <summary>Enables <paramref name="alarmId"/>.</summary>
    public Task EnableAsync(string alarmId, string user, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyEnable(cur, user, _clock()));

    /// <summary>Disables <paramref name="alarmId"/>.</summary>
    public Task DisableAsync(string alarmId, string user, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyDisable(cur, user, _clock()));

    /// <summary>Attaches an operator comment to <paramref name="alarmId"/>.</summary>
    public Task AddCommentAsync(string alarmId, string user, string text, CancellationToken ct)
        => ApplyAsync(alarmId, ct, cur => Part9StateMachine.ApplyAddComment(cur, user, text, _clock()));

    /// <summary>
    /// Runs one state-machine operation against an alarm's current condition,
    /// stores and persists the result, and emits the event the transition produced.
    /// </summary>
    /// <exception cref="InvalidOperationException">The engine has not been loaded.</exception>
    /// <exception cref="ArgumentException"><paramref name="alarmId"/> is not a loaded alarm.</exception>
    private async Task ApplyAsync(string alarmId, CancellationToken ct, Func<AlarmConditionState, TransitionResult> op)
    {
        EnsureLoaded();

        await _evalGate.WaitAsync(ct).ConfigureAwait(false);
        try
        {
            // Look the alarm up only once the gate is held. A copy taken before the wait
            // could be stale by the time we run, and the write below would then silently
            // discard a transition another gated path applied in between.
            if (!_alarms.TryGetValue(alarmId, out var state))
                throw new ArgumentException($"Unknown alarm {alarmId}", nameof(alarmId));

            var result = op(state.Condition);
            _alarms[alarmId] = state with { Condition = result.State };
            await _store.SaveAsync(result.State, ct).ConfigureAwait(false);
            if (result.Emission != EmissionKind.None) EmitEvent(state, result.State, result.Emission);
        }
        finally { _evalGate.Release(); }
    }

    /// <summary>
    /// Upstream-change callback. Updates the value cache + enqueues predicate
    /// re-evaluation for every alarm referencing the changed path. Fire-and-forget
    /// so driver-side dispatch isn't blocked.
    /// </summary>
    internal void OnUpstreamChange(string path, DataValueSnapshot value)
    {
        _valueCache[path] = value;
        if (_alarmsReferencing.TryGetValue(path, out var alarmIds))
        {
            // Copy the id set: the re-evaluation runs later, after this callback returns.
            _ = ReevaluateAsync(alarmIds.ToArray(), CancellationToken.None);
        }
    }

    /// <summary>
    /// Re-evaluates the given alarms under the gate and persists any condition that
    /// changed. Never throws: it runs fire-and-forget, so failures are logged instead.
    /// </summary>
    private async Task ReevaluateAsync(IReadOnlyList<string> alarmIds, CancellationToken ct)
    {
        try
        {
            await _evalGate.WaitAsync(ct).ConfigureAwait(false);
            try
            {
                foreach (var id in alarmIds)
                {
                    if (!_alarms.TryGetValue(id, out var state)) continue;

                    var newState = await EvaluatePredicateToStateAsync(
                        state, state.Condition, _clock(), ct).ConfigureAwait(false);

                    // The evaluation returns the seed instance itself when nothing was
                    // applied (inputs not ready, timeout, script error) — skip the save.
                    if (!ReferenceEquals(newState, state.Condition))
                    {
                        _alarms[id] = state with { Condition = newState };
                        await _store.SaveAsync(newState, ct).ConfigureAwait(false);
                    }
                }
            }
            finally { _evalGate.Release(); }
        }
        catch (Exception ex)
        {
            _engineLogger.Error(ex, "ScriptedAlarmEngine reevaluate failed");
        }
    }

    /// <summary>
    /// Evaluate the predicate + apply the resulting state-machine transition.
    /// Returns the new condition state. Emits the appropriate event if the
    /// transition produces one. Returns <paramref name="seed"/> unchanged when the
    /// inputs are not ready, the script times out, or the script throws.
    /// </summary>
    private async Task<AlarmConditionState> EvaluatePredicateToStateAsync(
        AlarmState state, AlarmConditionState seed, DateTime nowUtc, CancellationToken ct)
    {
        var inputs = BuildReadCache(state.Inputs);

        // Cold-start guard — skip the predicate when any referenced upstream tag has no
        // cached value yet (the upstream subscription hasn't delivered its first push).
        // Without this, predicates that cast `(double)ctx.GetTag(path).Value` throw NRE on
        // every tick until the cache fills, spamming the log with identical stack traces.
        // Bad quality is treated the same: the input isn't available at the predicate's
        // expected type, so the only defensible move is to hold the prior condition state.
        if (!AreInputsReady(inputs)) return seed;

        var context = new AlarmPredicateContext(inputs, state.Logger, _clock);

        bool predicateTrue;
        try
        {
            predicateTrue = await state.Evaluator.RunAsync(context, ct).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Cancellation is the caller's decision, not a script failure — let it propagate.
            throw;
        }
        catch (ScriptTimeoutException tex)
        {
            state.Logger.Warning("Alarm predicate timed out after {Timeout} — state unchanged", tex.Timeout);
            return seed;
        }
        catch (Exception ex)
        {
            state.Logger.Error(ex, "Alarm predicate threw — state unchanged");
            return seed;
        }

        var result = Part9StateMachine.ApplyPredicate(seed, predicateTrue, nowUtc);
        if (result.Emission != EmissionKind.None)
            EmitEvent(state, result.State, result.Emission);
        return result.State;
    }

    /// <summary>
    /// Builds the per-evaluation input map: the cached snapshot for each path, or a
    /// direct upstream read when the cache has no entry for it.
    /// </summary>
    private IReadOnlyDictionary<string, DataValueSnapshot> BuildReadCache(IReadOnlySet<string> inputs)
    {
        var d = new Dictionary<string, DataValueSnapshot>(StringComparer.Ordinal);
        foreach (var p in inputs)
            d[p] = _valueCache.TryGetValue(p, out var v) ? v : _upstream.ReadTag(p);
        return d;
    }

    /// <summary>
    /// Returns true when every entry in <paramref name="cache"/> has a non-null value
    /// and a <see cref="DataValueSnapshot.StatusCode"/> that is not Bad — Good and
    /// Uncertain both count as ready. An empty cache is ready. A false here lets
    /// callers short-circuit script evaluation — predicates that unconditionally cast
    /// <c>ctx.GetTag(path).Value</c> to a numeric type would otherwise throw
    /// <see cref="NullReferenceException"/> until the upstream subscription delivers
    /// its first push.
    /// </summary>
    private static bool AreInputsReady(IReadOnlyDictionary<string, DataValueSnapshot> cache)
    {
        foreach (var (_, snapshot) in cache)
        {
            if (snapshot?.Value is null) return false;

            // OPC UA Part 4 StatusCode: bit 31 = severity 10 (Bad). Treat Good + Uncertain
            // as "ready"; Uncertain carries a value the script can still inspect + make a
            // qualified decision from. Only outright Bad is skipped.
            if ((snapshot.StatusCode & BadSeverityBit) != 0) return false;
        }
        return true;
    }

    /// <summary>
    /// Resolves the alarm's message template against the value cache and raises
    /// <see cref="OnEvent"/>. A throwing subscriber is logged, never propagated.
    /// </summary>
    private void EmitEvent(AlarmState state, AlarmConditionState condition, EmissionKind kind)
    {
        // Suppressed kind means shelving ate the emission — we don't fire for subscribers
        // but the state record still advanced so startup recovery reflects reality.
        if (kind == EmissionKind.Suppressed || kind == EmissionKind.None) return;

        var message = MessageTemplate.Resolve(state.Definition.MessageTemplate, TryLookup);
        var evt = new ScriptedAlarmEvent(
            AlarmId: state.Definition.AlarmId,
            EquipmentPath: state.Definition.EquipmentPath,
            AlarmName: state.Definition.AlarmName,
            Kind: state.Definition.Kind,
            Severity: state.Definition.Severity,
            Message: message,
            Condition: condition,
            Emission: kind,
            TimestampUtc: _clock());

        try { OnEvent?.Invoke(this, evt); }
        catch (Exception ex)
        {
            _engineLogger.Warning(ex, "ScriptedAlarmEngine OnEvent subscriber threw for {AlarmId}", state.Definition.AlarmId);
        }
    }

    /// <summary>Cache-only lookup used by message-template resolution; null when the path has no cached value.</summary>
    private DataValueSnapshot? TryLookup(string path)
        => _valueCache.TryGetValue(path, out var v) ? v : null;

    /// <summary>Timer callback: kicks off an asynchronous shelving check unless disposed.</summary>
    private void RunShelvingCheck()
    {
        if (_disposed) return;
        _ = ShelvingCheckAsync(CancellationToken.None);
    }

    /// <summary>
    /// Applies the shelving check to every loaded alarm under the gate, persisting
    /// and emitting for each alarm whose condition changed. Never throws: it runs
    /// fire-and-forget from the timer, so failures are logged instead.
    /// </summary>
    private async Task ShelvingCheckAsync(CancellationToken ct)
    {
        try
        {
            await _evalGate.WaitAsync(ct).ConfigureAwait(false);
            try
            {
                var now = _clock();

                // Snapshot the ids while holding the gate. Enumerating _alarms from the
                // timer thread without it could collide with a concurrent load and throw
                // from the timer callback. The copy also lets the loop overwrite entries.
                foreach (var id in _alarms.Keys.ToArray())
                {
                    if (!_alarms.TryGetValue(id, out var state)) continue;

                    var result = Part9StateMachine.ApplyShelvingCheck(state.Condition, now);
                    if (!ReferenceEquals(result.State, state.Condition))
                    {
                        _alarms[id] = state with { Condition = result.State };
                        await _store.SaveAsync(result.State, ct).ConfigureAwait(false);
                        if (result.Emission != EmissionKind.None)
                            EmitEvent(state, result.State, result.Emission);
                    }
                }
            }
            finally { _evalGate.Release(); }
        }
        catch (Exception ex)
        {
            _engineLogger.Warning(ex, "ScriptedAlarmEngine shelving-check failed");
        }
    }

    /// <summary>
    /// Disposes every upstream subscription. Best-effort: a subscription that throws
    /// on dispose is logged and skipped so the remaining ones are still released.
    /// </summary>
    private void UnsubscribeFromUpstream()
    {
        foreach (var s in _upstreamSubscriptions)
        {
            try { s.Dispose(); }
            catch (Exception ex)
            {
                _engineLogger.Debug(ex, "ScriptedAlarmEngine upstream unsubscribe failed");
            }
        }
        _upstreamSubscriptions.Clear();
    }

    /// <summary>Throws unless a load has completed.</summary>
    /// <exception cref="InvalidOperationException">No load has completed yet.</exception>
    private void EnsureLoaded()
    {
        if (!_loaded) throw new InvalidOperationException(
            "ScriptedAlarmEngine not loaded. Call LoadAsync first.");
    }

    /// <summary>
    /// Stops the shelving timer, drops the upstream subscriptions and clears the
    /// alarm tables. Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;
        _shelvingTimer?.Dispose();
        UnsubscribeFromUpstream();
        _alarms.Clear();
        _alarmsReferencing.Clear();
    }

    /// <summary>
    /// Everything the engine keeps per alarm: its definition, the compiled
    /// (timeout-wrapped) predicate, the tag paths the predicate reads, the tag paths
    /// its message template references, its script logger and its current condition.
    /// </summary>
    private sealed record AlarmState(
        ScriptedAlarmDefinition Definition,
        TimedScriptEvaluator<AlarmPredicateContext, bool> Evaluator,
        IReadOnlySet<string> Inputs,
        IReadOnlyList<string> TemplateTokens,
        ILogger Logger,
        AlarmConditionState Condition);
}
|
|
|
|
/// <summary>
/// A single alarm emission published by the scripted-alarm engine. It is
/// self-contained: downstream consumers (OPC UA alarm-source adapter + historian
/// sink) can publish it without querying the engine again.
/// </summary>
/// <param name="AlarmId">Id of the alarm definition that produced the emission.</param>
/// <param name="EquipmentPath">Equipment path taken from the alarm definition.</param>
/// <param name="AlarmName">Alarm name taken from the alarm definition.</param>
/// <param name="Kind">Alarm kind taken from the alarm definition.</param>
/// <param name="Severity">Severity taken from the alarm definition.</param>
/// <param name="Message">The definition's message template after token resolution.</param>
/// <param name="Condition">Condition state after the transition that caused the emission.</param>
/// <param name="Emission">Which kind of emission the transition produced.</param>
/// <param name="TimestampUtc">Engine clock reading taken when the event was built.</param>
public sealed record ScriptedAlarmEvent(
    string AlarmId,
    string EquipmentPath,
    string AlarmName,
    AlarmKind Kind,
    AlarmSeverity Severity,
    string Message,
    AlarmConditionState Condition,
    EmissionKind Emission,
    DateTime TimestampUtc);
|
|
|
|
/// <summary>
/// Abstraction over the upstream tag source the engine reads from and subscribes
/// to. Deliberately the same shape as the virtual-tag engine's upstream interface
/// so Stream G can compose both behind one driver bridge.
/// </summary>
public interface ITagUpstreamSource
{
    /// <summary>Reads the current snapshot for <paramref name="path"/>.</summary>
    /// <param name="path">Upstream tag path.</param>
    DataValueSnapshot ReadTag(string path);

    /// <summary>
    /// Registers <paramref name="observer"/> for changes of <paramref name="path"/>.
    /// The observer receives the tag path and the new snapshot.
    /// </summary>
    /// <param name="path">Upstream tag path.</param>
    /// <param name="observer">Callback invoked with the path and its new snapshot.</param>
    /// <returns>A handle that ends the subscription when disposed.</returns>
    IDisposable SubscribeTag(string path, Action<string, DataValueSnapshot> observer);
}
|