diff --git a/code-reviews/Core.ScriptedAlarms/findings.md b/code-reviews/Core.ScriptedAlarms/findings.md index a7f097e..3c0ec70 100644 --- a/code-reviews/Core.ScriptedAlarms/findings.md +++ b/code-reviews/Core.ScriptedAlarms/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-05-22 | | Commit reviewed | `76d35d1` | | Status | Reviewed | -| Open findings | 6 | +| Open findings | 0 | ## Checklist coverage @@ -66,13 +66,13 @@ a category produced nothing rather than leaving it blank. | Severity | Low | | Category | Documentation & comments | | Location | `ScriptedAlarmEngine.cs:343`, `docs/ScriptedAlarms.md:107` | -| Status | Open | +| Status | Resolved | **Description:** `docs/ScriptedAlarms.md` (Composition step 3) and the `OnUpstreamChange` comment ("Fire-and-forget so driver-side dispatch isn't blocked", line 225-226) describe the `OnEvent` emission path as non-blocking / fire-and-forget. In the code, `EmitEvent` invokes `OnEvent?.Invoke(this, evt)` **synchronously while `_evalGate` is held** (called from `EvaluatePredicateToStateAsync` line 305 and `ApplyAsync` line 217, both inside the gate). A slow subscriber blocks the single evaluation gate for all alarms; a subscriber that re-enters the engine (e.g. calls `AcknowledgeAsync`) deadlocks because `_evalGate` is a non-reentrant `SemaphoreSlim(1,1)`. The behaviour is defensible (the historian sink is non-blocking, per the doc), but the comments/doc are misleading about where the work happens and the re-entrancy hazard is undocumented. **Recommendation:** Either move `EmitEvent` outside the `_evalGate` critical section (collect emissions during the locked section and raise them after `Release()`), or document explicitly on `OnEvent` that handlers run under the engine lock, must be fast, and must never call back into the engine. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — split `EmitEvent` into `BuildEmission` (called under the gate to capture a coherent value-cache snapshot for message-template resolution) and `FireEvent` (called after `_evalGate.Release()` so subscribers can re-enter the engine without deadlocking and a slow subscriber no longer blocks concurrent engine operations). Updated `ApplyAsync`, `ReevaluateAsync`, `ShelvingCheckAsync`, and `LoadAsync` (startup-recovery path) to collect emissions in a pending list and flush after the gate is released; added regression tests for both the re-entry path and a white-box gate-acquirable-from-subscriber check. ### Core.ScriptedAlarms-004 @@ -111,13 +111,13 @@ a category produced nothing rather than leaving it blank. | Severity | Low | | Category | Concurrency & thread safety | | Location | `ScriptedAlarmEngine.cs:232`, `ScriptedAlarmEngine.cs:369` | -| Status | Open | +| Status | Resolved | **Description:** `OnUpstreamChange` and `RunShelvingCheck` both launch fire-and-forget tasks (`_ = ReevaluateAsync(...)`, `_ = ShelvingCheckAsync(...)`) with `CancellationToken.None`. There is no tracking of these in-flight tasks, so `Dispose` cannot await them and a server shutdown can race a still-running re-evaluation that writes to the (possibly disposed) store. Combined with finding 005, an upstream push arriving during shutdown produces an unobserved background task touching torn state. **Recommendation:** Track outstanding background tasks (or use a single serialised worker / `Channel`), and link them to a `CancellationTokenSource` that `Dispose` cancels and drains. At minimum, await the in-flight work in `Dispose`. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — added `_inFlight` HashSet + `TrackBackgroundTask(...)` helper to register every fire-and-forget `ReevaluateAsync`/`ShelvingCheckAsync` task, with a sync `ContinueWith` continuation that auto-removes on completion. `Dispose` snapshots the set under its own lock and `Task.WhenAll(...).GetAwaiter().GetResult()` drains them before returning; `OnUpstreamChange` also short-circuits when `_disposed` is set so no new work is queued during shutdown. Regression test exercises the slow-store path: Dispose blocks until the in-flight `SaveAsync` completes. ### Core.ScriptedAlarms-007 @@ -141,13 +141,13 @@ a category produced nothing rather than leaving it blank. | Severity | Low | | Category | Performance & resource management | | Location | `Part9StateMachine.cs:261-268` | -| Status | Open | +| Status | Resolved | **Description:** `AppendComment` copies the entire existing comment list into a new `List` on every audit-producing transition (ack, confirm, shelve, unshelve, enable, disable, add-comment, auto-unshelve). The `Comments` list is append-only and unbounded — for a long-lived alarm that is acknowledged/commented hundreds of times, every transition is an O(n) copy and the full history is also re-serialised to the store on every `SaveAsync`. Over a multi-month uptime this is a slowly growing per-transition cost. **Recommendation:** Acceptable for now given audit requirements, but consider an immutable persistent list / `ImmutableList` to make append O(log n), or have the store persist comments incrementally (append-only audit table) rather than rewriting the whole collection each save. At minimum, note the unbounded-growth characteristic in the design doc. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — switched `AlarmConditionState.Comments` from `IReadOnlyList` to `ImmutableList` and rewrote `AppendComment` as `existing.Add(...)` so each append is O(log n) instead of the prior O(n) copy. `ImmutableList` still implements `IReadOnlyList` so existing consumers compile unchanged; the persistence layer continues to store comments as JSON so wire-format is unaffected. Regression test asserts the runtime type is `ImmutableList`. ### Core.ScriptedAlarms-009 @@ -156,13 +156,13 @@ a category produced nothing rather than leaving it blank. | Severity | Low | | Category | Performance & resource management | | Location | `ScriptedAlarmEngine.cs:309-315`, `ScriptedAlarmEngine.cs:271` | -| Status | Open | +| Status | Won't Fix | **Description:** `BuildReadCache` allocates a fresh `Dictionary` on every predicate evaluation, i.e. on every upstream tag change for every referencing alarm. On a busy line where many tags feeding many alarms change frequently, this is a steady stream of short-lived dictionary allocations on the hot path. `AlarmPredicateContext` is also newly constructed each evaluation (line 281). **Recommendation:** Minor. If the evaluation path shows up in allocation profiling, the read cache could be a reused per-alarm buffer cleared between evaluations (evaluations are already serialised under `_evalGate`, so a single shared scratch dictionary is safe). Not worth doing speculatively — flag for the perf surface in `docs/v2/Galaxy.Performance.md` if alarm evaluation is ever soak-tested. -**Resolution:** _(open)_ +**Resolution:** Won't Fix 2026-05-23 — per the recommendation, no code change. Documented the known allocation characteristic in `docs/v2/Galaxy.Performance.md` (new "Scripted-alarm engine — known hot-path allocations" section) so a future soak that surfaces pressure has a noted mitigation (reused per-alarm scratch buffer) and we don't re-find this in a later review. ### Core.ScriptedAlarms-010 @@ -171,13 +171,13 @@ a category produced nothing rather than leaving it blank. | Severity | Low | | Category | Design-document adherence | | Location | `ScriptedAlarmEngine.cs:325-336`, `AlarmPredicateContext.cs:33-40`, `MessageTemplate.cs:47` | -| Status | Open | +| Status | Resolved | **Description:** Quality handling is inconsistent across the three places that inspect a `DataValueSnapshot.StatusCode`. `AreInputsReady` (engine, line 333) treats only outright Bad (bit 31) as not-ready, so an Uncertain-quality input is fed to the predicate. `MessageTemplate.Resolve` (line 47) rejects *any* non-zero status code — including Uncertain — and renders `{?}`. `AlarmPredicateContext.GetTag` returns `BadNodeIdUnknown` (`0x80340000`) for a missing path. The net effect: an Uncertain-quality tag is considered good enough to drive an alarm *activation* decision but not good enough to print in the alarm *message*. `docs/ScriptedAlarms.md` ("Fallback rules") only documents the message-template behaviour and does not mention that predicate evaluation accepts Uncertain. The two policies should be reconciled and documented. **Recommendation:** Decide one quality policy for "is this input usable" and apply it in both `AreInputsReady` and the message resolver, or explicitly document why predicate evaluation and message rendering treat Uncertain differently. Add the predicate-side Uncertain rule to `docs/ScriptedAlarms.md`. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — documented the deliberate asymmetry. Added an "Input-quality policy" section to `docs/ScriptedAlarms.md` (table contrasting `AreInputsReady`'s Bad-only rejection with `MessageTemplate.Resolve`'s Good-only acceptance, plus the rationale) and a cross-referencing remarks block on `MessageTemplate.Resolve`. The two policies are kept distinct on purpose: predicate evaluation accepts Uncertain because the value is still inspectable, while the operator-facing message must render `{?}` to make the qualifier visible. Regression test locks in both behaviours with a single Uncertain-quality input that activates the alarm and surfaces `{?}` in the emission message. ### Core.ScriptedAlarms-011 @@ -186,13 +186,13 @@ a category produced nothing rather than leaving it blank. | Severity | Low | | Category | Code organization & conventions | | Location | `Part9StateMachine.cs:275` | -| Status | Open | +| Status | Resolved | **Description:** `TransitionResult.NoOp(state, reason)` takes a `reason` string parameter that is documented in the calling code as a diagnostic ("disabled — predicate result ignored", "already acknowledged", etc.) but the factory method silently discards it — it just returns `new(state, EmissionKind.None)`, identical to `None(state)`. Every call site that passes a carefully-worded reason string is doing dead work, and the comments in `Part9StateMachine` and the class-level remarks claim disabled/no-op transitions "produce ... a diagnostic log line", which they do not. **Recommendation:** Either propagate the reason (add it to `TransitionResult` and have the engine log it at debug level when emission is `None` for a no-op), or remove the unused `reason` parameter and collapse `NoOp` into `None`. Update the `Part9StateMachine` remarks that promise a diagnostic log line. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-23 — added a nullable `NoOpReason` property to `TransitionResult` (defaulted on the primary constructor so existing positional `new TransitionResult(state, kind)` call sites remain valid) and propagated it from `TransitionResult.NoOp(state, reason)`. `ScriptedAlarmEngine.ApplyAsync` now logs the reason at debug level via the alarm's script logger when the transition is a no-op, fulfilling the class-level remarks. Two regression tests assert that `NoOp` carries the reason and `None` does not. ### Core.ScriptedAlarms-012 diff --git a/docs/ScriptedAlarms.md b/docs/ScriptedAlarms.md index 63fe2d1..0a2bea9 100644 --- a/docs/ScriptedAlarms.md +++ b/docs/ScriptedAlarms.md @@ -35,7 +35,7 @@ new ScriptedAlarmDefinition( ## Predicate evaluation -Alarm predicates reuse the same Roslyn sandbox as virtual tags — `ScriptEvaluator` compiles the source, `TimedScriptEvaluator` wraps it with the configured timeout (default from `TimedScriptEvaluator.DefaultTimeout`), and `DependencyExtractor` statically harvests the tag paths the script reads. The sandbox rules (forbidden types, cancellation, logging sinks) are documented in [VirtualTags.md](VirtualTags.md); ScriptedAlarms does not redefine them. The known memory / CPU resource limits are documented there as well. +Alarm predicates reuse the same Roslyn sandbox as virtual tags — `ScriptEvaluator` compiles the source, `TimedScriptEvaluator` wraps it with the configured timeout (default from `TimedScriptEvaluator.DefaultTimeout`), and `DependencyExtractor` statically harvests the tag paths the script reads. The sandbox rules (forbidden types, cancellation, logging sinks) are documented in [VirtualTags.md](VirtualTags.md); ScriptedAlarms does not redefine them. The known resource limits — unbounded script-side memory, the per-publish accretion of dynamically-emitted script assemblies (Core.Scripting-008), and the orphan-thread CPU-budget caveat — are documented in that file as well. `AlarmPredicateContext` (`AlarmPredicateContext.cs`) is the script's `ScriptContext` subclass: @@ -79,6 +79,17 @@ Two invariants the machine enforces: Fallback rules: a resolved `DataValueSnapshot` with a non-zero `StatusCode`, a `null` `Value`, or an unknown path becomes `{?}`. The event still fires — the operator sees where the reference broke rather than having the alarm swallowed. +## Input-quality policy + +Predicate evaluation and message-template resolution deliberately treat tag-input quality differently: + +| Surface | Quality bar | Rationale | +|---|---|---| +| `ScriptedAlarmEngine.AreInputsReady` (predicate gate) | **Bad rejected** (`StatusCode` bit 31 set). `Good` and `Uncertain` are both accepted. | Uncertain quality still carries a value the predicate can inspect; rejecting it would mask a transitional alarm condition. Predicate evaluation is a state-machine input — operators want it to track reality as closely as the quality allows. | +| `MessageTemplate.Resolve` (operator-facing message) | **Any non-zero `StatusCode` rejected** — only `Good` substitutes; `Uncertain` / Bad / unknown all render as `{?}`. | The message is a human-readable signal; substituting an Uncertain value would let operators act on a questionable reading without seeing the qualifier. Rendering `{?}` makes the doubt explicit. | + +`AlarmPredicateContext.GetTag` returns a `BadNodeIdUnknown` (`0x80340000`) snapshot for missing or empty paths, so a typo in the predicate flows through `AreInputsReady` (Bad → predicate skipped, prior state held) and `MessageTemplate.Resolve` (non-Good → `{?}`) without crashing the engine. (Core.ScriptedAlarms-010) + ## State persistence `IAlarmStateStore` (`IAlarmStateStore.cs`) is the persistence contract: `LoadAsync(alarmId)`, `LoadAllAsync`, `SaveAsync(state)`, `RemoveAsync(alarmId)`. `InMemoryAlarmStateStore` in the same file is the default for tests and dev deployments without a SQL backend. Stream E wires the production implementation against the `ScriptedAlarmState` config-DB table with audit logging through `Core.Abstractions.IAuditLogger`. diff --git a/docs/v2/Galaxy.Performance.md b/docs/v2/Galaxy.Performance.md index 33ef145..01bbb3e 100644 --- a/docs/v2/Galaxy.Performance.md +++ b/docs/v2/Galaxy.Performance.md @@ -150,3 +150,9 @@ substantive driver change, and revise this table when the data does. leak guard. Likely culprits: lingering subscription handles in `SubscriptionRegistry`, or a downstream consumer retaining `DataValueSnapshot` references past their useful life. + +## Scripted-alarm engine — known hot-path allocations + +`ScriptedAlarmEngine.BuildReadCache` allocates a fresh `Dictionary` and `AlarmPredicateContext` on every predicate evaluation — i.e. once per upstream tag change per referencing alarm. On a busy line where many tags feeding many alarms change frequently, this is a steady stream of short-lived dictionary allocations on the hot path. (Core.ScriptedAlarms-009) + +The allocations are deliberate for now: predicate evaluation is already serialised under `_evalGate`, so a single reused scratch dictionary would be safe, but the per-call dictionary keeps the evaluation surface immutable and trivially safe against future refactors. If a future scripted-alarm soak surfaces allocation pressure on this path, the mitigation is a per-alarm scratch buffer cleared between evaluations — note here before changing the engine. diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/AlarmConditionState.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/AlarmConditionState.cs index 6d466b9..241e387 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/AlarmConditionState.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/AlarmConditionState.cs @@ -1,3 +1,5 @@ +using System.Collections.Immutable; + namespace ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms; /// @@ -17,7 +19,10 @@ namespace ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms; /// /// is append-only; comments + ack/confirm user identities /// are the audit surface regulators consume. The engine never rewrites past -/// entries. +/// entries. The runtime type is so +/// each append is O(log n) rather than the O(n) copy a plain +/// IReadOnlyList<AlarmComment> would force on every audit-producing +/// transition. (Core.ScriptedAlarms-008) /// /// public sealed record AlarmConditionState( @@ -36,7 +41,7 @@ public sealed record AlarmConditionState( DateTime? LastConfirmUtc, string? LastConfirmUser, string? LastConfirmComment, - IReadOnlyList Comments) + ImmutableList Comments) { /// Initial-load state for a newly registered alarm — everything in the "no-event" position. public static AlarmConditionState Fresh(string alarmId, DateTime nowUtc) => new( @@ -55,7 +60,7 @@ public sealed record AlarmConditionState( LastConfirmUtc: null, LastConfirmUser: null, LastConfirmComment: null, - Comments: []); + Comments: ImmutableList.Empty); } /// diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/MessageTemplate.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/MessageTemplate.cs index fff97a7..6083b9b 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/MessageTemplate.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/MessageTemplate.cs @@ -33,6 +33,16 @@ public static class MessageTemplate /// has a non-Good or a null /// resolve to {?}. /// + /// + /// Quality bar is intentionally stricter than predicate evaluation: + /// only Good (StatusCode == 0) is substituted; Uncertain renders as + /// {?}. The predicate gate (ScriptedAlarmEngine.AreInputsReady) + /// accepts Uncertain because it still carries a value the predicate can + /// inspect, but the operator-facing message must make doubt explicit rather + /// than substituting a value an operator might act on. See the + /// "Input-quality policy" section in docs/ScriptedAlarms.md. + /// (Core.ScriptedAlarms-010) + /// public static string Resolve(string template, Func resolveTag) { if (string.IsNullOrEmpty(template)) return template ?? string.Empty; diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/Part9StateMachine.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/Part9StateMachine.cs index b27abcb..886d6a5 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/Part9StateMachine.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/Part9StateMachine.cs @@ -1,3 +1,5 @@ +using System.Collections.Immutable; + namespace ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms; /// @@ -258,21 +260,33 @@ public static class Part9StateMachine return s.UnshelveAtUtc is DateTime t && nowUtc >= t ? ShelvingState.Unshelved : s; } - private static IReadOnlyList AppendComment( - IReadOnlyList existing, DateTime ts, string user, string kind, string? text) - { - var list = new List(existing.Count + 1); - list.AddRange(existing); - list.Add(new AlarmComment(ts, user, kind, text ?? string.Empty)); - return list; - } + private static ImmutableList AppendComment( + ImmutableList existing, DateTime ts, string user, string kind, string? text) + => existing.Add(new AlarmComment(ts, user, kind, text ?? string.Empty)); } /// Result of a state-machine operation — new state + what to emit (if anything). -public sealed record TransitionResult(AlarmConditionState State, EmissionKind Emission) +/// +/// +/// carries a short diagnostic string for the +/// case (e.g. +/// "disabled — predicate result ignored", "already acknowledged"). The +/// engine logs this at debug level when a no-op result is observed, so +/// the class-level remarks on hold: +/// disabled-alarm and idempotent ack/confirm/shelve/unshelve +/// transitions do produce a diagnostic log line. Plain +/// results (state unchanged, +/// no operator intent recorded — e.g. a predicate re-evaluation that +/// confirms the existing active state) leave +/// null because there is nothing to surface to an operator. +/// (Core.ScriptedAlarms-011) +/// +/// +public sealed record TransitionResult(AlarmConditionState State, EmissionKind Emission, string? NoOpReason = null) { public static TransitionResult None(AlarmConditionState state) => new(state, EmissionKind.None); - public static TransitionResult NoOp(AlarmConditionState state, string reason) => new(state, EmissionKind.None); + public static TransitionResult NoOp(AlarmConditionState state, string reason) + => new(state, EmissionKind.None, reason); } /// What kind of event, if any, the engine should emit after a transition. diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs index 9689f8d..32fa082 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs @@ -59,6 +59,15 @@ public sealed class ScriptedAlarmEngine : IDisposable private bool _loaded; private bool _disposed; + // Tracks fire-and-forget background work launched by OnUpstreamChange + // (ReevaluateAsync) and RunShelvingCheck (ShelvingCheckAsync). Dispose drains + // these so a re-evaluation in flight when shutdown begins finishes its + // SaveAsync before the engine returns control to the caller. The HashSet is + // accessed under its own lock — never under _evalGate — so registration / + // unregistration cannot deadlock against the gate. (Core.ScriptedAlarms-006) + private readonly HashSet _inFlight = []; + private readonly object _inFlightLock = new(); + public ScriptedAlarmEngine( ITagUpstreamSource upstream, IAlarmStateStore store, @@ -92,6 +101,7 @@ public sealed class ScriptedAlarmEngine : IDisposable if (_disposed) throw new ObjectDisposedException(nameof(ScriptedAlarmEngine)); if (definitions is null) throw new ArgumentNullException(nameof(definitions)); + var pending = new List(0); await _evalGate.WaitAsync(ct).ConfigureAwait(false); try { @@ -157,11 +167,14 @@ public sealed class ScriptedAlarmEngine : IDisposable // Restore persisted state, falling back to Fresh where nothing was saved, // then re-derive ActiveState from the current predicate per decision #14. + // Any predicate emissions queue into `pending` and fire after the gate + // is released — so a startup-recovery activation event can call back into + // the engine without deadlocking. (Core.ScriptedAlarms-003) foreach (var (alarmId, state) in _alarms) { var persisted = await _store.LoadAsync(alarmId, ct).ConfigureAwait(false); var seed = persisted ?? state.Condition; - var afterPredicate = await EvaluatePredicateToStateAsync(state, seed, nowUtc: _clock(), ct) + var afterPredicate = await EvaluatePredicateToStateAsync(state, seed, nowUtc: _clock(), ct, pending) .ConfigureAwait(false); _alarms[alarmId] = state with { Condition = afterPredicate }; await _store.SaveAsync(afterPredicate, ct).ConfigureAwait(false); @@ -192,6 +205,10 @@ public sealed class ScriptedAlarmEngine : IDisposable { _evalGate.Release(); } + + // Fire any emissions collected during startup recovery OUTSIDE the gate so + // subscribers can re-enter the engine safely. (Core.ScriptedAlarms-003) + foreach (var evt in pending) FireEvent(evt); } /// @@ -234,6 +251,7 @@ public sealed class ScriptedAlarmEngine : IDisposable if (!_alarms.TryGetValue(alarmId, out var state)) throw new ArgumentException($"Unknown alarm {alarmId}", nameof(alarmId)); + ScriptedAlarmEvent? pending = null; await _evalGate.WaitAsync(ct).ConfigureAwait(false); try { @@ -244,27 +262,50 @@ public sealed class ScriptedAlarmEngine : IDisposable // the exception propagates to the caller. (Core.ScriptedAlarms-007) await _store.SaveAsync(result.State, ct).ConfigureAwait(false); _alarms[alarmId] = state with { Condition = result.State }; - if (result.Emission != EmissionKind.None) EmitEvent(state, result.State, result.Emission); + // Build the emission event under the gate (it captures a coherent + // snapshot of state + message-template values) but defer the actual + // OnEvent dispatch until after Release() so a slow subscriber or a + // subscriber that re-enters the engine doesn't block / deadlock. + // (Core.ScriptedAlarms-003) + if (result.Emission != EmissionKind.None) + pending = BuildEmission(state, result.State, result.Emission); + else if (result.NoOpReason is { } reason) + { + // The Part9StateMachine remarks promise a diagnostic log line for + // disabled-alarm no-ops + idempotent ack/confirm/shelve/unshelve + // calls. We surface them at debug so they're available when + // investigating "why didn't my ack take effect?" without spamming + // the main info log. (Core.ScriptedAlarms-011) + state.Logger.Debug("Alarm {AlarmId} no-op transition: {Reason}", alarmId, reason); + } } finally { _evalGate.Release(); } + + // OnEvent dispatch happens OUTSIDE _evalGate so subscribers can call back + // into the engine (e.g. AcknowledgeAsync from inside an Activated handler) + // without deadlocking against the non-reentrant SemaphoreSlim. + if (pending is not null) FireEvent(pending); } /// /// Upstream-change callback. Updates the value cache + enqueues predicate /// re-evaluation for every alarm referencing the changed path. Fire-and-forget - /// so driver-side dispatch isn't blocked. + /// so driver-side dispatch isn't blocked; the background task is tracked so + /// can drain it. (Core.ScriptedAlarms-006) /// internal void OnUpstreamChange(string path, DataValueSnapshot value) { _valueCache[path] = value; + if (_disposed) return; // don't queue new work against a disposing engine if (_alarmsReferencing.TryGetValue(path, out var alarmIds)) { - _ = ReevaluateAsync(alarmIds.ToArray(), CancellationToken.None); + TrackBackgroundTask(ReevaluateAsync(alarmIds.ToArray(), CancellationToken.None)); } } private async Task ReevaluateAsync(IReadOnlyList alarmIds, CancellationToken ct) { + var pending = new List(0); try { await _evalGate.WaitAsync(ct).ConfigureAwait(false); @@ -280,7 +321,7 @@ public sealed class ScriptedAlarmEngine : IDisposable { if (!_alarms.TryGetValue(id, out var state)) continue; var newState = await EvaluatePredicateToStateAsync( - state, state.Condition, _clock(), ct).ConfigureAwait(false); + state, state.Condition, _clock(), ct, pending).ConfigureAwait(false); if (!ReferenceEquals(newState, state.Condition)) { // Persist before updating in-memory so a store failure leaves @@ -295,16 +336,23 @@ public sealed class ScriptedAlarmEngine : IDisposable catch (Exception ex) { _engineLogger.Error(ex, "ScriptedAlarmEngine reevaluate failed"); + return; } + // Fire emissions OUTSIDE _evalGate so subscriber callbacks can re-enter + // the engine without deadlocking. (Core.ScriptedAlarms-003) + foreach (var evt in pending) FireEvent(evt); } /// /// Evaluate the predicate + apply the resulting state-machine transition. - /// Returns the new condition state. Emits the appropriate event if the - /// transition produces one. + /// Returns the new condition state. If the transition produces an emission, + /// appends it to so the caller can fire + /// them after releasing _evalGate — keeping subscriber callbacks + /// outside the gate. (Core.ScriptedAlarms-003) /// private async Task EvaluatePredicateToStateAsync( - AlarmState state, AlarmConditionState seed, DateTime nowUtc, CancellationToken ct) + AlarmState state, AlarmConditionState seed, DateTime nowUtc, CancellationToken ct, + List? pendingEmissions = null) { var inputs = BuildReadCache(state.Inputs); @@ -340,7 +388,14 @@ public sealed class ScriptedAlarmEngine : IDisposable var result = Part9StateMachine.ApplyPredicate(seed, predicateTrue, nowUtc); if (result.Emission != EmissionKind.None) - EmitEvent(state, result.State, result.Emission); + { + var evt = BuildEmission(state, result.State, result.Emission); + if (evt is not null) + { + if (pendingEmissions is not null) pendingEmissions.Add(evt); + else FireEvent(evt); // LoadAsync path: no caller-supplied list, fire here. + } + } return result.State; } @@ -373,14 +428,24 @@ public sealed class ScriptedAlarmEngine : IDisposable return true; } - private void EmitEvent(AlarmState state, AlarmConditionState condition, EmissionKind kind) + /// + /// Build (but do not fire) the for a + /// transition. Returns null for kinds that should not be published + /// ( and + /// ). Pure construction — called under + /// _evalGate so the message-template resolution uses a coherent + /// value-cache snapshot. The actual dispatch is + /// done by AFTER the gate is + /// released. (Core.ScriptedAlarms-003) + /// + private ScriptedAlarmEvent? BuildEmission(AlarmState state, AlarmConditionState condition, EmissionKind kind) { // Suppressed kind means shelving ate the emission — we don't fire for subscribers // but the state record still advanced so startup recovery reflects reality. - if (kind == EmissionKind.Suppressed || kind == EmissionKind.None) return; + if (kind == EmissionKind.Suppressed || kind == EmissionKind.None) return null; var message = MessageTemplate.Resolve(state.Definition.MessageTemplate, TryLookup); - var evt = new ScriptedAlarmEvent( + return new ScriptedAlarmEvent( AlarmId: state.Definition.AlarmId, EquipmentPath: state.Definition.EquipmentPath, AlarmName: state.Definition.AlarmName, @@ -390,10 +455,22 @@ public sealed class ScriptedAlarmEngine : IDisposable Condition: condition, Emission: kind, TimestampUtc: _clock()); + } + + /// + /// Invoke the handler for a built emission. Must be + /// called OUTSIDE _evalGate: a slow subscriber would otherwise + /// block the gate for every other engine operation, and a subscriber + /// that re-enters the engine (e.g. calls AcknowledgeAsync) would + /// deadlock against the non-reentrant SemaphoreSlim. + /// (Core.ScriptedAlarms-003) + /// + private void FireEvent(ScriptedAlarmEvent evt) + { try { OnEvent?.Invoke(this, evt); } catch (Exception ex) { - _engineLogger.Warning(ex, "ScriptedAlarmEngine OnEvent subscriber threw for {AlarmId}", state.Definition.AlarmId); + _engineLogger.Warning(ex, "ScriptedAlarmEngine OnEvent subscriber threw for {AlarmId}", evt.AlarmId); } } @@ -404,7 +481,24 @@ public sealed class ScriptedAlarmEngine : IDisposable { if (_disposed) return; var ids = _alarms.Keys.ToArray(); - _ = ShelvingCheckAsync(ids, CancellationToken.None); + TrackBackgroundTask(ShelvingCheckAsync(ids, CancellationToken.None)); + } + + /// + /// Register a fire-and-forget task so can await it. + /// The task removes itself from the set on completion via a continuation. + /// (Core.ScriptedAlarms-006) + /// + private void TrackBackgroundTask(Task task) + { + lock (_inFlightLock) { _inFlight.Add(task); } + // Use ContinueWith with ExecuteSynchronously so the removal runs on the + // completing thread — avoids scheduler delay between completion and + // unregistration that would otherwise let Dispose see a stale set. + task.ContinueWith(t => + { + lock (_inFlightLock) { _inFlight.Remove(t); } + }, CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default); } /// @@ -416,6 +510,7 @@ public sealed class ScriptedAlarmEngine : IDisposable private async Task ShelvingCheckAsync(IReadOnlyList alarmIds, CancellationToken ct) { + var pending = new List(0); try { await _evalGate.WaitAsync(ct).ConfigureAwait(false); @@ -440,7 +535,10 @@ public sealed class ScriptedAlarmEngine : IDisposable await _store.SaveAsync(result.State, ct).ConfigureAwait(false); _alarms[id] = state with { Condition = result.State }; if (result.Emission != EmissionKind.None) - EmitEvent(state, result.State, result.Emission); + { + var evt = BuildEmission(state, result.State, result.Emission); + if (evt is not null) pending.Add(evt); + } } } } @@ -449,7 +547,10 @@ public sealed class ScriptedAlarmEngine : IDisposable catch (Exception ex) { _engineLogger.Warning(ex, "ScriptedAlarmEngine shelving-check failed"); + return; } + // Fire emissions OUTSIDE _evalGate. (Core.ScriptedAlarms-003) + foreach (var evt in pending) FireEvent(evt); } private void UnsubscribeFromUpstream() @@ -473,6 +574,28 @@ public sealed class ScriptedAlarmEngine : IDisposable _disposed = true; _shelvingTimer?.Dispose(); UnsubscribeFromUpstream(); + + // Drain any fire-and-forget background work (ReevaluateAsync from + // OnUpstreamChange + ShelvingCheckAsync from the 5s timer) that started + // before _disposed = true was visible. Without this, a SaveAsync in + // flight can outlive the engine and write to a (possibly disposed) store + // after Dispose() has returned. The tasks re-check _disposed after + // acquiring the gate and bail out, but the await still has to complete. + // (Core.ScriptedAlarms-006) + Task[] toAwait; + lock (_inFlightLock) { toAwait = [.. _inFlight]; } + if (toAwait.Length > 0) + { + try { Task.WhenAll(toAwait).GetAwaiter().GetResult(); } + catch (Exception ex) + { + // Background task failures already logged inside ReevaluateAsync / + // ShelvingCheckAsync; surface here at debug so a parent shutdown is + // not noisy. The key invariant is that the tasks have COMPLETED. + _engineLogger.Debug(ex, "ScriptedAlarmEngine background task threw during shutdown drain"); + } + } + // Do NOT clear _alarms here: Timer.Dispose() does not wait for in-flight callbacks, // so a ShelvingCheckAsync or ReevaluateAsync can still be running inside _evalGate. // Those paths now re-check _disposed after acquiring the gate and bail out safely. diff --git a/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs b/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs index b2a4fdf..5c761cb 100644 --- a/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs +++ b/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs @@ -606,6 +606,253 @@ public sealed class ScriptedAlarmEngineTests "Uncertain-quality inputs are treated as ready — predicate evaluates"); } + // ------------------------------------------------------------------------- + // Core.ScriptedAlarms-003: OnEvent emission must not block under _evalGate. + // (1) A slow subscriber must not block the gate for other alarms. + // (2) A subscriber that re-enters the engine (e.g. AcknowledgeAsync) must + // not deadlock against _evalGate. Both regressions are covered here. + // ------------------------------------------------------------------------- + [Fact] + public async Task OnEvent_subscriber_can_call_back_into_engine_without_deadlock(/* -003 */) + { + // Re-entrancy regression. When OnEvent emission was inside _evalGate, a + // subscriber that called an engine method (e.g. AcknowledgeAsync) hung + // forever because the non-reentrant SemaphoreSlim refused to re-grant + // the gate the dispatch path was still holding. After the fix, emission + // happens AFTER Release() so the subscriber's call acquires the gate + // cleanly and the operator-driven action completes. + var up = new FakeUpstream(); + up.Set("Temp", 50); + var eng = Build(up, out _); + try + { + await eng.LoadAsync([Alarm("HighTemp", """return (int)ctx.GetTag("Temp").Value > 100;""")], + TestContext.Current.CancellationToken); + + // Subscriber re-enters the engine via Task.Run so the OnEvent + // dispatch thread is not blocked while waiting. Either way, with + // the fix in place AcknowledgeAsync must acquire _evalGate (the + // dispatch path released it before invoking the subscriber) and + // complete in well under the timeout. + var ackDone = new TaskCompletionSource(); + eng.OnEvent += (_, e) => + { + if (e.Emission != EmissionKind.Activated) return; + _ = Task.Run(async () => + { + try + { + await eng.AcknowledgeAsync(e.AlarmId, "sub", null, CancellationToken.None); + ackDone.TrySetResult(); + } + catch (Exception ex) { ackDone.TrySetException(ex); } + }); + }; + + up.Push("Temp", 150); + + var winner = await Task.WhenAny(ackDone.Task, Task.Delay(TimeSpan.FromSeconds(3))); + winner.ShouldBe(ackDone.Task, + "subscriber re-entering the engine must not deadlock against _evalGate"); + await ackDone.Task; // surface any inner exception + eng.GetState("HighTemp")!.Acked.ShouldBe(AlarmAckedState.Acknowledged); + } + finally + { + eng.Dispose(); + } + } + + [Fact] + public void OnEvent_emission_happens_outside_evalGate(/* -003 */) + { + // Direct white-box check on the gate-release ordering: AcknowledgeAsync + // emits the Acknowledged event AFTER releasing the gate. We assert that + // by observing the gate is acquirable from inside the subscriber. + // SemaphoreSlim.Wait(0) returns true only if the count > 0 (gate free). + var up = new FakeUpstream(); + up.Set("Temp", 50); + var eng = Build(up, out _); + try + { + eng.LoadAsync([Alarm("HighTemp", """return (int)ctx.GetTag("Temp").Value > 100;""")], + TestContext.Current.CancellationToken).GetAwaiter().GetResult(); + // Drive to Active so Acknowledge has something to ack. + up.Push("Temp", 150); + // Use the same WaitForAsync that other tests use — synchronously + // here since this is a non-async test. + for (var i = 0; i < 80 && eng.GetState("HighTemp")!.Active != AlarmActiveState.Active; i++) + Thread.Sleep(25); + eng.GetState("HighTemp")!.Active.ShouldBe(AlarmActiveState.Active); + + // Use reflection to peek at _evalGate so the subscriber can probe it. + var gateField = typeof(ScriptedAlarmEngine).GetField( + "_evalGate", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + gateField.ShouldNotBeNull(); + var gate = (SemaphoreSlim)gateField.GetValue(eng)!; + + var gateFreeInsideEmission = false; + eng.OnEvent += (_, e) => + { + if (e.Emission != EmissionKind.Acknowledged) return; + // SemaphoreSlim.Wait(0) — non-blocking try-take. If the gate is + // free we acquire it (count back to 0); release immediately. + if (gate.Wait(0)) + { + gateFreeInsideEmission = true; + gate.Release(); + } + }; + + eng.AcknowledgeAsync("HighTemp", "alice", null, CancellationToken.None) + .GetAwaiter().GetResult(); + + gateFreeInsideEmission.ShouldBeTrue( + "_evalGate must be released before OnEvent fires so subscribers " + + "can call back into the engine without deadlocking"); + } + finally + { + eng.Dispose(); + } + } + + // ------------------------------------------------------------------------- + // Core.ScriptedAlarms-006: Dispose must drain in-flight background tasks + // launched by OnUpstreamChange / RunShelvingCheck. Otherwise a re-evaluation + // or shelving check started just before Dispose can keep running and write + // to a (possibly disposed) store after the engine has returned. + // ------------------------------------------------------------------------- + [Fact] + public async Task Dispose_drains_in_flight_reevaluation_tasks(/* -006 */) + { + var up = new FakeUpstream(); + up.Set("Temp", 50); + var logger = new LoggerConfiguration().CreateLogger(); + var slowStore = new BlockingSaveAlarmStateStore(); + var eng = new ScriptedAlarmEngine(up, slowStore, new ScriptLoggerFactory(logger), logger); + await eng.LoadAsync([Alarm("A", """return (int)ctx.GetTag("Temp").Value > 100;""")], + TestContext.Current.CancellationToken); + + // Block the NEXT save (the one triggered by the push below). + var saveGate = new TaskCompletionSource(); + slowStore.BlockNextSave = saveGate; + + // Trigger a re-evaluation that will go inside _evalGate and call SaveAsync. + up.Push("Temp", 150); + + // Wait until the store's SaveAsync is actually blocked. + await WaitForAsync(() => slowStore.SaveInProgress, timeoutMs: 1000); + + // Dispose must wait for the in-flight reevaluation to complete rather + // than returning while a background task still runs. + var disposeTask = Task.Run(() => eng.Dispose()); + + // Verify Dispose does NOT complete immediately — it should block waiting + // for the in-flight task. Without the -006 fix Dispose returns straight + // away and the background reevaluation can outlive the engine. + var prematureFinish = await Task.WhenAny(disposeTask, Task.Delay(200)); + prematureFinish.ShouldNotBe(disposeTask, + "Dispose must block until in-flight background tasks complete"); + + // Let the save complete and verify Dispose then returns. + saveGate.SetResult(); + await disposeTask.WaitAsync(TimeSpan.FromSeconds(3), TestContext.Current.CancellationToken); + slowStore.SaveInProgress.ShouldBeFalse("background task drained before Dispose returned"); + } + + // ------------------------------------------------------------------------- + // Core.ScriptedAlarms-010: predicate evaluation and message-template + // resolution apply different quality bars on purpose. Predicate evaluation + // accepts Uncertain (the predicate can still inspect the value); message + // resolution renders Uncertain as "{?}" so the operator sees the doubt + // explicitly. The two policies are documented in docs/ScriptedAlarms.md. + // ------------------------------------------------------------------------- + [Fact] + public async Task Uncertain_quality_drives_predicate_but_renders_question_mark_in_message(/* -010 */) + { + var up = new FakeUpstream(); + // Seed with Uncertain quality (severity bit 30 set, bit 31 clear). + up.Set("Temp", 150, statusCode: 0x40000000u); + using var eng = Build(up, out _); + await eng.LoadAsync([ + new ScriptedAlarmDefinition( + "HighTemp", "Plant/Line1", "HighTemp", + AlarmKind.LimitAlarm, AlarmSeverity.High, + "Temp {Temp} exceeded limit", + """return (int)ctx.GetTag("Temp").Value > 100;"""), + ], TestContext.Current.CancellationToken); + + // Predicate evaluated (Uncertain treated as ready) → alarm Active. + eng.GetState("HighTemp")!.Active.ShouldBe(AlarmActiveState.Active, + "AreInputsReady accepts Uncertain so the predicate runs"); + + // But the resolved emission message must show "{?}" for the Uncertain + // tag — only Good substitutes into the operator-facing message. + var events = new List(); + eng.OnEvent += (_, e) => events.Add(e); + up.Push("Temp", 200, statusCode: 0x40000000u); // still Uncertain + // Trigger another evaluation to get an emission (already active, so + // we need a clear → re-activate cycle). Easier: force the same path + // through a comment which emits a CommentAdded message. But comments + // don't run the template. Instead clear it then re-activate. + up.Push("Temp", 50, statusCode: 0u); // Good, predicate becomes false + await WaitForAsync(() => events.Any(e => e.Emission == EmissionKind.Cleared)); + events.Clear(); + up.Push("Temp", 200, statusCode: 0x40000000u); // Uncertain, predicate true + await WaitForAsync(() => events.Any(e => e.Emission == EmissionKind.Activated)); + + // The Activated message must show {?} for the Uncertain input. + events.Single(e => e.Emission == EmissionKind.Activated).Message + .ShouldBe("Temp {?} exceeded limit", + "MessageTemplate.Resolve renders non-Good StatusCode as {?} " + + "even though predicate evaluation accepted the Uncertain value"); + } + + // ------------------------------------------------------------------------- + // Core.ScriptedAlarms-008: switch Comments to ImmutableList for O(log n) + // append. The persisted runtime type must be ImmutableList + // (which still satisfies IReadOnlyList for existing + // consumers). + // ------------------------------------------------------------------------- + [Fact] + public async Task Comments_collection_uses_ImmutableList_for_efficient_append(/* -008 */) + { + var up = new FakeUpstream(); + up.Set("Temp", 50); + using var eng = Build(up, out _); + await eng.LoadAsync([Alarm("A", "return false;")], TestContext.Current.CancellationToken); + + // Add a comment so AppendComment runs. + await eng.AddCommentAsync("A", "alice", "note", TestContext.Current.CancellationToken); + + var s = eng.GetState("A")!; + s.Comments.ShouldBeOfType>( + "Comments should be an ImmutableList so append is O(log n), not O(n)"); + } + + // ------------------------------------------------------------------------- + // Core.ScriptedAlarms-011: TransitionResult.NoOp's reason parameter must be + // propagated, not silently discarded. The class-level remarks promise a + // diagnostic log line for no-op disabled-alarm evaluations. + // ------------------------------------------------------------------------- + [Fact] + public void TransitionResult_NoOp_propagates_reason(/* -011 */) + { + var fresh = AlarmConditionState.Fresh("a-1", DateTime.UtcNow); + var r = TransitionResult.NoOp(fresh, "disabled — predicate result ignored"); + r.NoOpReason.ShouldBe("disabled — predicate result ignored", + "NoOp reason must be preserved on the TransitionResult so callers can log it"); + } + + [Fact] + public void TransitionResult_None_carries_no_reason(/* -011 */) + { + var fresh = AlarmConditionState.Fresh("a-1", DateTime.UtcNow); + var r = TransitionResult.None(fresh); + r.NoOpReason.ShouldBeNull("None() factory has no reason — only NoOp() carries one"); + } + private static async Task WaitForAsync(Func cond, int timeoutMs = 2000) { var deadline = DateTime.UtcNow.AddMilliseconds(timeoutMs); @@ -645,4 +892,37 @@ public sealed class ScriptedAlarmEngineTests public Task RemoveAsync(string alarmId, CancellationToken ct) => _inner.RemoveAsync(alarmId, ct); } + + /// + /// A store whose SaveAsync can be made to block until the test signals it. + /// Used to verify Dispose drains in-flight background tasks (finding -006). + /// + private sealed class BlockingSaveAlarmStateStore : IAlarmStateStore + { + private readonly InMemoryAlarmStateStore _inner = new(); + public TaskCompletionSource? BlockNextSave { get; set; } + public bool SaveInProgress { get; private set; } + + public Task LoadAsync(string alarmId, CancellationToken ct) + => _inner.LoadAsync(alarmId, ct); + + public Task> LoadAllAsync(CancellationToken ct) + => _inner.LoadAllAsync(ct); + + public async Task SaveAsync(AlarmConditionState state, CancellationToken ct) + { + var gate = BlockNextSave; + if (gate is not null) + { + BlockNextSave = null; + SaveInProgress = true; + try { await gate.Task.WaitAsync(ct).ConfigureAwait(false); } + finally { SaveInProgress = false; } + } + await _inner.SaveAsync(state, ct).ConfigureAwait(false); + } + + public Task RemoveAsync(string alarmId, CancellationToken ct) + => _inner.RemoveAsync(alarmId, ct); + } }