From e3f8fa535a360c299e67f595c1af14bcd6b6bed6 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 22 May 2026 06:17:44 -0400 Subject: [PATCH] fix(scripted-alarms): resolve High code-review finding (Core.ScriptedAlarms-001) _alarms was a plain Dictionary mutated under the _evalGate semaphore, but four read paths (GetState, GetAllStates, the LoadedAlarmIds property, and RunShelvingCheck) touched it from arbitrary threads with no synchronisation. A Dictionary read concurrent with a writer's entry reassignment can throw InvalidOperationException or return torn state. Switched _alarms to ConcurrentDictionary. The only write shapes are indexer-set and Clear, both atomic on ConcurrentDictionary, so all mutations stay correct without further change; reads now get safe snapshot semantics. LoadedAlarmIds materialises the key snapshot to keep its IReadOnlyCollection return type. This matches _valueCache, which is already a ConcurrentDictionary. Added a regression test (Concurrent_reads_during_mutation_do_not_throw) that hammers the engine with state mutations while four reader threads continuously call the three unguarded read paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- code-reviews/Core.ScriptedAlarms/findings.md | 6 +- .../ScriptedAlarmEngine.cs | 12 +++- .../ScriptedAlarmEngineTests.cs | 61 +++++++++++++++++++ 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/code-reviews/Core.ScriptedAlarms/findings.md b/code-reviews/Core.ScriptedAlarms/findings.md index a3449e5..971bcba 100644 --- a/code-reviews/Core.ScriptedAlarms/findings.md +++ b/code-reviews/Core.ScriptedAlarms/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-05-22 | | Commit reviewed | `76d35d1` | | Status | Reviewed | -| Open findings | 12 | +| Open findings | 11 | ## Checklist coverage @@ -36,13 +36,13 @@ a category produced nothing rather than leaving it blank. | Severity | High | | Category | Concurrency & thread safety | | Location | `ScriptedAlarmEngine.cs:175`, `ScriptedAlarmEngine.cs:178`, `ScriptedAlarmEngine.cs:73`, `ScriptedAlarmEngine.cs:368` | -| Status | Open | +| Status | Resolved | **Description:** `_alarms` is a plain `Dictionary` (line 42). Every mutation of it (`LoadAsync`, `ApplyAsync`, `ReevaluateAsync`, `ShelvingCheckAsync`) correctly happens under the `_evalGate` semaphore, but four read paths touch it with no synchronisation: `GetState` (line 175), `GetAllStates` (line 178-179), the `LoadedAlarmIds` property (line 73), and `RunShelvingCheck` (line 368, `_alarms.Keys.ToArray()`). `RunShelvingCheck` fires from a `Timer` thread-pool callback and can run concurrently with an `ApplyAsync`/`ReevaluateAsync` that is reassigning a dictionary entry. `Dictionary` is not safe for concurrent read while another thread writes — even a value reassignment can be observed mid-rehash and throw `InvalidOperationException` or return torn state. `GetState`/`GetAllStates` are documented as being used by the Admin UI status page, so these reads come from arbitrary request threads. **Recommendation:** Either switch `_alarms` to `ConcurrentDictionary` (entry reassignment via `_alarms[id] = ...` is already the only write shape, which a `ConcurrentDictionary` supports atomically), or acquire `_evalGate` in every reader. A `ConcurrentDictionary` is the lighter change and matches `_valueCache`, which is already concurrent. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-22 — switched `_alarms` to `ConcurrentDictionary` so the four unguarded read paths are safe against concurrent under-gate entry reassignment; added a concurrency regression test. ### Core.ScriptedAlarms-002 diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs index 50bcea7..6ce5e52 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms/ScriptedAlarmEngine.cs @@ -39,7 +39,15 @@ public sealed class ScriptedAlarmEngine : IDisposable private readonly Func _clock; private readonly TimeSpan _scriptTimeout; - private readonly Dictionary _alarms = new(StringComparer.Ordinal); + // ConcurrentDictionary, not a plain Dictionary: every mutation happens under + // _evalGate, but four read paths (GetState, GetAllStates, LoadedAlarmIds, + // RunShelvingCheck) touch _alarms from arbitrary threads (Admin UI request + // threads, the shelving Timer thread-pool callback) without holding the gate. + // A plain Dictionary read concurrent with a writer's entry reassignment can + // throw or return torn state; ConcurrentDictionary makes entry assignment and + // snapshot enumeration safe. The only write shapes are indexer-set and Clear, + // both of which ConcurrentDictionary supports atomically. (Core.ScriptedAlarms-001) + private readonly ConcurrentDictionary _alarms = new(StringComparer.Ordinal); private readonly ConcurrentDictionary _valueCache = new(StringComparer.Ordinal); private readonly Dictionary> _alarmsReferencing @@ -70,7 +78,7 @@ public sealed class ScriptedAlarmEngine : IDisposable /// Raised for every emission the Part9StateMachine produces that the engine should publish. public event EventHandler? OnEvent; - public IReadOnlyCollection LoadedAlarmIds => _alarms.Keys; + public IReadOnlyCollection LoadedAlarmIds => _alarms.Keys.ToArray(); /// /// Load a batch of alarm definitions. Compiles every predicate, aggregates any diff --git a/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs b/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs index d88ec5d..f03788a 100644 --- a/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs +++ b/tests/Core/ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms.Tests/ScriptedAlarmEngineTests.cs @@ -303,6 +303,67 @@ public sealed class ScriptedAlarmEngineTests up.ActiveSubscriptionCount.ShouldBe(0); } + [Fact] + public async Task Concurrent_reads_during_mutation_do_not_throw(/* Core.ScriptedAlarms-001 */) + { + // Regression for Core.ScriptedAlarms-001: GetState / GetAllStates / + // LoadedAlarmIds touch _alarms from arbitrary threads with no lock while + // ApplyAsync / ReevaluateAsync reassign dictionary entries under _evalGate. + // With a plain Dictionary this race throws InvalidOperationException or + // returns torn state; with a ConcurrentDictionary the reads are safe. + var up = new FakeUpstream(); + const int alarmCount = 40; + var defs = new List(); + for (var i = 0; i < alarmCount; i++) + { + up.Set($"Temp{i}", 50); + defs.Add(Alarm($"A{i}", $$"""return (int)ctx.GetTag("Temp{{i}}").Value > 100;""")); + } + + using var eng = Build(up, out _); + await eng.LoadAsync(defs, TestContext.Current.CancellationToken); + + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(2)); + Exception? readerFailure = null; + + // Writer: hammer the engine with state mutations that reassign _alarms entries. + var writer = Task.Run(async () => + { + var round = 0; + while (!cts.IsCancellationRequested) + { + var id = $"A{round % alarmCount}"; + await eng.AddCommentAsync(id, "tester", $"r{round}", CancellationToken.None); + up.Push($"Temp{round % alarmCount}", round % 2 == 0 ? 150 : 50); + round++; + } + }); + + // Readers: continuously touch the three unguarded read paths. + var readers = Enumerable.Range(0, 4).Select(_reader => Task.Run(() => + { + try + { + while (!cts.IsCancellationRequested) + { + _ = eng.LoadedAlarmIds.Count; + _ = eng.GetAllStates().Count; + for (var i = 0; i < alarmCount; i++) + _ = eng.GetState($"A{i}"); + } + } + catch (Exception ex) + { + readerFailure = ex; + } + })).ToArray(); + + await Task.WhenAll([writer, .. readers]); + + readerFailure.ShouldBeNull( + "concurrent reads of _alarms while it is being mutated must not throw"); + } + private static async Task WaitForAsync(Func cond, int timeoutMs = 2000) { var deadline = DateTime.UtcNow.AddMilliseconds(timeoutMs);