From be274212f0ea11c3f955565bc744c25fc0a84773 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 17 May 2026 03:18:41 -0400 Subject: [PATCH] =?UTF-8?q?fix(site-runtime):=20resolve=20SiteRuntime-017.?= =?UTF-8?q?.019=20=E2=80=94=20isolated=20attribute=20snapshot=20for=20chil?= =?UTF-8?q?d=20actors,=20corrected=20dispatcher=20doc,=20remove=20dead=20l?= =?UTF-8?q?ifecycle=20handlers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code-reviews/SiteRuntime/findings.md | 44 ++++- .../Actors/AlarmActor.cs | 9 + .../Actors/InstanceActor.cs | 40 ++-- .../Actors/ScriptActor.cs | 10 + .../Actors/ScriptExecutionActor.cs | 7 +- .../ScadaLink.SiteRuntime.csproj | 4 + .../InstanceActorChildAttributeRaceTests.cs | 187 ++++++++++++++++++ .../Actors/InstanceActorTests.cs | 28 +++ 8 files changed, 303 insertions(+), 26 deletions(-) create mode 100644 tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorChildAttributeRaceTests.cs diff --git a/code-reviews/SiteRuntime/findings.md b/code-reviews/SiteRuntime/findings.md index 90904c5..1f0cd70 100644 --- a/code-reviews/SiteRuntime/findings.md +++ b/code-reviews/SiteRuntime/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-17 | | Reviewer | claude-agent | | Commit reviewed | `39d737e` | -| Open findings | 3 | +| Open findings | 0 | ## Summary @@ -44,7 +44,8 @@ dictionary is handed by reference into child `ScriptActor`/`AlarmActor` constructors (SiteRuntime-017, Medium); a stale `ScriptExecutionActor` XML doc that still claims a "dedicated blocking I/O dispatcher" (SiteRuntime-018, Low); and two dead lifecycle handlers in `InstanceActor` that the Deployment Manager -never routes to (SiteRuntime-019, Low). Open findings: 3. +never routes to (SiteRuntime-019, Low). All three were subsequently resolved on +2026-05-17. Open findings: 0. ## Checklist coverage @@ -758,7 +759,7 @@ green. |--|--| | Severity | Medium | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:625`, `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:675`, `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs:83`, `src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs:93` | **Description** @@ -800,7 +801,18 @@ private dictionary to seed from. **Resolution** -_Unresolved._ +Resolved 2026-05-17 (`commit pending`): root cause confirmed — `CreateChildActors` +captured the live `_attributes` field directly in every child `Props.Create` +closure. `CreateChildActors` now takes a single `new Dictionary(_attributes)` +snapshot on the Instance Actor thread and hands each `ScriptActor`/`AlarmActor` that +private copy, so no child constructor ever enumerates a dictionary the Instance +Actor is concurrently mutating. Regression test: +`InstanceActorChildAttributeRaceTests.ChildActors_AreSeededFromAnIsolatedCopy_NotTheLiveAttributesDictionary` +asserts every child's seed dictionary is a distinct object from the Instance +Actor's live `_attributes` (confirmed to fail — "seeded ... by reference" — against +the pre-fix code and pass after). `ScriptActor`/`AlarmActor` expose an internal +`SeedAttributesReference` for this assertion (`InternalsVisibleTo` added for the +test project). ### SiteRuntime-018 — `ScriptExecutionActor` XML doc still claims a "dedicated blocking I/O dispatcher" @@ -808,7 +820,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs:17` | **Description** @@ -835,7 +847,13 @@ comment already present at `ScriptExecutionActor.cs:71-73`. **Resolution** -_Unresolved._ +Resolved 2026-05-17 (`commit pending`): root cause confirmed — the stale +"Runs on a dedicated blocking I/O dispatcher" line in the `ScriptExecutionActor` +class summary was missed when SiteRuntime-009 was resolved. The summary now states +the actual model: the actor and its mailbox run on the default Akka dispatcher and +only the script body is dispatched onto the dedicated `ScriptExecutionScheduler` +(SiteRuntime-009). Documentation-only change with no observable behaviour, so no +regression test was added; the existing suite continues to pass. ### SiteRuntime-019 — Dead `DisableInstanceCommand` / `EnableInstanceCommand` handlers in `InstanceActor` @@ -843,7 +861,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:106`, `src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs:113` | **Description** @@ -873,4 +891,14 @@ reserved placeholder — but removal is preferred. **Resolution** -_Unresolved._ +Resolved 2026-05-17 (`commit pending`): re-verified as genuinely dead code — a +codebase-wide search confirms `DisableInstanceCommand`/`EnableInstanceCommand` are +only ever sent (from central) to the site `DeploymentManagerActor`, whose +`HandleDisable`/`HandleEnable` own the lifecycle entirely (`Context.Stop` / +`CreateInstanceActor`) and never `Forward`/`Tell` the command to the Instance +Actor. The two unreachable `Receive<...>` registrations and their no-op +"success" handler bodies were removed from `InstanceActor`, replaced with a comment +stating the Deployment Manager owns this lifecycle. Regression test: +`InstanceActorTests.InstanceActor_DoesNotHandleDisableOrEnableCommands` asserts the +Instance Actor produces no `InstanceLifecycleResponse` for either command +(confirmed to fail against the pre-fix dead handlers and pass after removal). diff --git a/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs b/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs index 72e1686..39c0466 100644 --- a/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs @@ -56,6 +56,14 @@ public class AlarmActor : ReceiveActor private readonly Script? _compiledTriggerExpression; private readonly Dictionary _attributeSnapshot = new(); + /// + /// SiteRuntime-017: the exact dictionary instance this actor was seeded from + /// at construction. The Instance Actor must pass a private snapshot here, not + /// its live _attributes field. Exposed for regression coverage of that + /// isolation contract. + /// + internal IReadOnlyDictionary? SeedAttributesReference { get; } + // Rate of change tracking private readonly Queue<(DateTimeOffset Timestamp, double Value)> _rateOfChangeWindow = new(); private readonly TimeSpan _rateOfChangeWindowDuration; @@ -90,6 +98,7 @@ public class AlarmActor : ReceiveActor // Seed the trigger-expression attribute snapshot from the instance's // initial attribute set so static attributes (which never re-emit an // AttributeValueChanged after deploy) evaluate correctly at startup. + SeedAttributesReference = initialAttributes; if (initialAttributes != null) { foreach (var kvp in initialAttributes) diff --git a/src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs b/src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs index 53da639..cad327e 100644 --- a/src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/InstanceActor.cs @@ -4,7 +4,6 @@ using Microsoft.Extensions.Logging; using ScadaLink.Commons.Messages.DataConnection; using ScadaLink.Commons.Messages.DebugView; using ScadaLink.Commons.Messages.Instance; -using ScadaLink.Commons.Messages.Lifecycle; using ScadaLink.Commons.Messages.ScriptExecution; using ScadaLink.Commons.Messages.Streaming; using ScadaLink.Commons.Types.Enums; @@ -102,20 +101,13 @@ public class InstanceActor : ReceiveActor // Handle static attribute writes Receive(HandleSetStaticAttribute); - // Handle lifecycle messages - Receive(_ => - { - _logger.LogInformation("Instance {Instance} received disable command", _instanceUniqueName); - Sender.Tell(new InstanceLifecycleResponse( - _.CommandId, _instanceUniqueName, true, null, DateTimeOffset.UtcNow)); - }); - - Receive(_ => - { - _logger.LogInformation("Instance {Instance} received enable command", _instanceUniqueName); - Sender.Tell(new InstanceLifecycleResponse( - _.CommandId, _instanceUniqueName, true, null, DateTimeOffset.UtcNow)); - }); + // SiteRuntime-019: the disable/enable lifecycle is owned entirely by the + // Deployment Manager — DeploymentManagerActor.HandleDisable/HandleEnable + // stop or re-create the Instance Actor directly and reply to the caller. + // DisableInstanceCommand / EnableInstanceCommand are never routed to the + // Instance Actor, so no handlers are registered here. (The previous no-op + // handlers were dead code that implied a non-existent instance-side + // acknowledgement contract.) // WP-15: Handle script call requests — route to appropriate Script Actor (Ask pattern) Receive(HandleScriptCallRequest); @@ -590,11 +582,25 @@ public class InstanceActor : ReceiveActor /// WP-15: Script Actors spawned per script definition. /// WP-16: Alarm Actors spawned per alarm definition, as peers to Script Actors. /// WP-32: Compilation errors reject entire instance deployment (logged but actor still starts). + /// + /// SiteRuntime-017: each child is seeded from a private point-in-time snapshot + /// of _attributes, NOT the live dictionary. The snapshot is taken here on + /// the Instance Actor thread, so it is race-free; handing the live mutable + /// by reference + /// would let a child constructor enumerate it on the child's mailbox thread while + /// this actor mutates it in HandleAttributeValueChanged. /// private void CreateChildActors() { if (_configuration == null) return; + // SiteRuntime-017: snapshot the live attribute dictionary once, on the + // Instance Actor thread, before any child is constructed. Each child + // Props closure captures this immutable copy instead of the mutable + // _attributes field, so no child constructor ever enumerates a + // dictionary this actor is concurrently mutating. + var attributeSnapshot = new Dictionary(_attributes); + // Create Script Actors foreach (var script in _configuration.Scripts) { @@ -622,7 +628,7 @@ public class InstanceActor : ReceiveActor _options, _logger, triggerExpression, - _attributes, + attributeSnapshot, _healthCollector, _serviceProvider)); @@ -672,7 +678,7 @@ public class InstanceActor : ReceiveActor _options, _logger, triggerExpression, - _attributes, + attributeSnapshot, _healthCollector)); var actorRef = Context.ActorOf(props, $"alarm-{alarm.CanonicalName}"); diff --git a/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs b/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs index 13017fa..793fdec 100644 --- a/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs @@ -48,6 +48,15 @@ public class ScriptActor : ReceiveActor, IWithTimers private bool _lastExpressionResult; private readonly Dictionary _attributeSnapshot = new(); + /// + /// SiteRuntime-017: the exact dictionary instance this actor was seeded from + /// at construction. The Instance Actor must pass a private snapshot here, not + /// its live _attributes field — sharing the live dictionary lets this + /// constructor enumerate it while the Instance Actor mutates it on another + /// thread. Exposed for regression coverage of that isolation contract. + /// + internal IReadOnlyDictionary? SeedAttributesReference { get; } + public ITimerScheduler Timers { get; set; } = null!; public ScriptActor( @@ -80,6 +89,7 @@ public class ScriptActor : ReceiveActor, IWithTimers // Seed the trigger-expression attribute snapshot from the instance's // initial attribute set so static attributes (which never re-emit an // AttributeValueChanged after deploy) evaluate correctly at startup. + SeedAttributesReference = initialAttributes; if (initialAttributes != null) { foreach (var kvp in initialAttributes) diff --git a/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs b/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs index 3b53eba..233f81b 100644 --- a/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs @@ -14,9 +14,14 @@ namespace ScadaLink.SiteRuntime.Actors; /// /// WP-15: Script Execution Actor -- short-lived child of Script Actor. /// Receives compiled code, params, Instance Actor ref, and call depth. -/// Runs on a dedicated blocking I/O dispatcher. /// Executes the script via Script Runtime API, returns result, then stops. /// +/// The actor itself and its mailbox run on the default Akka dispatcher; only the +/// script body is dispatched off the actor thread, onto the dedicated +/// +/// (SiteRuntime-009), so blocking script I/O cannot starve the shared thread pool +/// or stall other Akka dispatchers. +/// /// WP-32: Script failures are logged but do not disable the script. /// Supervision: Stop on unhandled exception (parent ScriptActor decides). /// diff --git a/src/ScadaLink.SiteRuntime/ScadaLink.SiteRuntime.csproj b/src/ScadaLink.SiteRuntime/ScadaLink.SiteRuntime.csproj index 2ddecd3..bfa3f5b 100644 --- a/src/ScadaLink.SiteRuntime/ScadaLink.SiteRuntime.csproj +++ b/src/ScadaLink.SiteRuntime/ScadaLink.SiteRuntime.csproj @@ -20,6 +20,10 @@ + + + + diff --git a/tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorChildAttributeRaceTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorChildAttributeRaceTests.cs new file mode 100644 index 0000000..c435006 --- /dev/null +++ b/tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorChildAttributeRaceTests.cs @@ -0,0 +1,187 @@ +using Akka.Actor; +using Akka.TestKit; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.Commons.Messages.Streaming; +using ScadaLink.Commons.Types.Flattening; +using ScadaLink.SiteRuntime.Actors; +using ScadaLink.SiteRuntime.Persistence; +using ScadaLink.SiteRuntime.Scripts; +using System.Reflection; +using System.Text.Json; + +namespace ScadaLink.SiteRuntime.Tests.Actors; + +/// +/// Regression coverage for SiteRuntime-017 — the Instance Actor must not hand its +/// own live mutable _attributes dictionary by reference into the +/// / constructors. +/// +/// Each child constructor runs on the child's own mailbox thread and seeds itself +/// by enumerating the dictionary it was given. The Instance Actor concurrently +/// mutates _attributes in HandleAttributeValueChanged / +/// HandleTagValueUpdate. is not safe +/// for concurrent read/write: if a child enumerates the shared live dictionary +/// while the Instance Actor inserts into it, the child constructor throws +/// ("collection was modified") — surfacing +/// as ActorInitializationException and stopping the child. +/// +/// The fix: CreateChildActors snapshots _attributes once on the +/// Instance Actor thread (new Dictionary<,>(_attributes)) and hands +/// each child that private copy. This test asserts the isolation contract +/// directly and deterministically: every child's seed dictionary must be a +/// distinct object from the Instance Actor's live _attributes, while still +/// carrying the same point-in-time contents. +/// +public class InstanceActorChildAttributeRaceTests : TestKit, IDisposable +{ + private readonly SiteStorageService _storage; + private readonly ScriptCompilationService _compilationService; + private readonly SharedScriptLibrary _sharedScriptLibrary; + private readonly SiteRuntimeOptions _options; + private readonly string _dbFile; + + public InstanceActorChildAttributeRaceTests() + { + _dbFile = Path.Combine(Path.GetTempPath(), $"instance-race-test-{Guid.NewGuid():N}.db"); + _storage = new SiteStorageService( + $"Data Source={_dbFile}", + NullLogger.Instance); + _storage.InitializeAsync().GetAwaiter().GetResult(); + _compilationService = new ScriptCompilationService( + NullLogger.Instance); + _sharedScriptLibrary = new SharedScriptLibrary( + _compilationService, NullLogger.Instance); + _options = new SiteRuntimeOptions + { + MaxScriptCallDepth = 10, + ScriptExecutionTimeoutSeconds = 30 + }; + } + + void IDisposable.Dispose() + { + Shutdown(); + try { File.Delete(_dbFile); } catch { /* cleanup */ } + } + + private static FlattenedConfiguration BuildConfig(string instanceName) + => new() + { + InstanceUniqueName = instanceName, + Attributes = + [ + new ResolvedAttribute { CanonicalName = "Temperature", Value = "98.6", DataType = "Double" }, + new ResolvedAttribute { CanonicalName = "Pressure", Value = "12", DataType = "Int32" }, + new ResolvedAttribute { CanonicalName = "Label", Value = "Main Pump", DataType = "String" } + ], + Scripts = + [ + new ResolvedScript + { + CanonicalName = "WorkerA", Code = "return 1;", + TriggerType = "ValueChange", + TriggerConfiguration = "{\"AttributeName\":\"Temperature\"}" + }, + new ResolvedScript + { + CanonicalName = "WorkerB", Code = "return 2;", + TriggerType = "ValueChange", + TriggerConfiguration = "{\"AttributeName\":\"Pressure\"}" + } + ], + Alarms = + [ + new ResolvedAlarm + { + CanonicalName = "HighTemp", + TriggerType = "ValueMatch", + TriggerConfiguration = "{}", + PriorityLevel = 1 + } + ] + }; + + /// Resolves the live actor instance behind a local . + private static object GetActorInstance(IActorRef actorRef) + { + var cell = ((ActorRefWithCell)actorRef).Underlying; + // ActorCell exposes the actor instance via its internal Actor property. + var actorProp = cell.GetType().GetProperty( + "Actor", BindingFlags.Instance | BindingFlags.NonPublic | BindingFlags.Public); + var instance = actorProp!.GetValue(cell); + Assert.NotNull(instance); + return instance!; + } + + private static Dictionary GetPrivateAttributes(InstanceActor instance) + { + var field = typeof(InstanceActor).GetField( + "_attributes", BindingFlags.Instance | BindingFlags.NonPublic); + return (Dictionary)field!.GetValue(instance)!; + } + + [Fact] + public async Task ChildActors_AreSeededFromAnIsolatedCopy_NotTheLiveAttributesDictionary() + { + const string instanceName = "RacePump"; + var config = BuildConfig(instanceName); + + var testRef = ActorOfAsTestActorRef( + Props.Create(() => new InstanceActor( + instanceName, + JsonSerializer.Serialize(config), + _storage, + _compilationService, + _sharedScriptLibrary, + null, + _options, + NullLogger.Instance)), + "instance"); + + var instanceActor = testRef.UnderlyingActor; + var liveAttributes = GetPrivateAttributes(instanceActor); + + // Sanity: the children were created. + Assert.Equal(2, instanceActor.ScriptActorCount); + Assert.Equal(1, instanceActor.AlarmActorCount); + + // Every child Script Actor must have been seeded from a dictionary that + // is NOT the Instance Actor's live _attributes field — otherwise the + // child constructor would enumerate a dictionary the Instance Actor + // mutates on another thread (SiteRuntime-017). + foreach (var name in new[] { "WorkerA", "WorkerB" }) + { + var child = await Sys.ActorSelection(testRef.Path / $"script-{name}") + .ResolveOne(TimeSpan.FromSeconds(5)); + var scriptActor = (ScriptActor)GetActorInstance(child); + + Assert.NotNull(scriptActor.SeedAttributesReference); + Assert.False( + ReferenceEquals(scriptActor.SeedAttributesReference, liveAttributes), + $"Script Actor '{name}' was seeded from the Instance Actor's live " + + "_attributes dictionary by reference (SiteRuntime-017). It must be " + + "given a private snapshot copy."); + + // The snapshot must still carry the same point-in-time contents. + Assert.Equal(liveAttributes.Count, scriptActor.SeedAttributesReference!.Count); + foreach (var kvp in liveAttributes) + { + Assert.True(scriptActor.SeedAttributesReference.ContainsKey(kvp.Key)); + } + } + + // The Alarm Actor must likewise be seeded from an isolated copy. + var alarmChild = await Sys.ActorSelection(testRef.Path / "alarm-HighTemp") + .ResolveOne(TimeSpan.FromSeconds(5)); + var alarmActor = (AlarmActor)GetActorInstance(alarmChild); + + Assert.NotNull(alarmActor.SeedAttributesReference); + Assert.False( + ReferenceEquals(alarmActor.SeedAttributesReference, liveAttributes), + "Alarm Actor 'HighTemp' was seeded from the Instance Actor's live " + + "_attributes dictionary by reference (SiteRuntime-017). It must be " + + "given a private snapshot copy."); + Assert.Equal(liveAttributes.Count, alarmActor.SeedAttributesReference!.Count); + } +} diff --git a/tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorTests.cs index 2f08d35..dce9659 100644 --- a/tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorTests.cs +++ b/tests/ScadaLink.SiteRuntime.Tests/Actors/InstanceActorTests.cs @@ -3,6 +3,7 @@ using Akka.TestKit.Xunit2; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; using ScadaLink.Commons.Messages.Instance; +using ScadaLink.Commons.Messages.Lifecycle; using ScadaLink.Commons.Types.Flattening; using ScadaLink.SiteRuntime.Actors; using ScadaLink.SiteRuntime.Persistence; @@ -251,6 +252,33 @@ public class InstanceActorTests : TestKit, IDisposable Assert.Equal("Uncertain", response.Quality); } + /// + /// SiteRuntime-019: the disable/enable lifecycle is owned entirely by the + /// Deployment Manager (it stops / re-creates the Instance Actor itself and + /// replies to the caller). The Instance Actor must NOT handle + /// / + /// — the dead handlers that replied with a misleading "success" + /// acknowledgement were removed. Sending one to the Instance Actor now goes + /// unhandled and produces no . + /// + [Fact] + public void InstanceActor_DoesNotHandleDisableOrEnableCommands() + { + var config = new FlattenedConfiguration + { + InstanceUniqueName = "Pump1", + Attributes = [] + }; + + var actor = CreateInstanceActor("Pump1", config); + + actor.Tell(new DisableInstanceCommand("cmd-disable", "Pump1", DateTimeOffset.UtcNow)); + ExpectNoMsg(TimeSpan.FromMilliseconds(500)); + + actor.Tell(new EnableInstanceCommand("cmd-enable", "Pump1", DateTimeOffset.UtcNow)); + ExpectNoMsg(TimeSpan.FromMilliseconds(500)); + } + [Fact] public void InstanceActor_StaticAttribute_StartsWithGoodQuality() {