diff --git a/code-reviews/Communication/findings.md b/code-reviews/Communication/findings.md index 3df4d73..a62220e 100644 --- a/code-reviews/Communication/findings.md +++ b/code-reviews/Communication/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 3 | +| Open findings | 0 | ## Summary @@ -405,7 +405,7 @@ per-event reset (`Grpc_Error_Resets_RetryCount_On_Successful_Event`) was replace |--|--| | Severity | Low | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:53`, `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:240` | **Description** @@ -427,7 +427,16 @@ malformed site record cannot abort the whole refresh and leave a half-updated ca **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending). Root cause confirmed: `_siteClients` was a +non-`readonly` field, and `HandleSiteAddressCacheLoaded`'s add/update loop called +`ActorPath.Parse` per site with no guard — a malformed `NodeAAddress` threw mid-loop and +aborted the refresh, leaving the cache half-updated until the next 60s cycle. Fix: +`_siteClients` is now `readonly`, and the per-site `ActorPath.Parse` is wrapped in a +try/catch that logs the bad address at Warning and `continue`s to the next site, so a +single garbage row cannot starve other sites of their ClusterClient. Regression test +`CentralCommunicationActorTests.MalformedSiteAddress_DoesNotAbortRefresh_OtherSitesStillRegistered` +(bad site ordered before a good one) fails against the pre-fix code (good site never +registered) and passes after. ### Communication-010 — `DebugStreamBridgeActor` XML doc incorrectly describes it as a "Persistent actor" @@ -435,7 +444,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Documentation & comments | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:10` | **Description** @@ -453,7 +462,13 @@ side..." or similar, removing the word "Persistent". **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending). Root cause confirmed: the class summary opened +with "Persistent actor (one per active debug session)..." but the actor derives from +`ReceiveActor`, holds no `PersistenceId`, and writes no journal/snapshot. Fix +(documentation only — no behaviour change, so no regression test): the summary was +reworded to "Long-lived (one per active debug session) actor on the central side. Debug +sessions are session-based and temporary — this actor holds no persisted state and does +not derive from an Akka.Persistence base class; its state does not survive a restart." ### Communication-011 — No test coverage for snapshot-timeout cleanup, address-cache failure, or gRPC reconnect leak @@ -461,7 +476,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Testing coverage | -| Status | Open | +| Status | Resolved | | Location | `tests/ScadaLink.Communication.Tests/` (module-wide) | **Description** @@ -487,4 +502,20 @@ failure logging and empty-cache behaviour; reusing a correlation ID across **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit pending). This is a meta-coverage finding; every gap it +enumerates is now covered by a regression test (each fails against its pre-fix code and +passes after): +- Snapshot timeout / pre-snapshot termination (Communication-001) — + `DebugStreamServiceTests.StartStreamAsync_StreamTerminatesBeforeSnapshot_ThrowsMeaningfulException`. +- gRPC reconnect not unsubscribing the old node (Communication-002) — + `DebugStreamBridgeActorTests.On_GrpcError_Unsubscribes_Old_Stream_Before_Reconnect`. +- `SiteStreamGrpcClient` subscription-map overwrite/removal race (Communication-003) — + `SiteStreamGrpcClientTests.RegisterSubscription_ReusedCorrelationId_CancelsAndDisposesPriorCts` + and `RemoveSubscription_OnlyRemovesOwnCts_NotAReplacement`. +- `LoadSiteAddressesFromDb` fault (Communication-006) — + `CentralCommunicationActorTests.LoadSiteAddressesFailure_IsLoggedNotSilentlySwallowed`. +- Malformed `NodeAAddress` aborting `HandleSiteAddressCacheLoaded` (Communication-009) — + `CentralCommunicationActorTests.MalformedSiteAddress_DoesNotAbortRefresh_OtherSitesStillRegistered` + (added with this finding's resolution). +The full module suite (`dotnet test tests/ScadaLink.Communication.Tests`) is green at +111 passing tests. diff --git a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs index 5c8e0e7..e00d9e4 100644 --- a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs +++ b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs @@ -50,7 +50,7 @@ public class CentralCommunicationActor : ReceiveActor /// Maps SiteIdentifier → (ClusterClient actor, set of contact address strings). /// Refreshed periodically via RefreshSiteAddresses. /// - private Dictionary ContactAddresses)> _siteClients = new(); + private readonly Dictionary ContactAddresses)> _siteClients = new(); /// /// Tracks active debug view subscriptions: correlationId → (siteId, subscriber). @@ -262,9 +262,23 @@ public class CentralCommunicationActor : ReceiveActor // Add or update foreach (var (siteId, addresses) in msg.SiteContacts) { - var contactPaths = addresses - .Select(a => ActorPath.Parse($"{a}/system/receptionist")) - .ToImmutableHashSet(); + // Communication-009: parse all addresses up front inside a try/catch so a + // single malformed site row cannot abort the whole refresh loop and leave + // the cache half-updated. A bad site is logged and skipped; others proceed. + ImmutableHashSet contactPaths; + try + { + contactPaths = addresses + .Select(a => ActorPath.Parse($"{a}/system/receptionist")) + .ToImmutableHashSet(); + } + catch (Exception ex) + { + _log.Warning(ex, + "Malformed contact address for site {0}; skipping this site in the refresh " + + "(other sites are unaffected)", siteId); + continue; + } var contactStrings = addresses.ToImmutableHashSet(); diff --git a/src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs b/src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs index e2d7d9a..6c104e7 100644 --- a/src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs +++ b/src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs @@ -7,7 +7,9 @@ using ScadaLink.Communication.Grpc; namespace ScadaLink.Communication.Actors; /// -/// Persistent actor (one per active debug session) on the central side. +/// Long-lived (one per active debug session) actor on the central side. Debug sessions +/// are session-based and temporary — this actor holds no persisted state and does not +/// derive from an Akka.Persistence base class; its state does not survive a restart. /// Sends SubscribeDebugViewRequest to the site via CentralCommunicationActor (with THIS actor /// as the Sender) to get the initial snapshot. After receiving the snapshot, opens a gRPC /// server-streaming subscription via SiteStreamGrpcClient for ongoing events. diff --git a/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs b/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs index 383895f..2a60ff5 100644 --- a/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs +++ b/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs @@ -222,6 +222,35 @@ public class CentralCommunicationActorTests : TestKit }); } + [Fact] + public void MalformedSiteAddress_DoesNotAbortRefresh_OtherSitesStillRegistered() + { + // Regression test for Communication-009. HandleSiteAddressCacheLoaded calls + // ActorPath.Parse for every site in a single loop. A malformed NodeAAddress + // throws inside that loop; before the fix the whole refresh aborted partway + // through, leaving the cache half-updated (some sites registered, others not). + // The fix wraps the parse in a try/catch that logs and skips the bad site so + // a single garbage row cannot starve every other site of its ClusterClient. + var goodSite = CreateSite("good-site", "akka.tcp://scadalink@host1:8082"); + // A garbage address that ActorPath.Parse rejects. + var badSite = CreateSite("bad-site", "this is not a valid actor path !!!"); + + // Order the bad site first so a non-resilient loop aborts before reaching good-site. + var (actor, _, siteProbes) = CreateActorWithMockRepo(new[] { badSite, goodSite }); + + Thread.Sleep(1000); + + // good-site must still be registered and routable despite bad-site failing to parse. + var cmd = new DeployInstanceCommand( + "dep1", "inst1", "hash1", "{}", "admin", DateTimeOffset.UtcNow); + actor.Tell(new SiteEnvelope("good-site", cmd)); + + Assert.True(siteProbes.ContainsKey("good-site"), + "good-site should have a ClusterClient even though bad-site's address is malformed"); + var msg = siteProbes["good-site"].ExpectMsg(); + Assert.Equal("dep1", ((DeployInstanceCommand)msg.Message).DeploymentId); + } + [Fact] public void BothContactPoints_UsedInSingleClient() {