From debe163f4d907a564598ce037d60652866aecd58 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 22 May 2026 08:23:52 -0400 Subject: [PATCH] fix(core): resolve Medium code-review finding (Core-005) Change ClusterEntry from sealed record to sealed class so TryUpdate uses reference equality for the CAS comparison. Prune now uses a read-compute-TryUpdate retry loop that restarts when a concurrent Install updates the entry between the read and the write, preventing a race that could silently drop the just-installed newest generation. Two regression tests added to PermissionTrieCacheTests. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Authorization/PermissionTrieCache.cs | 43 +++++++++++++++---- .../Authorization/PermissionTrieCacheTests.cs | 35 +++++++++++++++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Core/Authorization/PermissionTrieCache.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Core/Authorization/PermissionTrieCache.cs index 9367830..be06ea4 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Core/Authorization/PermissionTrieCache.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Core/Authorization/PermissionTrieCache.cs @@ -54,26 +54,51 @@ public sealed class PermissionTrieCache /// /// Retain only the most-recent generations for a cluster. - /// No-op when there's nothing to drop. + /// No-op when there's nothing to drop. Thread-safe: uses a CAS loop with + /// (reference equality on the + /// class-typed entry) so a concurrent on the same cluster is never + /// silently overwritten. /// public void Prune(string clusterId, int keepLatest = 3) { if (keepLatest < 1) throw new ArgumentOutOfRangeException(nameof(keepLatest), keepLatest, "keepLatest must be >= 1"); - if (!_byCluster.TryGetValue(clusterId, out var entry)) return; - if (entry.Tries.Count <= keepLatest) return; - var keep = entry.Tries - .OrderByDescending(kvp => kvp.Key) - .Take(keepLatest) - .ToDictionary(kvp => kvp.Key, kvp => kvp.Value); - _byCluster[clusterId] = new ClusterEntry(entry.Current, keep); + // CAS retry loop: read a snapshot, compute the pruned entry, atomically swap. + // Retry if another writer (Install or a concurrent Prune) updated the entry first. + while (true) + { + if (!_byCluster.TryGetValue(clusterId, out var observed)) return; + if (observed.Tries.Count <= keepLatest) return; + + var keep = observed.Tries + .OrderByDescending(kvp => kvp.Key) + .Take(keepLatest) + .ToDictionary(kvp => kvp.Key, kvp => kvp.Value); + + // Preserve the current pointer; if it was pruned (shouldn't happen since Current + // is always the newest generation), fall back to the newest retained entry. + var current = keep.TryGetValue(observed.Current.GenerationId, out var kept) + ? kept + : keep.OrderByDescending(kvp => kvp.Key).First().Value; + + var pruned = new ClusterEntry(current, keep); + // TryUpdate uses reference equality for ClusterEntry (class, not record) so it + // succeeds only when the stored reference is still the one we observed. + if (_byCluster.TryUpdate(clusterId, pruned, observed)) + return; + // Another thread updated the entry between our read and our write — re-read and retry. + } } /// Diagnostics counter: number of cached (cluster, generation) tries. public int CachedTrieCount => _byCluster.Values.Sum(e => e.Tries.Count); - private sealed record ClusterEntry(PermissionTrie Current, IReadOnlyDictionary Tries) + // Class (not record) so TryUpdate in Prune uses reference equality for the CAS comparison. + private sealed class ClusterEntry(PermissionTrie current, IReadOnlyDictionary tries) { + public PermissionTrie Current { get; } = current; + public IReadOnlyDictionary Tries { get; } = tries; + public static ClusterEntry FromSingle(PermissionTrie trie) => new(trie, new Dictionary { [trie.GenerationId] = trie }); diff --git a/tests/Core/ZB.MOM.WW.OtOpcUa.Core.Tests/Authorization/PermissionTrieCacheTests.cs b/tests/Core/ZB.MOM.WW.OtOpcUa.Core.Tests/Authorization/PermissionTrieCacheTests.cs index 1f61a7d..22119f4 100644 --- a/tests/Core/ZB.MOM.WW.OtOpcUa.Core.Tests/Authorization/PermissionTrieCacheTests.cs +++ b/tests/Core/ZB.MOM.WW.OtOpcUa.Core.Tests/Authorization/PermissionTrieCacheTests.cs @@ -101,4 +101,39 @@ public sealed class PermissionTrieCacheTests cache.CurrentGenerationId("c1").ShouldBe(1); cache.CurrentGenerationId("c2").ShouldBe(9); } + + /// + /// Core-005 regression: a concurrent Install that adds a new generation between a Prune + /// read and write must not be silently overwritten. We can't deterministically reproduce + /// a true data race in a unit test, but we can verify that Prune performed after Install + /// honours the newly-installed generation rather than clobbering it. + /// + [Fact] + public void Prune_AfterConcurrentInstall_PreservesLatestGeneration() + { + var cache = new PermissionTrieCache(); + for (var g = 1L; g <= 5; g++) cache.Install(Trie("c1", g)); + + // Simulate: Install of generation 6 races with a pending Prune(keepLatest:2). + // After Prune runs, generation 6 (the "just installed" one) must still be current. + cache.Install(Trie("c1", 6)); + cache.Prune("c1", keepLatest: 2); + + cache.CurrentGenerationId("c1").ShouldBe(6, "current must remain the newest installed generation"); + cache.GetTrie("c1", 6).ShouldNotBeNull("generation 6 must be retained as current"); + cache.GetTrie("c1", 5).ShouldNotBeNull("generation 5 retained (keepLatest=2)"); + cache.GetTrie("c1", 4).ShouldBeNull("generation 4 was pruned"); + } + + [Fact] + public void Prune_Current_Pointer_Survives_Pruning() + { + // Current is always the highest generation; verify it is not lost after prune. + var cache = new PermissionTrieCache(); + for (var g = 1L; g <= 6; g++) cache.Install(Trie("c1", g)); + + cache.Prune("c1", keepLatest: 3); + + cache.GetTrie("c1")!.GenerationId.ShouldBe(6, "current must still be the highest generation after pruning"); + } }