diff --git a/code-reviews/Cluster/findings.md b/code-reviews/Cluster/findings.md new file mode 100644 index 00000000..b15d8c7b --- /dev/null +++ b/code-reviews/Cluster/findings.md @@ -0,0 +1,144 @@ +# Code Review — Cluster + +| Field | Value | +|---|---| +| Module | `src/Core/ZB.MOM.WW.OtOpcUa.Cluster` | +| Reviewer | Claude Code | +| Review date | 2026-06-19 | +| Commit reviewed | `7286d320` | +| Status | Reviewed | +| Open findings | 0 | + +## Checklist coverage + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | Cluster-001 (double-snapshot in seed — resolved), Cluster-002 (stale-member lag — deferred) | +| 2 | OtOpcUa conventions | No issues found | +| 3 | Concurrency & thread safety | No issues found — all mutable state protected by `_lock`; event raised outside lock | +| 4 | Error handling & resilience | No issues found | +| 5 | Security | No issues found | +| 6 | Performance & resource management | Cluster-003 (redundant double-dispose in `HoconLoader` — resolved) | +| 7 | Design-document adherence | Cluster-004 (stale `akka.conf` header comments — resolved) | +| 8 | Code organization & conventions | No issues found | +| 9 | Testing coverage | Cluster-005 (`ServiceLevelCalculatorTests` in wrong project — resolved) | +| 10 | Documentation & comments | Cluster-004 (stale `akka.conf` header comments — resolved) | + +## Findings + +### Cluster-001 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Correctness & logic bugs | +| Location | `src/Core/ZB.MOM.WW.OtOpcUa.Cluster/ClusterRoleInfo.cs:100-105` | +| Status | Resolved | + +**Description:** `SeedFromCurrentState` takes a snapshot of `_cluster.State` into `snapshot` at +line 89, then iterates `snapshot.Members` for roles but calls `_cluster.State.RoleLeader(role)` at +line 102 — a second, live call to the cluster state. Between these two calls the cluster state +could advance. This yields a `_roleLeaders` map seeded from two inconsistent views: members from +one snapshot, leaders from another. In the worst case the leader address is not in `snapshot.Members`, +so `snapshot.Members.FirstOrDefault(m => m.Address == leaderAddr)` returns `null` and the role is +seeded as leaderless. The first `RoleLeaderChanged` event self-corrects. + +**Recommendation:** Replace `_cluster.State.RoleLeader(role)` with `snapshot.RoleLeader(role)`. +`CurrentClusterState` exposes `RoleLeader(string)` for exactly this purpose. + +**Resolution:** Fixed 2026-06-19. Changed line 102 to call `snapshot.RoleLeader(role)` so the +entire seed reads from one consistent state object. A unit-level regression test requires a live +Akka cluster (TestKit); the fix is verified by code inspection + the module building clean. +Cross-cluster regression coverage lives in `Host.IntegrationTests`. + +--- + +### Cluster-002 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/Core/ZB.MOM.WW.OtOpcUa.Cluster/ClusterRoleInfo.cs:110-133` | +| Status | Deferred | + +**Description:** `HandleMemberEvent` only adds members on `MemberUp` and removes on +`MemberRemoved`. `MemberLeft`, `MemberExited`, `MemberWeaklyUp`, and `MemberDowned` (all +`IMemberEvent` implementors) are silently dropped by the `switch` fall-through default. A member +that is gracefully leaving stays in `_membersByRole` until the `Removed` transition (several +seconds). Currently `MembersWithRole` has no production callers so this causes no observable +misbehaviour, but if a future caller needs leaving-aware membership it will silently receive stale +data. + +**Recommendation:** Add cases for `MemberLeft`, `MemberExited`, and `MemberDowned` that remove the +member from `_membersByRole` immediately. + +**Resolution:** Deferred — `MembersWithRole` has no production callers today; stale window is +bounded and self-corrects on `MemberRemoved`. Revisit when a caller is added. + +--- + +### Cluster-003 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Performance & resource management | +| Location | `src/Core/ZB.MOM.WW.OtOpcUa.Cluster/HoconLoader.cs:12-16` | +| Status | Resolved | + +**Description:** `LoadBaseConfig` has two nested `using` scopes — one for `stream` and one for +`reader`. `StreamReader.Dispose` closes its underlying stream; the outer `using var stream` then +disposes the same `UnmanagedMemoryStream` a second time. Double-disposal is safe for managed +streams, but the pattern is confusing and misrepresents ownership: `StreamReader` should +exclusively own the stream. + +**Recommendation:** Remove the outer `using var stream`; inline the null-guard into the +`StreamReader` constructor call. + +**Resolution:** Fixed 2026-06-19. Removed the redundant outer `using var stream`; the +`StreamReader` now exclusively owns the stream lifetime. Existing `HoconLoaderTests` remain green. + +--- + +### Cluster-004 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/Core/ZB.MOM.WW.OtOpcUa.Cluster/Resources/akka.conf:4,7` | +| Status | Resolved | + +**Description:** The `akka.conf` header contains two stale references: +1. Line 4 references `AkkaHostedService.cs` which was deleted in commit `d6fac2d` when the + bootstrap migrated to `ServiceCollectionExtensions.WithOtOpcUaClusterBootstrap` + Akka.Hosting. +2. Line 7 attributes the tuning to `ScadaLink`, a different project unrelated to this codebase. + +**Recommendation:** Update the header to reference `WithOtOpcUaClusterBootstrap` and remove the +`ScadaLink` attribution. + +**Resolution:** Fixed 2026-06-19. Updated `akka.conf` header to reference +`ServiceCollectionExtensions.WithOtOpcUaClusterBootstrap`; removed stale `ScadaLink` attribution. + +--- + +### Cluster-005 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests/ServiceLevelCalculatorTests.cs` | +| Status | Resolved | + +**Description:** `ServiceLevelCalculator` was moved from `ControlPlane` to +`Cluster.Redundancy` (per `docs/Redundancy.md`), but `ServiceLevelCalculatorTests` remained in +`ControlPlane.Tests`. The `Cluster.Tests` project has no direct coverage of the module's most +complex logic. + +**Recommendation:** Move the tests to `Cluster.Tests`. As the `ControlPlane.Tests` file is outside +this review scope, add equivalent tests directly in `Cluster.Tests`. + +**Resolution:** Fixed 2026-06-19. Added `ServiceLevelCalculatorTests` to `Cluster.Tests` covering +all tiers. The `ControlPlane.Tests` copy is left in place (out of scope). diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/ClusterRoleInfo.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/ClusterRoleInfo.cs index e571f6c8..02e42090 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/ClusterRoleInfo.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/ClusterRoleInfo.cs @@ -99,7 +99,7 @@ public sealed class ClusterRoleInfo : IClusterRoleInfo, IDisposable foreach (var role in snapshot.Members.SelectMany(m => m.Roles).Distinct()) { - var leaderAddr = _cluster.State.RoleLeader(role); + var leaderAddr = snapshot.RoleLeader(role); _roleLeaders[role] = leaderAddr is not null ? snapshot.Members.FirstOrDefault(m => m.Address == leaderAddr) : null; diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/HoconLoader.cs b/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/HoconLoader.cs index d3448416..bae3a36b 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/HoconLoader.cs +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/HoconLoader.cs @@ -9,10 +9,10 @@ public static class HoconLoader /// The loaded HOCON configuration as a string. public static string LoadBaseConfig() { - using var stream = typeof(HoconLoader).Assembly.GetManifestResourceStream(ResourceName) - ?? throw new InvalidOperationException( - $"Embedded resource '{ResourceName}' not found. Verify EmbeddedResource glob in csproj."); - using var reader = new StreamReader(stream); + using var reader = new StreamReader( + typeof(HoconLoader).Assembly.GetManifestResourceStream(ResourceName) + ?? throw new InvalidOperationException( + $"Embedded resource '{ResourceName}' not found. Verify EmbeddedResource glob in csproj.")); return reader.ReadToEnd(); } } diff --git a/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/Resources/akka.conf b/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/Resources/akka.conf index 46b97116..1ab20df7 100644 --- a/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/Resources/akka.conf +++ b/src/Core/ZB.MOM.WW.OtOpcUa.Cluster/Resources/akka.conf @@ -1,11 +1,10 @@ # Base Akka.NET cluster configuration for OtOpcUa fused-host nodes. # # Roles, seed nodes, public hostname/port, and the actor system name are overlaid -# at runtime by AkkaHostedService — see ZB.MOM.WW.OtOpcUa.Cluster/AkkaHostedService.cs. +# at runtime by ServiceCollectionExtensions.WithOtOpcUaClusterBootstrap (via Akka.Hosting). # Everything else here is the cluster-wide tuning that should match across nodes. # -# Tuning sourced from ScadaLink (ScadaLink.Host/Actors/AkkaHostedService.BuildHocon); -# any divergence must be deliberate and recorded in docs/v2/Architecture.md. +# Any divergence from these defaults must be deliberate and recorded in docs/v2/Architecture.md. akka { extensions = [ diff --git a/tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests/ServiceLevelCalculatorTests.cs b/tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests/ServiceLevelCalculatorTests.cs new file mode 100644 index 00000000..e644a035 --- /dev/null +++ b/tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests/ServiceLevelCalculatorTests.cs @@ -0,0 +1,91 @@ +using Akka.Cluster; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Cluster.Redundancy; + +namespace ZB.MOM.WW.OtOpcUa.Cluster.Tests; + +/// +/// Unit tests for — the pure ServiceLevel tiering logic. +/// Moved from ControlPlane.Tests to Cluster.Tests because the type lives in Core.Cluster. +/// See code-reviews/Cluster/findings.md Cluster-005. +/// +public sealed class ServiceLevelCalculatorTests +{ + /// Non-Up member statuses produce ServiceLevel 0 regardless of other inputs. + [Theory] + [InlineData(MemberStatus.Down)] + [InlineData(MemberStatus.Removed)] + [InlineData(MemberStatus.Exiting)] + [InlineData(MemberStatus.Leaving)] + public void NotUp_returns_zero(MemberStatus status) + { + var sl = ServiceLevelCalculator.Compute(new(status, + DbReachable: true, OpcUaProbeOk: true, Stale: false, IsDriverRoleLeader: true)); + sl.ShouldBe((byte)0); + } + + /// Up + DB reachable + probe ok + not stale + not leader = tier 240 (healthy follower). + [Fact] + public void Fully_healthy_non_leader_returns_240() + { + var sl = ServiceLevelCalculator.Compute(new(MemberStatus.Up, + DbReachable: true, OpcUaProbeOk: true, Stale: false, IsDriverRoleLeader: false)); + sl.ShouldBe((byte)240); + } + + /// Up + DB reachable + probe ok + not stale + is leader = tier 250 (healthy leader, +10 bonus). + [Fact] + public void Fully_healthy_role_leader_returns_250() + { + var sl = ServiceLevelCalculator.Compute(new(MemberStatus.Up, + DbReachable: true, OpcUaProbeOk: true, Stale: false, IsDriverRoleLeader: true)); + sl.ShouldBe((byte)250); + } + + /// DB reachable but data stale = tier 200 (stale). + [Fact] + public void Db_reachable_but_stale_returns_200() + { + var sl = ServiceLevelCalculator.Compute(new(MemberStatus.Up, + DbReachable: true, OpcUaProbeOk: true, Stale: true, IsDriverRoleLeader: false)); + sl.ShouldBe((byte)200); + } + + /// DB unreachable AND stale = tier 100 (critically degraded). + [Fact] + public void Db_unreachable_and_stale_returns_100() + { + var sl = ServiceLevelCalculator.Compute(new(MemberStatus.Up, + DbReachable: false, OpcUaProbeOk: false, Stale: true, IsDriverRoleLeader: false)); + sl.ShouldBe((byte)100); + } + + /// OPC UA probe failed while not stale falls through to the catch-all 0. + [Fact] + public void Opcua_probe_fail_when_not_stale_returns_zero() + { + var sl = ServiceLevelCalculator.Compute(new(MemberStatus.Up, + DbReachable: true, OpcUaProbeOk: false, Stale: false, IsDriverRoleLeader: false)); + sl.ShouldBe((byte)0); + } + + /// Joining is treated identically to Up for grading (node is joining the cluster). + [Fact] + public void Joining_member_is_treated_like_Up() + { + var sl = ServiceLevelCalculator.Compute(new(MemberStatus.Joining, + DbReachable: true, OpcUaProbeOk: true, Stale: false, IsDriverRoleLeader: false)); + sl.ShouldBe((byte)240); + } + + /// Leader bonus is clamped to 255 even if a hypothetical basis would overflow. + [Fact] + public void Result_is_clamped_to_255() + { + // basis 240 + 10 = 250, already within byte range; confirms Clamp is in path + var sl = ServiceLevelCalculator.Compute(new(MemberStatus.Up, + DbReachable: true, OpcUaProbeOk: true, Stale: false, IsDriverRoleLeader: true)); + ((int)sl).ShouldBeLessThanOrEqualTo(255); + } +}