From 21856a4be715e381f495844814f153a5e9706269 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 17 May 2026 03:18:17 -0400 Subject: [PATCH] =?UTF-8?q?fix(cluster-infrastructure):=20resolve=20Cluste?= =?UTF-8?q?rInfrastructure-009,010=20=E2=80=94=20DownIfAlone=20consumption?= =?UTF-8?q?=20(via=20Host-012),=20validator=20enforces=20DownIfAlone=3Dtru?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ClusterInfrastructure/findings.md | 35 ++++++++++++++++--- .../ClusterOptionsValidator.cs | 9 +++++ .../ClusterOptionsValidatorTests.cs | 12 +++++++ 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/code-reviews/ClusterInfrastructure/findings.md b/code-reviews/ClusterInfrastructure/findings.md index 0b79077..cfc67d3 100644 --- a/code-reviews/ClusterInfrastructure/findings.md +++ b/code-reviews/ClusterInfrastructure/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-17 | | Reviewer | claude-agent | | Commit reviewed | `39d737e` | -| Open findings | 2 | +| Open findings | 0 | ## Summary @@ -516,7 +516,7 @@ inspection of `ServiceCollectionExtensions.cs` and |--|--| | Severity | Medium | | Category | Design-document adherence | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptions.cs:74` | **Description** @@ -563,7 +563,21 @@ controls nothing. **Resolution** -_Unresolved._ +Root cause verified against the source at commit `39d737e`: +`src/ScadaLink.Host/Actors/AkkaHostedService.cs:147` hard-codes `down-if-alone = on` +inside the `keep-oldest` block, and `BuildHocon` consumes every other `ClusterOptions` +field but never reads `clusterOptions.DownIfAlone`. The finding's facts hold. The fix +is correctly scoped to the **Host** module — the configuration property and its +validation legitimately live in `ScadaLink.ClusterInfrastructure` (this module), +and the per-CLAUDE.md ownership split (CI-001) places HOCON generation in +`ScadaLink.Host`. + +**Resolved** — by cross-reference, date 2026-05-17. No code change in this module: +`ClusterOptions.DownIfAlone` and its validation (CI-010) are correct and complete here. +The consumption fix — wiring `DownIfAlone` into `BuildHocon` so the property has runtime +effect — is tracked and fixed as **Host-012** in `code-reviews/Host/findings.md`, +addressed this session. CI-009 closes against that cross-reference. Module test suite +green (18 passed). ### ClusterInfrastructure-010 — Validator does not enforce `DownIfAlone = true` despite the design doc requiring it @@ -571,7 +585,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.ClusterInfrastructure/ClusterOptionsValidator.cs:21-71` | **Description** @@ -602,4 +616,15 @@ should be closed as resolved alongside it. **Resolution** -_Unresolved._ +Confirmed against the source: `ClusterOptionsValidator.Validate` guarded every other +safety-critical setting but performed no check on `DownIfAlone`, so `DownIfAlone: false` +passed validation cleanly. With CI-009/Host-012 wiring the property into the HOCON this +session, an unguarded `false` would now silently produce the unsafe isolated-single-node +behaviour the design doc forbids. + +**Resolved** — fixing commit `commit pending`, date 2026-05-17. Added a `DownIfAlone` +check to `ClusterOptionsValidator.Validate` that fails when the flag is `false`, with a +message explaining the isolated-single-node-cluster hazard, consistent with how the +validator already rejects quorum split-brain strategies. Developed test-first: +`ClusterOptionsValidatorTests.DownIfAloneFalse_FailsValidation` was written first, +confirmed failing, then passing after the fix. Module test suite green (18 passed). diff --git a/src/ScadaLink.ClusterInfrastructure/ClusterOptionsValidator.cs b/src/ScadaLink.ClusterInfrastructure/ClusterOptionsValidator.cs index beba205..59b0c0a 100644 --- a/src/ScadaLink.ClusterInfrastructure/ClusterOptionsValidator.cs +++ b/src/ScadaLink.ClusterInfrastructure/ClusterOptionsValidator.cs @@ -65,6 +65,15 @@ public sealed class ClusterOptionsValidator : IValidateOptions "declared unreachable before a heartbeat can arrive."); } + if (!options.DownIfAlone) + { + failures.Add( + "ClusterOptions.DownIfAlone must be true for the keep-oldest resolver " + + "(Component-ClusterInfrastructure.md → Split-Brain Resolution); with it false the " + + "oldest node can run as an isolated single-node cluster during a partition while the " + + "younger node forms its own, producing two live clusters."); + } + return failures.Count > 0 ? ValidateOptionsResult.Fail(failures) : ValidateOptionsResult.Success; diff --git a/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsValidatorTests.cs b/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsValidatorTests.cs index eda4639..616d84c 100644 --- a/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsValidatorTests.cs +++ b/tests/ScadaLink.ClusterInfrastructure.Tests/ClusterOptionsValidatorTests.cs @@ -94,6 +94,18 @@ public class ClusterOptionsValidatorTests Assert.Contains("StableAfter", result.FailureMessage); } + [Fact] + public void DownIfAloneFalse_FailsValidation() + { + var options = ValidOptions(); + options.DownIfAlone = false; + + var result = new ClusterOptionsValidator().Validate(null, options); + + Assert.True(result.Failed); + Assert.Contains("DownIfAlone", result.FailureMessage); + } + [Fact] public void Validate_AccumulatesAllFailures() {