fix(cluster-infrastructure): resolve ClusterInfrastructure-002..006 — options validation, DI registration, down-if-alone

This commit is contained in:
Joseph Doherty
2026-05-16 20:58:03 -04:00
parent 71b90ba499
commit dba1a1b25f
8 changed files with 441 additions and 12 deletions

View File

@@ -1,11 +1,75 @@
namespace ScadaLink.ClusterInfrastructure;
/// <summary>
/// Strongly-typed options for the <c>ScadaLink:Cluster</c> section of
/// <c>appsettings.json</c>, populated through the Options pattern.
/// <para>
/// Only the cluster <em>configuration contract</em> is defined here. Turning
/// these values into HOCON, starting the <c>ActorSystem</c>, configuring the
/// split-brain resolver and wiring <c>CoordinatedShutdown</c> is the job of
/// <c>ScadaLink.Host</c>
/// (see <c>Component-ClusterInfrastructure.md</c> → "Implementation Note — Code Placement").
/// </para>
/// <para>
/// Per-node identity (remoting hostname/port, cluster role, site identifier,
/// gRPC port) is intentionally absent: it belongs to
/// <c>ScadaLink.Host.NodeOptions</c> (<c>ScadaLink:Node</c> section). Local SQLite
/// storage paths belong to the database / store-and-forward options. What remains
/// here are the cluster-formation and failure-detection settings common to every node.
/// </para>
/// </summary>
public class ClusterOptions
{
    /// <summary>
    /// Name of the configuration section these options bind from. Binding sites
    /// should reference this constant instead of repeating the literal.
    /// </summary>
    public const string SectionName = "ScadaLink:Cluster";

    /// <summary>
    /// Seed nodes of the Akka.NET cluster. Every node lists itself and its partner,
    /// so whichever node starts first can form the cluster. At least one entry is
    /// required. Defaults to an empty list.
    /// </summary>
    public List<string> SeedNodes { get; set; } = new List<string>();

    /// <summary>
    /// Name of the split-brain resolver strategy. ScadaLink's two-node clusters
    /// require <c>keep-oldest</c>: with only two nodes the quorum strategies
    /// (<c>keep-majority</c>, <c>static-quorum</c>) cannot tell a crash from a
    /// partition and would take the whole cluster down. Default <c>keep-oldest</c>.
    /// </summary>
    public string SplitBrainResolverStrategy { get; set; } = "keep-oldest";

    /// <summary>
    /// How long cluster membership has to stay stable before the split-brain
    /// resolver downs unreachable nodes. Must be positive. Default 15s.
    /// </summary>
    public TimeSpan StableAfter { get; set; } = new TimeSpan(hours: 0, minutes: 0, seconds: 15);

    /// <summary>
    /// Interval between failure-detector heartbeats exchanged by the nodes.
    /// Must be well below <see cref="FailureDetectionThreshold"/>. Default 2s.
    /// </summary>
    public TimeSpan HeartbeatInterval { get; set; } = new TimeSpan(hours: 0, minutes: 0, seconds: 2);

    /// <summary>
    /// How long a node may go without sending a heartbeat before it is treated as
    /// unreachable (Akka's <c>acceptable-heartbeat-pause</c>). Default 10s.
    /// </summary>
    public TimeSpan FailureDetectionThreshold { get; set; } = new TimeSpan(hours: 0, minutes: 0, seconds: 10);

    /// <summary>
    /// Akka's <c>min-nr-of-members</c>. Has to be <c>1</c>: only one node is left
    /// running after a failover, and a value of <c>2</c> would block the cluster
    /// singleton (Site Runtime Deployment Manager) — and with it all data
    /// collection — indefinitely. Default 1.
    /// </summary>
    public int MinNrOfMembers { get; set; } = 1;

    /// <summary>
    /// <c>down-if-alone</c> flag of the keep-oldest resolver. With <c>true</c>
    /// (what the design doc requires) the oldest node downs itself when it sees no
    /// other reachable member instead of carrying on as an isolated single-node
    /// cluster. Default <c>true</c>.
    /// </summary>
    public bool DownIfAlone { get; set; } = true;
}

View File

@@ -0,0 +1,72 @@
using Microsoft.Extensions.Options;
namespace ScadaLink.ClusterInfrastructure;
/// <summary>
/// CI-004: Validates <see cref="ClusterOptions"/>. The values it guards carry
/// cluster-wide consequences — the design doc
/// (<c>Component-ClusterInfrastructure.md</c>) is emphatic that misconfiguring
/// them produces a total cluster shutdown or an indefinitely blocked singleton.
/// Intended to run at startup so a bad <c>appsettings.json</c> fails fast at
/// boot rather than failing far from the cause.
/// </summary>
public sealed class ClusterOptionsValidator : IValidateOptions<ClusterOptions>
{
    /// <summary>
    /// Split-brain resolver strategies safe for ScadaLink's two-node clusters.
    /// Matching is case-insensitive (ordinal).
    /// </summary>
    private static readonly HashSet<string> AllowedStrategies = new(StringComparer.OrdinalIgnoreCase)
    {
        "keep-oldest"
    };

    /// <summary>
    /// Checks every cluster setting and reports all violations at once, so a
    /// single failed boot lists everything that needs fixing.
    /// </summary>
    /// <param name="name">
    /// Name of the options instance being validated. Not consulted: the same
    /// rules apply to every named instance.
    /// </param>
    /// <param name="options">The bound options to validate.</param>
    /// <returns>
    /// <see cref="ValidateOptionsResult.Success"/> when every rule holds; otherwise
    /// a failed result carrying one message per violated rule.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
    public ValidateOptionsResult Validate(string? name, ClusterOptions options)
    {
        // Fail with a clear argument error instead of a NullReferenceException
        // on the first property access below.
        ArgumentNullException.ThrowIfNull(options);

        var failures = new List<string>();

        if (options.SeedNodes is null || options.SeedNodes.Count == 0)
        {
            failures.Add("ClusterOptions.SeedNodes must contain at least one seed node.");
        }
        else if (options.SeedNodes.Exists(string.IsNullOrWhiteSpace))
        {
            // A blank entry is not a usable seed address; reject it here rather
            // than letting it surface later, far from the configuration mistake.
            failures.Add("ClusterOptions.SeedNodes must not contain null, empty or whitespace-only entries.");
        }

        if (string.IsNullOrWhiteSpace(options.SplitBrainResolverStrategy)
            || !AllowedStrategies.Contains(options.SplitBrainResolverStrategy))
        {
            failures.Add(
                $"ClusterOptions.SplitBrainResolverStrategy must be 'keep-oldest' for a two-node cluster; " +
                $"'{options.SplitBrainResolverStrategy}' would risk a total cluster shutdown on a partition.");
        }

        if (options.MinNrOfMembers != 1)
        {
            failures.Add(
                $"ClusterOptions.MinNrOfMembers must be 1 (was {options.MinNrOfMembers}); " +
                "any other value blocks the cluster singleton after failover and halts all data collection.");
        }

        if (options.StableAfter <= TimeSpan.Zero)
        {
            failures.Add("ClusterOptions.StableAfter must be a positive duration.");
        }

        if (options.HeartbeatInterval <= TimeSpan.Zero)
        {
            failures.Add("ClusterOptions.HeartbeatInterval must be a positive duration.");
        }

        if (options.FailureDetectionThreshold <= TimeSpan.Zero)
        {
            failures.Add("ClusterOptions.FailureDetectionThreshold must be a positive duration.");
        }

        // Evaluated independently of the positivity checks above, so a
        // non-positive pair can be reported by both rules.
        if (options.HeartbeatInterval >= options.FailureDetectionThreshold)
        {
            failures.Add(
                $"ClusterOptions.HeartbeatInterval ({options.HeartbeatInterval}) must be well below " +
                $"FailureDetectionThreshold ({options.FailureDetectionThreshold}); otherwise nodes are " +
                "declared unreachable before a heartbeat can arrive.");
        }

        return failures.Count > 0
            ? ValidateOptionsResult.Fail(failures)
            : ValidateOptionsResult.Success;
    }
}

View File

@@ -1,18 +1,47 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Options;
namespace ScadaLink.ClusterInfrastructure;
/// <summary>
/// DI registration for the Cluster Infrastructure component.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers the Cluster Infrastructure services. This component owns the
    /// cluster <em>configuration contract</em> (<see cref="ClusterOptions"/>); the
    /// Akka.NET bootstrap itself lives in <c>ScadaLink.Host</c>
    /// (see <c>Component-ClusterInfrastructure.md</c>).
    /// <para>
    /// Registering the <see cref="ClusterOptionsValidator"/> means a misconfigured
    /// <c>ScadaLink:Cluster</c> section (e.g. <c>MinNrOfMembers: 2</c> or a quorum
    /// split-brain strategy) throws an <see cref="OptionsValidationException"/>
    /// when the options are validated — on first resolution of
    /// <see cref="IOptions{TOptions}"/>, or at startup if the host opts into eager
    /// validation — rather than booting into a broken cluster.
    /// </para>
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddClusterInfrastructure(this IServiceCollection services)
    {
        // TryAddEnumerable keeps this idempotent: calling the method more than
        // once does not register the validator a second time.
        services.TryAddEnumerable(
            ServiceDescriptor.Singleton<IValidateOptions<ClusterOptions>, ClusterOptionsValidator>());

        return services;
    }

    /// <summary>
    /// Reserved for cluster-infrastructure actor registration. This component does
    /// not register any actors — the Akka.NET bootstrap and actor wiring live in
    /// <c>ScadaLink.Host</c>. The method throws rather than silently returning
    /// success so that any caller assuming this component registers actors fails
    /// fast with a clear cause instead of failing later, far from here.
    /// </summary>
    /// <param name="services">Unused; present only to keep the extension-method shape.</param>
    /// <returns>Never returns.</returns>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public static IServiceCollection AddClusterInfrastructureActors(this IServiceCollection services)
    {
        // No early return here: returning the collection would make the throw
        // unreachable and silently break the documented fail-fast contract.
        throw new NotImplementedException(
            "ScadaLink.ClusterInfrastructure registers no actors. The Akka.NET actor system " +
            "bootstrap and all cluster actor registration live in ScadaLink.Host " +
            "(AkkaHostedService). Do not call AddClusterInfrastructureActors().");
    }
}