fix(host): resolve Host-012..015 — consume DownIfAlone in HOCON, sub-second timing precision, config-driven Serilog sinks, transient-only startup retry

This commit is contained in:
Joseph Doherty
2026-05-17 03:18:33 -04:00
parent eae4077414
commit aca65e85bb
9 changed files with 395 additions and 33 deletions

View File

@@ -67,7 +67,8 @@ public class AkkaHostedService : IHostedService
// interpolated value, so a hostname, seed node or strategy containing a quote,
// backslash or whitespace cannot corrupt the configuration document.
var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles,
transportHeartbeatSec, transportFailureSec);
_communicationOptions.TransportHeartbeatInterval,
_communicationOptions.TransportFailureThreshold);
var config = ConfigurationFactory.ParseString(hocon);
_actorSystem = ActorSystem.Create("scadalink", config);
@@ -106,13 +107,21 @@ public class AkkaHostedService : IHostedService
/// routed through <see cref="QuoteHocon"/> (string values) so a hostname,
/// seed-node URI, role or split-brain strategy containing a quote, backslash or
/// whitespace cannot corrupt the document or be silently misparsed (Host-006).
///
/// Host-012: the <c>keep-oldest down-if-alone</c> flag is emitted from
/// <see cref="ClusterOptions.DownIfAlone"/> rather than hard-coded, so the bound
/// configuration value is actually consumed.
///
/// Host-013: every duration is rendered via <see cref="DurationHocon"/> in
/// milliseconds, so sub-second cluster timing values (e.g. a 750ms heartbeat) are
/// preserved exactly instead of being rounded to whole seconds.
/// </summary>
public static string BuildHocon(
NodeOptions nodeOptions,
ClusterOptions clusterOptions,
IEnumerable<string> roles,
double transportHeartbeatSec,
double transportFailureSec)
TimeSpan transportHeartbeat,
TimeSpan transportFailure)
{
var seedNodesStr = string.Join(",",
clusterOptions.SeedNodes.Select(QuoteHocon));
@@ -132,8 +141,8 @@ akka {{
port = {nodeOptions.RemotingPort}
}}
transport-failure-detector {{
heartbeat-interval = {transportHeartbeatSec:F0}s
acceptable-heartbeat-pause = {transportFailureSec:F0}s
heartbeat-interval = {DurationHocon(transportHeartbeat)}
acceptable-heartbeat-pause = {DurationHocon(transportFailure)}
}}
}}
cluster {{
@@ -142,14 +151,14 @@ akka {{
min-nr-of-members = {clusterOptions.MinNrOfMembers}
split-brain-resolver {{
active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)}
stable-after = {clusterOptions.StableAfter.TotalSeconds:F0}s
stable-after = {DurationHocon(clusterOptions.StableAfter)}
keep-oldest {{
down-if-alone = on
down-if-alone = {(clusterOptions.DownIfAlone ? "on" : "off")}
}}
}}
failure-detector {{
heartbeat-interval = {clusterOptions.HeartbeatInterval.TotalSeconds:F0}s
acceptable-heartbeat-pause = {clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s
heartbeat-interval = {DurationHocon(clusterOptions.HeartbeatInterval)}
acceptable-heartbeat-pause = {DurationHocon(clusterOptions.FailureDetectionThreshold)}
}}
run-coordinated-shutdown-when-down = on
}}
@@ -159,6 +168,18 @@ akka {{
}}";
}
/// <summary>
/// Renders a <see cref="TimeSpan"/> as a HOCON duration in whole milliseconds
/// (for example <c>750ms</c>). Akka's HOCON parser accepts a <c>ms</c> suffix, so
/// emitting milliseconds preserves sub-second configuration exactly — a 750ms
/// heartbeat stays 750ms rather than being rounded to <c>1s</c> (or, for
/// sub-half-second values, silently collapsing to a degenerate <c>0s</c>) — Host-013.
/// </summary>
/// <param name="duration">The duration to render; fractional milliseconds are rounded.</param>
/// <returns>The rounded millisecond count immediately followed by <c>ms</c>.</returns>
private static string DurationHocon(TimeSpan duration)
{
    var wholeMilliseconds = (long)Math.Round(duration.TotalMilliseconds);

    // HOCON is machine-readable: format with the invariant culture so the emitted
    // token never depends on the thread culture (for example a localized negative
    // sign on a negative span would not be parseable).
    return wholeMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture) + "ms";
}
/// <summary>
/// Renders a value as a HOCON double-quoted string, escaping backslashes and
/// double quotes so the resulting token cannot break out of its string literal.

View File

@@ -8,11 +8,13 @@ namespace ScadaLink.Host;
///
/// REQ-HOST-8 / Host-011: the configured minimum level comes from
/// <c>ScadaLink:Logging:MinimumLevel</c> (bound to <see cref="LoggingOptions"/>) so an
/// operator editing that key changes the effective log level. The standard
/// <c>Serilog</c> configuration section is still read (via
/// <see cref="Serilog.Configuration.ConfigurationLoggerConfigurationExtensions"/>)
/// for sink/override customisation; the explicit <c>MinimumLevel.Is</c> below pins
/// the floor from <see cref="LoggingOptions"/>.
/// operator editing that key changes the effective log level.
///
/// REQ-HOST-8 / Host-014: the console and file sinks are read from the standard
/// <c>Serilog</c> configuration section via <c>ReadFrom.Configuration</c> — the sink
/// set, console output template, file path and rolling interval are all
/// configuration-driven (defined in <c>appsettings.json</c>), not hard-coded. The
/// explicit <c>MinimumLevel.Is</c> below pins the floor from <see cref="LoggingOptions"/>.
/// </summary>
public static class LoggerConfigurationFactory
{

View File

@@ -40,11 +40,12 @@ var siteId = configuration["ScadaLink:Node:SiteId"] ?? "central";
// WP-14: Serilog structured logging.
// Host-011: minimum level is driven by ScadaLink:Logging:MinimumLevel (LoggingOptions).
// Host-014: console and file sinks are defined in the `Serilog` configuration
// section (appsettings.json) and applied via ReadFrom.Configuration inside the
// factory — the sink set, output template, file path and rolling interval are all
// configuration-driven per REQ-HOST-8, not hard-coded here.
Log.Logger = ScadaLink.Host.LoggerConfigurationFactory
.Build(configuration, nodeRole, siteId, nodeHostname)
.WriteTo.Console(outputTemplate:
"[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}")
.WriteTo.File("logs/scadalink-.log", rollingInterval: Serilog.RollingInterval.Day)
.CreateLogger();
try
@@ -121,6 +122,8 @@ try
// Host-010: tolerate a database that is briefly unreachable at boot
// (e.g. app and DB containers starting together) with a bounded
// exponential backoff before failing fatally.
// Host-015: only connection-class (transient) faults are retried — a
// schema-version mismatch is permanent and must fail fast on attempt 1.
await StartupRetry.ExecuteWithRetryAsync(
"database-migration",
async () =>
@@ -131,7 +134,8 @@ try
},
maxAttempts: 8,
initialDelay: TimeSpan.FromSeconds(2),
migrationLogger);
migrationLogger,
isTransient: StartupRetry.IsTransientDatabaseFault);
}
// Middleware pipeline

View File

@@ -9,6 +9,12 @@ namespace ScadaLink.Host;
/// unreachable. Rather than crashing the process on the first connection failure,
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
/// short outage and only fails fatally once attempts are exhausted.
///
/// Host-015: only <i>transient</i> faults are retried. The optional
/// <c>isTransient</c> predicate classifies each exception; a permanent failure
/// (e.g. a database schema-version mismatch — which no amount of waiting can fix)
/// is rethrown immediately rather than being retried for minutes before the
/// inevitable fatal exit.
/// </summary>
public static class StartupRetry
{
@@ -18,8 +24,13 @@ public static class StartupRetry
int maxAttempts,
TimeSpan initialDelay,
ILogger logger,
Func<Exception, bool>? isTransient = null,
CancellationToken cancellationToken = default)
{
// Default: treat every exception as transient (preserves the pre-Host-015
// behaviour for callers that do not classify faults).
isTransient ??= static _ => true;
var delay = initialDelay;
for (var attempt = 1; ; attempt++)
{
@@ -33,10 +44,10 @@ public static class StartupRetry
operationName, attempt);
return;
}
catch (Exception ex) when (attempt < maxAttempts)
catch (Exception ex) when (attempt < maxAttempts && isTransient(ex))
{
logger.LogWarning(ex,
"Startup operation '{Operation}' failed on attempt {Attempt}/{MaxAttempts}; " +
"Startup operation '{Operation}' failed (transient) on attempt {Attempt}/{MaxAttempts}; " +
"retrying in {Delay}.",
operationName, attempt, maxAttempts, delay);
await Task.Delay(delay, cancellationToken);
@@ -45,4 +56,39 @@ public static class StartupRetry
}
}
}
/// <summary>
/// Transient-fault classifier for the database-migration startup step (Host-015).
/// Returns <c>true</c> only for connection-class faults that a brief wait can
/// resolve — a timeout, a socket/network error or an ADO.NET provider error — and
/// <c>false</c> for everything else (notably schema-validation
/// <see cref="InvalidOperationException"/>s raised by
/// <c>MigrationHelper.ApplyOrValidateMigrationsAsync</c>, which are permanent and
/// must fail fast).
/// </summary>
/// <param name="ex">The exception thrown by the startup operation.</param>
/// <returns><c>true</c> when the fault should be retried; otherwise <c>false</c>.</returns>
public static bool IsTransientDatabaseFault(Exception ex)
{
    // Unwrap a single layer of aggregation so a faulted Task surfaces correctly.
    if (ex is AggregateException { InnerException: { } inner })
    {
        ex = inner;
    }

    // Timeouts, socket errors raised while opening the connection, and any ADO.NET
    // provider exception. DbException is the BCL base class provider exceptions
    // (SqlException included) derive from, so this needs no SqlClient package
    // reference and does not depend on how a provider names its exception type.
    // At the migration stage such a fault is, in practice, a connection failure
    // rather than a schema fault — schema mismatches surface as
    // InvalidOperationException from the migration helper.
    if (ex is TimeoutException
        or System.Net.Sockets.SocketException
        or System.Data.Common.DbException)
    {
        return true;
    }

    // Fallback: keep the original name-suffix match so every type that was
    // classified as transient before still is, even if it does not derive
    // from DbException.
    var typeName = ex.GetType().FullName;
    return typeName is not null
        && (typeName.EndsWith("SqlException", StringComparison.Ordinal)
            || typeName.EndsWith("DbException", StringComparison.Ordinal));
}
}

View File

@@ -3,5 +3,26 @@
"LogLevel": {
"Default": "Information"
}
},
"Serilog": {
"Using": [
"Serilog.Sinks.Console",
"Serilog.Sinks.File"
],
"WriteTo": [
{
"Name": "Console",
"Args": {
"outputTemplate": "[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}"
}
},
{
"Name": "File",
"Args": {
"path": "logs/scadalink-.log",
"rollingInterval": "Day"
}
}
]
}
}