fix(host): resolve Host-012..015 — consume DownIfAlone in HOCON, sub-second timing precision, config-driven Serilog sinks, transient-only startup retry

This commit is contained in:
Joseph Doherty
2026-05-17 03:18:33 -04:00
parent eae4077414
commit aca65e85bb
9 changed files with 395 additions and 33 deletions

View File

@@ -9,6 +9,12 @@ namespace ScadaLink.Host;
/// unreachable. Rather than crashing the process on the first connection failure,
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
/// short outage and only fails fatally once attempts are exhausted.
///
/// Host-015: only <em>transient</em> faults are retried. The optional
/// <c>isTransient</c> predicate classifies each exception; a permanent failure
/// (e.g. a database schema-version mismatch — which no amount of waiting can fix)
/// is rethrown immediately rather than being retried for minutes before the
/// inevitable fatal exit.
/// </summary>
public static class StartupRetry
{
@@ -18,8 +24,13 @@ public static class StartupRetry
int maxAttempts,
TimeSpan initialDelay,
ILogger logger,
Func<Exception, bool>? isTransient = null,
CancellationToken cancellationToken = default)
{
// Default: treat every exception as transient (preserves the pre-Host-015
// behaviour for callers that do not classify faults).
isTransient ??= static _ => true;
var delay = initialDelay;
for (var attempt = 1; ; attempt++)
{
@@ -33,10 +44,10 @@ public static class StartupRetry
operationName, attempt);
return;
}
catch (Exception ex) when (attempt < maxAttempts)
catch (Exception ex) when (attempt < maxAttempts && isTransient(ex))
{
logger.LogWarning(ex,
"Startup operation '{Operation}' failed on attempt {Attempt}/{MaxAttempts}; " +
"Startup operation '{Operation}' failed (transient) on attempt {Attempt}/{MaxAttempts}; " +
"retrying in {Delay}.",
operationName, attempt, maxAttempts, delay);
await Task.Delay(delay, cancellationToken);
@@ -45,4 +56,39 @@ public static class StartupRetry
}
}
}
/// <summary>
/// Transient-fault classifier for the database-migration startup step (Host-015).
/// Returns <c>true</c> only for connection-class faults that a brief wait can
/// resolve — a timeout, a socket/network error, or a database-provider
/// exception — and <c>false</c> for everything else (notably schema-validation
/// <see cref="InvalidOperationException"/>s raised by
/// <c>MigrationHelper.ApplyOrValidateMigrationsAsync</c>, which are permanent
/// and must fail fast).
/// </summary>
/// <param name="ex">The exception thrown by the startup operation.</param>
/// <returns>
/// <c>true</c> when the fault is considered transient and worth retrying;
/// otherwise <c>false</c>.
/// </returns>
public static bool IsTransientDatabaseFault(Exception ex)
{
    // Unwrap every layer of aggregation so a faulted Task (including a nested
    // AggregateException) surfaces its underlying fault. As before, only the
    // first inner exception of each aggregate is inspected.
    while (ex is AggregateException { InnerException: { } inner })
    {
        ex = inner;
    }

    // Timeouts, socket/network errors raised while opening the connection, and
    // any provider exception deriving from the BCL's DbException base class.
    // DbException lives in System.Data.Common, so this check keeps the Host free
    // of a direct SqlClient package reference while still honouring inheritance
    // (a suffix match on the type name alone misses providers whose exception
    // type is not literally named "...DbException").
    if (ex is TimeoutException
        or System.Net.Sockets.SocketException
        or System.Data.Common.DbException)
    {
        return true;
    }

    // Name-based fallback retained for backward compatibility: a type whose
    // name ends in "SqlException"/"DbException" is still treated as transient
    // even if it does not derive from DbException. A SqlException at the
    // migration stage is, in practice, a connection failure (the server is not
    // yet reachable) rather than a schema fault — schema mismatches surface as
    // InvalidOperationException from the migration helper.
    var typeName = ex.GetType().FullName;
    return typeName is not null &&
        (typeName.EndsWith("SqlException", StringComparison.Ordinal) ||
         typeName.EndsWith("DbException", StringComparison.Ordinal));
}
}