95 lines
4.1 KiB
C#
95 lines
4.1 KiB
C#
namespace ScadaLink.Host;
|
|
|
|
/// <summary>
/// Bounded retry-with-backoff for startup preconditions.
/// <para>
/// Host-010 / REQ-HOST-4a: a Central node applies/validates database migrations
/// before the host begins serving traffic. In container orchestration the database
/// and the app frequently start together, so the database may be briefly
/// unreachable. Rather than crashing the process on the first connection failure,
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
/// short outage and only fails fatally once attempts are exhausted.
/// </para>
/// <para>
/// Host-015: only <em>transient</em> faults are retried. The optional
/// <c>isTransient</c> predicate classifies each exception; a permanent failure
/// (e.g. a database schema-version mismatch — which no amount of waiting can fix)
/// is rethrown immediately rather than being retried for minutes before the
/// inevitable fatal exit.
/// </para>
/// </summary>
public static class StartupRetry
{
    /// <summary>Ceiling applied to the backoff delay every time it is doubled.</summary>
    private static readonly TimeSpan MaxBackoffDelay = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Runs <paramref name="operation"/> until it succeeds, retrying transient
    /// failures with an exponentially growing delay between attempts.
    /// </summary>
    /// <param name="operationName">Label used only in the log messages.</param>
    /// <param name="operation">The startup step to execute; invoked once per attempt.</param>
    /// <param name="maxAttempts">
    /// Total number of attempts allowed. A failure on the last attempt propagates to
    /// the caller; a value below 2 therefore means "run once, never retry".
    /// </param>
    /// <param name="initialDelay">
    /// Wait before the second attempt. It is used as given for the first wait, then
    /// doubled after each retry and capped at 30 seconds.
    /// </param>
    /// <param name="logger">Receives a warning per retried failure and an informational
    /// message when the operation succeeds after at least one retry.</param>
    /// <param name="isTransient">
    /// Classifies a failure as retryable. When <c>null</c>, every exception is treated
    /// as transient.
    /// </param>
    /// <param name="cancellationToken">Aborts the loop before an attempt or during a wait.</param>
    /// <returns>A task that completes once <paramref name="operation"/> has succeeded.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="operation"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled.
    /// </exception>
    public static async Task ExecuteWithRetryAsync(
        string operationName,
        Func<Task> operation,
        int maxAttempts,
        TimeSpan initialDelay,
        ILogger logger,
        Func<Exception, bool>? isTransient = null,
        CancellationToken cancellationToken = default)
    {
        // Fail up front: a null logger would otherwise only blow up inside the catch
        // block, replacing the real startup failure with a NullReferenceException.
        ArgumentNullException.ThrowIfNull(operation);
        ArgumentNullException.ThrowIfNull(logger);

        // Default: treat every exception as transient (preserves the pre-Host-015
        // behaviour for callers that do not classify faults).
        isTransient ??= static _ => true;

        var delay = initialDelay;
        for (var attempt = 1; ; attempt++)
        {
            cancellationToken.ThrowIfCancellationRequested();
            try
            {
                await operation();
                if (attempt > 1)
                {
                    logger.LogInformation(
                        "Startup operation '{Operation}' succeeded on attempt {Attempt}.",
                        operationName, attempt);
                }

                return;
            }
            catch (Exception ex) when (
                attempt < maxAttempts
                && !IsCallerCancellation(ex, cancellationToken)
                && isTransient(ex))
            {
                logger.LogWarning(ex,
                    "Startup operation '{Operation}' failed (transient) on attempt {Attempt}/{MaxAttempts}; " +
                    "retrying in {Delay}.",
                    operationName, attempt, maxAttempts, delay);
                await Task.Delay(delay, cancellationToken);

                // Exponential backoff, capped so the total wait stays bounded.
                delay = DoubleWithCap(delay);
            }
        }
    }

    /// <summary>
    /// Transient-fault classifier for the database-migration startup step (Host-015).
    /// Returns <c>true</c> only for connection-class faults that a brief wait can
    /// resolve — a SQL connection/transport error or a timeout — and <c>false</c>
    /// for everything else (notably schema-validation <see cref="InvalidOperationException"/>s
    /// raised by <c>MigrationHelper.ApplyOrValidateMigrationsAsync</c>, which are
    /// permanent and must fail fast).
    /// </summary>
    /// <param name="ex">The failure to classify.</param>
    /// <returns>
    /// <c>true</c> when the fault is worth retrying. An <see cref="AggregateException"/>
    /// is transient only when it contains at least one fault and every fault it
    /// contains (at any nesting depth) is transient.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="ex"/> is <c>null</c>.</exception>
    public static bool IsTransientDatabaseFault(Exception ex)
    {
        ArgumentNullException.ThrowIfNull(ex);

        if (ex is not AggregateException aggregate)
        {
            return IsTransientLeaf(ex);
        }

        // Flatten so nested aggregates are seen through, and require every fault to
        // be transient: a single permanent fault means waiting cannot help, and the
        // verdict must not depend on the order of the inner exceptions.
        var faults = aggregate.Flatten().InnerExceptions;
        if (faults.Count == 0)
        {
            return false;
        }

        foreach (var fault in faults)
        {
            if (!IsTransientLeaf(fault))
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>True when <paramref name="ex"/> is a cancellation and the caller's token is cancelled.</summary>
    private static bool IsCallerCancellation(Exception ex, CancellationToken cancellationToken) =>
        ex is OperationCanceledException && cancellationToken.IsCancellationRequested;

    /// <summary>Doubles <paramref name="current"/>, never exceeding <see cref="MaxBackoffDelay"/>.</summary>
    private static TimeSpan DoubleWithCap(TimeSpan current)
    {
        // Compare against half the cap *before* multiplying so an extreme delay
        // cannot overflow the tick arithmetic.
        var halfCapTicks = MaxBackoffDelay.Ticks / 2;
        return current.Ticks >= halfCapTicks
            ? MaxBackoffDelay
            : TimeSpan.FromTicks(current.Ticks * 2);
    }

    /// <summary>Classifies a single, non-aggregate exception.</summary>
    private static bool IsTransientLeaf(Exception fault)
    {
        // Timeouts, socket/network errors raised while opening the connection, and
        // ADO.NET provider errors. DbException lives in the base class library, so
        // testing for it keeps the Host free of a direct SqlClient package reference;
        // provider exceptions such as Microsoft.Data.SqlClient's SqlException derive
        // from it. A provider error at the migration stage is, in practice, a
        // connection failure (the server is not yet reachable) rather than a schema
        // fault — schema mismatches surface as InvalidOperationException from the
        // migration helper.
        if (fault is TimeoutException
            or System.Net.Sockets.SocketException
            or System.Data.Common.DbException)
        {
            return true;
        }

        // Name-based fallback, kept for exception types that follow the naming
        // convention without deriving from DbException.
        var typeName = fault.GetType().FullName;
        return typeName is not null
            && (typeName.EndsWith("SqlException", StringComparison.Ordinal)
                || typeName.EndsWith("DbException", StringComparison.Ordinal));
    }
}
|