namespace ScadaLink.Host;

/// <summary>
/// Bounded retry-with-backoff for startup preconditions.
///
/// Host-010 / REQ-HOST-4a: a Central node applies/validates database migrations
/// before the host begins serving traffic. In container orchestration the database
/// and the app frequently start together, so the database may be briefly
/// unreachable. Rather than crashing the process on the first connection failure,
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
/// short outage and only fails fatally once attempts are exhausted.
///
/// Host-015: only transient faults are retried. The optional <c>isTransient</c>
/// predicate classifies each exception; a permanent failure (e.g. a database
/// schema-version mismatch, which no amount of waiting can fix) is rethrown
/// immediately rather than being retried for minutes before the inevitable fatal exit.
/// </summary>
public static class StartupRetry
{
    /// <summary>
    /// Ceiling applied to the backoff delay each time it is doubled, so the total
    /// wait stays bounded.
    /// </summary>
    private static readonly TimeSpan MaxBackoffDelay = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Runs <paramref name="operation"/> until it succeeds, a non-retryable fault
    /// occurs, or <paramref name="maxAttempts"/> attempts have been made.
    /// </summary>
    /// <param name="operationName">Name used for the operation in log messages.</param>
    /// <param name="operation">The startup step to execute.</param>
    /// <param name="maxAttempts">
    /// Total number of attempts. Values below 1 behave like 1: the operation runs once
    /// and any failure propagates.
    /// </param>
    /// <param name="initialDelay">
    /// Wait before the second attempt; doubled after each retry up to 30 seconds.
    /// Negative values are treated as zero (retry immediately).
    /// </param>
    /// <param name="logger">Receives a warning per retry and an info message on late success.</param>
    /// <param name="isTransient">
    /// Classifies an exception as retryable. When <c>null</c>, every exception is
    /// treated as transient.
    /// </param>
    /// <param name="cancellationToken">
    /// Checked before each attempt and observed while waiting between attempts.
    /// </param>
    /// <returns>A task that completes once the operation has succeeded.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="operation"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled.
    /// </exception>
    public static async Task ExecuteWithRetryAsync(
        string operationName,
        Func<Task> operation,
        int maxAttempts,
        TimeSpan initialDelay,
        ILogger logger,
        Func<Exception, bool>? isTransient = null,
        CancellationToken cancellationToken = default)
    {
        // Fail fast: a null logger would otherwise only blow up on the first retry.
        ArgumentNullException.ThrowIfNull(operation);
        ArgumentNullException.ThrowIfNull(logger);

        // Default: treat every exception as transient (preserves the pre-Host-015
        // behaviour for callers that do not classify faults).
        isTransient ??= static _ => true;

        // Task.Delay rejects negative delays (other than the infinite sentinel), and
        // that exception would mask the real failure from inside the catch block.
        var delay = initialDelay < TimeSpan.Zero ? TimeSpan.Zero : initialDelay;

        for (var attempt = 1; ; attempt++)
        {
            cancellationToken.ThrowIfCancellationRequested();

            try
            {
                await operation();

                if (attempt > 1)
                {
                    logger.LogInformation(
                        "Startup operation '{Operation}' succeeded on attempt {Attempt}.",
                        operationName, attempt);
                }

                return;
            }
            catch (Exception ex) when (attempt < maxAttempts
                                       && !IsCallerCancellation(ex, cancellationToken)
                                       && isTransient(ex))
            {
                logger.LogWarning(ex,
                    "Startup operation '{Operation}' failed (transient) on attempt {Attempt}/{MaxAttempts}; " +
                    "retrying in {Delay}.",
                    operationName, attempt, maxAttempts, delay);

                await Task.Delay(delay, cancellationToken);

                // Exponential backoff, capped so the total wait stays bounded.
                delay = NextDelay(delay);
            }
        }
    }

    /// <summary>
    /// Transient-fault classifier for the database-migration startup step (Host-015).
    /// Returns <c>true</c> only for connection-class faults that a brief wait can
    /// resolve (a database provider/transport error, a socket error, or a timeout)
    /// and <c>false</c> for everything else, notably the schema-validation
    /// <see cref="InvalidOperationException"/>s raised by
    /// <c>MigrationHelper.ApplyOrValidateMigrationsAsync</c>, which are permanent
    /// and must fail fast.
    /// </summary>
    /// <param name="ex">The exception to classify.</param>
    /// <returns><c>true</c> when the fault is worth retrying; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="ex"/> is <c>null</c>.</exception>
    public static bool IsTransientDatabaseFault(Exception ex)
    {
        ArgumentNullException.ThrowIfNull(ex);

        // Unwrap every layer of aggregation: faulted tasks can nest AggregateExceptions,
        // and the classification must be made on the underlying fault.
        while (ex is AggregateException { InnerException: { } inner })
        {
            ex = inner;
        }

        // Timeouts, socket/network errors raised while opening the connection, and any
        // ADO.NET provider exception. DbException lives in the base class library, so
        // this covers SqlException and other providers without a provider package
        // reference. A provider exception at the migration stage is, in practice, a
        // connection failure (the server is not yet reachable) rather than a schema
        // fault; schema mismatches surface as InvalidOperationException.
        if (ex is TimeoutException
            or System.Net.Sockets.SocketException
            or System.Data.Common.DbException)
        {
            return true;
        }

        // Fallback by type name for provider exceptions that do not derive from
        // DbException; keeps the Host free of a direct SqlClient package reference.
        var typeName = ex.GetType().FullName;
        return typeName is not null
            && (typeName.EndsWith("SqlException", StringComparison.Ordinal)
                || typeName.EndsWith("DbException", StringComparison.Ordinal));
    }

    /// <summary>
    /// True when <paramref name="ex"/> is a cancellation that coincides with the
    /// caller's token being cancelled; such a fault is never worth retrying.
    /// </summary>
    private static bool IsCallerCancellation(Exception ex, CancellationToken cancellationToken) =>
        ex is OperationCanceledException && cancellationToken.IsCancellationRequested;

    /// <summary>
    /// Doubles <paramref name="current"/>, saturating at <see cref="MaxBackoffDelay"/>.
    /// The comparison is done before multiplying so the tick count cannot overflow.
    /// </summary>
    private static TimeSpan NextDelay(TimeSpan current) =>
        current.Ticks >= MaxBackoffDelay.Ticks / 2
            ? MaxBackoffDelay
            : TimeSpan.FromTicks(current.Ticks * 2);
}