fix(db): classify non-SqlException DB outages as transient; propagate cancellation (#7)
ExecuteWriteAsync only caught SqlException, so a live outage surfacing as InvalidOperationException/SocketException/IOException/TimeoutException escaped unclassified and crashed the script actor instead of buffering. Mirror the HTTP path: propagate OperationCanceledException on cancellation, classify transport exceptions as transient (buffer+retry), let unexpected exceptions propagate.
This commit is contained in:
@@ -281,15 +281,21 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
|
||||
/// <summary>
|
||||
/// M2.3 (#7): executes a parameterised SQL write against the given connection
|
||||
/// string and classifies any <see cref="SqlException"/> into
|
||||
/// <see cref="TransientDatabaseException"/> / <see cref="PermanentDatabaseException"/>
|
||||
/// via <see cref="SqlErrorClassifier"/>. This is the single SQL-execution seam
|
||||
/// shared by the immediate <see cref="CachedWriteAsync"/> attempt and the
|
||||
/// string and classifies the outcome into
|
||||
/// <see cref="TransientDatabaseException"/> / <see cref="PermanentDatabaseException"/>,
|
||||
/// mirroring the ordered catches of
|
||||
/// <see cref="ExternalSystemClient.InvokeHttpAsync"/> on the API path:
|
||||
/// caller-requested cancellation propagates unchanged; a <see cref="SqlException"/>
|
||||
/// is classified by error number via <see cref="SqlErrorClassifier"/>; a
|
||||
/// non-<see cref="SqlException"/> transport/connection outage is classified
|
||||
/// transient via <see cref="SqlErrorClassifier.IsTransient(System.Exception)"/>;
|
||||
/// genuinely-unexpected exceptions propagate. This is the single classification
|
||||
/// seam shared by the immediate <see cref="CachedWriteAsync"/> attempt and the
|
||||
/// <see cref="DeliverBufferedAsync"/> retry path. Marked <c>internal virtual</c>
|
||||
/// so tests can substitute success / transient / permanent outcomes without a
|
||||
/// real SQL Server (and without fabricating a <see cref="SqlException"/>, which
|
||||
/// has no public constructor). Mirrors the role of
|
||||
/// <see cref="ExternalSystemClient.InvokeHttpAsync"/> on the API path.
|
||||
/// so tests can substitute already-classified outcomes; the raw I/O lives in
|
||||
/// the inner <see cref="RunSqlAsync"/> seam so tests can also drive raw outage
|
||||
/// exceptions through this classification (without fabricating a
|
||||
/// <see cref="SqlException"/>, which has no public constructor).
|
||||
/// </summary>
|
||||
/// <param name="connectionName">The human-readable connection name, used only for the classified error message (never the connection string — that would leak credentials into logs / script-visible errors).</param>
|
||||
/// <param name="connectionString">The ADO.NET connection string to write through.</param>
|
||||
@@ -297,7 +303,8 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
/// <param name="parameters">Materialised CLR parameter values (may be empty).</param>
|
||||
/// <param name="cancellationToken">Cancellation token for the write.</param>
|
||||
/// <returns>A task that completes when the write succeeds.</returns>
|
||||
/// <exception cref="TransientDatabaseException">Thrown for a transient SQL error number.</exception>
|
||||
/// <exception cref="OperationCanceledException">Rethrown unchanged when the caller's <paramref name="cancellationToken"/> requested cancellation.</exception>
|
||||
/// <exception cref="TransientDatabaseException">Thrown for a transient SQL error number or a non-Sql transport/connection outage.</exception>
|
||||
/// <exception cref="PermanentDatabaseException">Thrown for a permanent (or unknown) SQL error number.</exception>
|
||||
internal virtual async Task ExecuteWriteAsync(
|
||||
string connectionName,
|
||||
@@ -306,20 +313,28 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
IReadOnlyDictionary<string, object?> parameters,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
// M2.3 (#7) code-review fix: the catch ordering MIRRORS
|
||||
// ExternalSystemClient.InvokeHttpAsync exactly so the SQL path classifies
|
||||
// a live outage the same way the HTTP path does:
|
||||
// 1. caller-requested cancellation propagates UNCHANGED (never a "DB error");
|
||||
// 2. a SqlException is classified by error number (transient/permanent);
|
||||
// 3. a NON-SqlException transport/connection failure (InvalidOperationException
|
||||
// "connection not open", IOException, SocketException, TimeoutException,
|
||||
// a non-Sql DbException, …) is TRANSIENT — buffered + retried, because a
|
||||
// retry can succeed once the server is reachable. The pre-fix code only
|
||||
// caught SqlException, so these escaped unclassified and crashed the
|
||||
// Script Execution Actor instead of buffering;
|
||||
// 4. genuinely-unexpected exceptions (e.g. an authoring ArgumentException)
|
||||
// propagate — same as the HTTP path lets unexpected exceptions escape.
|
||||
try
|
||||
{
|
||||
await using var connection = new SqlConnection(connectionString);
|
||||
await connection.OpenAsync(cancellationToken).ConfigureAwait(false);
|
||||
using var command = connection.CreateCommand();
|
||||
command.CommandText = sql;
|
||||
foreach (var (key, value) in parameters)
|
||||
{
|
||||
var parameter = command.CreateParameter();
|
||||
parameter.ParameterName = key.StartsWith('@') ? key : "@" + key;
|
||||
parameter.Value = value ?? DBNull.Value;
|
||||
command.Parameters.Add(parameter);
|
||||
}
|
||||
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
await RunSqlAsync(connectionString, sql, parameters, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
// [2] The caller asked to abandon the work — propagate the cancellation
|
||||
// unchanged; it must never be reclassified as a transient DB error.
|
||||
throw;
|
||||
}
|
||||
catch (SqlException ex)
|
||||
{
|
||||
@@ -328,6 +343,50 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
// is the connection NAME, never the connection string.
|
||||
throw SqlErrorClassifier.Throw(connectionName, ex);
|
||||
}
|
||||
catch (Exception ex) when (SqlErrorClassifier.IsTransient(ex))
|
||||
{
|
||||
// [1] A live outage that did not surface as a SqlException — treat as
|
||||
// transient so the caller buffers + retries. The message uses the
|
||||
// connection NAME, never the connection string (credential safety).
|
||||
throw new TransientDatabaseException(
|
||||
$"Transient database error on {connectionName}: {ex.Message}",
|
||||
errorNumber: null,
|
||||
ex);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// M2.3 (#7): performs the raw ADO.NET write. Opens a <see cref="SqlConnection"/>
/// on <paramref name="connectionString"/>, creates a command carrying
/// <paramref name="sql"/>, binds one parameter per entry of
/// <paramref name="parameters"/>, and executes it as a non-query.
/// </summary>
/// <remarks>
/// This method contains no exception handling: whatever the driver throws
/// surfaces unchanged, so the ordered catches in <see cref="ExecuteWriteAsync"/>
/// (which awaits this method inside its <c>try</c>) perform the classification.
/// It is <c>internal virtual</c> so a derived type — e.g. a test double — can
/// replace the I/O and throw raw outage-shaped exceptions (such as
/// <see cref="InvalidOperationException"/> or
/// <see cref="System.Net.Sockets.SocketException"/>) through that production
/// classification. It plays the role that <c>client.SendAsync</c> plays inside
/// <see cref="ExternalSystemClient.InvokeHttpAsync"/> on the API path.
/// </remarks>
/// <param name="connectionString">The ADO.NET connection string to write through.</param>
/// <param name="sql">The SQL statement to execute.</param>
/// <param name="parameters">Materialised CLR parameter values (may be empty). A key is used as the parameter name, gaining a leading <c>@</c> when it does not already start with one; a <see langword="null"/> value is sent as <see cref="DBNull.Value"/>.</param>
/// <param name="cancellationToken">Cancellation token passed to both the connection open and the command execution.</param>
/// <returns>A task that completes when the write succeeds.</returns>
internal virtual async Task RunSqlAsync(
    string connectionString,
    string sql,
    IReadOnlyDictionary<string, object?> parameters,
    CancellationToken cancellationToken)
{
    // The connection is opened before the command is built, so an unreachable
    // server fails here, ahead of any parameter binding.
    await using var connection = new SqlConnection(connectionString);
    await connection.OpenAsync(cancellationToken).ConfigureAwait(false);

    using var command = connection.CreateCommand();
    command.CommandText = sql;

    foreach (var entry in parameters)
    {
        var sqlParameter = command.CreateParameter();
        sqlParameter.ParameterName = WithAtPrefix(entry.Key);

        // ADO.NET represents SQL NULL as DBNull.Value, not a CLR null reference.
        sqlParameter.Value = entry.Value ?? DBNull.Value;
        command.Parameters.Add(sqlParameter);
    }

    await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);

    // Returns the name unchanged when it already starts with '@', otherwise prefixes one.
    static string WithAtPrefix(string name) => name.StartsWith('@') ? name : "@" + name;
}
|
||||
|
||||
// ExternalSystemGateway-020: a JSON number that does not fit in Int64 must
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
using System.Data.Common;
|
||||
using System.IO;
|
||||
using System.Net.Sockets;
|
||||
using Microsoft.Data.SqlClient;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
|
||||
@@ -84,6 +87,60 @@ public static class SqlErrorClassifier
|
||||
return IsTransient(exception.Number);
|
||||
}
|
||||
|
||||
/// <summary>
/// Decides whether an arbitrary <see cref="Exception"/> should be treated as a
/// transient (retryable) database failure — the SQL-path counterpart of
/// <see cref="ErrorClassifier.IsTransient(System.Exception)"/> on the HTTP path.
/// </summary>
/// <remarks>
/// <para>
/// A live DB outage does not always surface as a <see cref="SqlException"/>:
/// once the underlying connection / socket is torn down, the driver can raise
/// transport-level exceptions instead. Those are retryable — a later attempt
/// can succeed once the server is reachable again — so they are reported as
/// transient (buffered to store-and-forward) instead of escaping unclassified
/// and crashing the calling Script Execution Actor.
/// </para>
/// <para>The classification, in evaluation order:</para>
/// <list type="bullet">
/// <item><see cref="SqlException"/> — delegated to the <see cref="SqlException"/> overload of this method, so the verdict follows that overload's error-number policy and a <see cref="SqlException"/> is never swept up by the <see cref="DbException"/> rule below.</item>
/// <item><see cref="InvalidOperationException"/> — connection-state error (e.g. "the connection is not open" / pooled connection broken).</item>
/// <item><see cref="IOException"/> — transport read/write failure mid-session.</item>
/// <item><see cref="SocketException"/> — TCP-level failure (connection refused/reset/timed out).</item>
/// <item><see cref="TimeoutException"/> — command / connection timeout surfaced as a CLR <see cref="TimeoutException"/>.</item>
/// <item><see cref="TaskCanceledException"/> — driver-level cancellation/timeout not tied to a caller token (the gateway's ordered catches handle caller-requested cancellation before consulting this method).</item>
/// <item>Any other <see cref="DbException"/> — a provider/driver transport error.</item>
/// </list>
/// <para>
/// <b>Everything else is NOT transient</b> and is expected to propagate, just as
/// the HTTP path lets genuinely-unexpected exceptions escape past its
/// <c>catch (Exception ex) when (ErrorClassifier.IsTransient(ex))</c> filter.
/// Authoring bugs such as <see cref="ArgumentException"/> or
/// <see cref="NullReferenceException"/> are loud, fixable failures; silently
/// buffering and retrying them forever would hide the bug.
/// </para>
/// <para>
/// Matching is by type, so subclasses of the listed types are transient as well.
/// </para>
/// </remarks>
/// <param name="exception">The exception to classify.</param>
/// <returns>
/// For a <see cref="SqlException"/>, the result of the <see cref="SqlException"/>
/// overload; otherwise <see langword="true"/> for a transport / connection /
/// timeout / driver exception from the list above and <see langword="false"/>
/// for anything else.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="exception"/> is <see langword="null"/>.</exception>
public static bool IsTransient(Exception exception)
{
    ArgumentNullException.ThrowIfNull(exception);

    return exception switch
    {
        // Must stay the first arm: SqlException derives from DbException, and
        // it is judged by the SqlException overload rather than by its type.
        SqlException sqlException => IsTransient(sqlException),

        // Outage-shaped failures that did not surface as a SqlException.
        InvalidOperationException
            or IOException
            or SocketException
            or TimeoutException
            or TaskCanceledException
            or DbException => true,

        // Anything else is not treated as a retryable outage.
        _ => false,
    };
}
|
||||
|
||||
/// <summary>
|
||||
/// Classifies a <see cref="SqlException"/> and rethrows it as the matching
|
||||
/// strongly-typed failure: <see cref="TransientDatabaseException"/> for a
|
||||
|
||||
Reference in New Issue
Block a user