fix(db): classify non-SqlException DB outages as transient; propagate cancellation (#7)

ExecuteWriteAsync only caught SqlException, so a live outage surfacing as
InvalidOperationException/SocketException/IOException/TimeoutException escaped
unclassified and crashed the script actor instead of buffering. Mirror the HTTP
path: propagate OperationCanceledException on cancellation, classify transport
exceptions as transient (buffer+retry), let unexpected exceptions propagate.
This commit is contained in:
Joseph Doherty
2026-06-15 14:03:25 -04:00
parent d05270640d
commit de375ff7ea
4 changed files with 335 additions and 21 deletions
@@ -281,15 +281,21 @@ public class DatabaseGateway : IDatabaseGateway
/// <summary> /// <summary>
/// M2.3 (#7): executes a parameterised SQL write against the given connection /// M2.3 (#7): executes a parameterised SQL write against the given connection
/// string and classifies any <see cref="SqlException"/> into /// string and classifies the outcome into
/// <see cref="TransientDatabaseException"/> / <see cref="PermanentDatabaseException"/> /// <see cref="TransientDatabaseException"/> / <see cref="PermanentDatabaseException"/>,
/// via <see cref="SqlErrorClassifier"/>. This is the single SQL-execution seam /// mirroring the ordered catches of
/// shared by the immediate <see cref="CachedWriteAsync"/> attempt and the /// <see cref="ExternalSystemClient.InvokeHttpAsync"/> on the API path:
/// caller-requested cancellation propagates unchanged; a <see cref="SqlException"/>
/// is classified by error number via <see cref="SqlErrorClassifier"/>; a
/// non-<see cref="SqlException"/> transport/connection outage is classified
/// transient via <see cref="SqlErrorClassifier.IsTransient(System.Exception)"/>;
/// genuinely-unexpected exceptions propagate. This is the single classification
/// seam shared by the immediate <see cref="CachedWriteAsync"/> attempt and the
/// <see cref="DeliverBufferedAsync"/> retry path. Marked <c>internal virtual</c> /// <see cref="DeliverBufferedAsync"/> retry path. Marked <c>internal virtual</c>
/// so tests can substitute success / transient / permanent outcomes without a /// so tests can substitute already-classified outcomes; the raw I/O lives in
/// real SQL Server (and without fabricating a <see cref="SqlException"/>, which /// the inner <see cref="RunSqlAsync"/> seam so tests can also drive raw outage
/// has no public constructor). Mirrors the role of /// exceptions through this classification (without fabricating a
/// <see cref="ExternalSystemClient.InvokeHttpAsync"/> on the API path. /// <see cref="SqlException"/>, which has no public constructor).
/// </summary> /// </summary>
/// <param name="connectionName">The human-readable connection name, used only for the classified error message (never the connection string — that would leak credentials into logs / script-visible errors).</param> /// <param name="connectionName">The human-readable connection name, used only for the classified error message (never the connection string — that would leak credentials into logs / script-visible errors).</param>
/// <param name="connectionString">The ADO.NET connection string to write through.</param> /// <param name="connectionString">The ADO.NET connection string to write through.</param>
@@ -297,7 +303,8 @@ public class DatabaseGateway : IDatabaseGateway
/// <param name="parameters">Materialised CLR parameter values (may be empty).</param> /// <param name="parameters">Materialised CLR parameter values (may be empty).</param>
/// <param name="cancellationToken">Cancellation token for the write.</param> /// <param name="cancellationToken">Cancellation token for the write.</param>
/// <returns>A task that completes when the write succeeds.</returns> /// <returns>A task that completes when the write succeeds.</returns>
/// <exception cref="TransientDatabaseException">Thrown for a transient SQL error number.</exception> /// <exception cref="OperationCanceledException">Rethrown unchanged when the caller's <paramref name="cancellationToken"/> requested cancellation.</exception>
/// <exception cref="TransientDatabaseException">Thrown for a transient SQL error number or a non-Sql transport/connection outage.</exception>
/// <exception cref="PermanentDatabaseException">Thrown for a permanent (or unknown) SQL error number.</exception> /// <exception cref="PermanentDatabaseException">Thrown for a permanent (or unknown) SQL error number.</exception>
internal virtual async Task ExecuteWriteAsync( internal virtual async Task ExecuteWriteAsync(
string connectionName, string connectionName,
@@ -306,20 +313,28 @@ public class DatabaseGateway : IDatabaseGateway
IReadOnlyDictionary<string, object?> parameters, IReadOnlyDictionary<string, object?> parameters,
CancellationToken cancellationToken) CancellationToken cancellationToken)
{ {
// M2.3 (#7) code-review fix: the catch ordering MIRRORS
// ExternalSystemClient.InvokeHttpAsync exactly so the SQL path classifies
// a live outage the same way the HTTP path does:
// 1. caller-requested cancellation propagates UNCHANGED (never a "DB error");
// 2. a SqlException is classified by error number (transient/permanent);
// 3. a NON-SqlException transport/connection failure (InvalidOperationException
// "connection not open", IOException, SocketException, TimeoutException,
// a non-Sql DbException, …) is TRANSIENT — buffered + retried, because a
// retry can succeed once the server is reachable. The pre-fix code only
// caught SqlException, so these escaped unclassified and crashed the
// Script Execution Actor instead of buffering;
// 4. genuinely-unexpected exceptions (e.g. an authoring ArgumentException)
// propagate — same as the HTTP path lets unexpected exceptions escape.
try try
{ {
await using var connection = new SqlConnection(connectionString); await RunSqlAsync(connectionString, sql, parameters, cancellationToken).ConfigureAwait(false);
await connection.OpenAsync(cancellationToken).ConfigureAwait(false); }
using var command = connection.CreateCommand(); catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
command.CommandText = sql; {
foreach (var (key, value) in parameters) // [2] The caller asked to abandon the work — propagate the cancellation
{ // unchanged; it must never be reclassified as a transient DB error.
var parameter = command.CreateParameter(); throw;
parameter.ParameterName = key.StartsWith('@') ? key : "@" + key;
parameter.Value = value ?? DBNull.Value;
command.Parameters.Add(parameter);
}
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
} }
catch (SqlException ex) catch (SqlException ex)
{ {
@@ -328,6 +343,50 @@ public class DatabaseGateway : IDatabaseGateway
// is the connection NAME, never the connection string. // is the connection NAME, never the connection string.
throw SqlErrorClassifier.Throw(connectionName, ex); throw SqlErrorClassifier.Throw(connectionName, ex);
} }
catch (Exception ex) when (SqlErrorClassifier.IsTransient(ex))
{
// [1] A live outage that did not surface as a SqlException — treat as
// transient so the caller buffers + retries. The message uses the
// connection NAME, never the connection string (credential safety).
throw new TransientDatabaseException(
$"Transient database error on {connectionName}: {ex.Message}",
errorNumber: null,
ex);
}
}
/// <summary>
/// M2.3 (#7): the raw ADO.NET write — opens the connection, builds the
/// command, and executes it. Marked <c>internal virtual</c> so tests can throw
/// RAW outage-shaped exceptions (e.g. <see cref="InvalidOperationException"/>,
/// <see cref="System.Net.Sockets.SocketException"/>) through the PRODUCTION
/// classification in <see cref="ExecuteWriteAsync"/>. This is the SQL parallel
/// of <c>client.SendAsync</c> inside <see cref="ExternalSystemClient.InvokeHttpAsync"/>:
/// the actual I/O, wrapped by the ordered classification catches in the caller.
/// </summary>
/// <param name="connectionString">The ADO.NET connection string to write through.</param>
/// <param name="sql">The SQL statement to execute.</param>
/// <param name="parameters">Materialised CLR parameter values (may be empty).</param>
/// <param name="cancellationToken">Cancellation token for the write.</param>
/// <returns>A task that completes when the write succeeds.</returns>
internal virtual async Task RunSqlAsync(
string connectionString,
string sql,
IReadOnlyDictionary<string, object?> parameters,
CancellationToken cancellationToken)
{
await using var connection = new SqlConnection(connectionString);
await connection.OpenAsync(cancellationToken).ConfigureAwait(false);
using var command = connection.CreateCommand();
command.CommandText = sql;
foreach (var (key, value) in parameters)
{
var parameter = command.CreateParameter();
parameter.ParameterName = key.StartsWith('@') ? key : "@" + key;
parameter.Value = value ?? DBNull.Value;
command.Parameters.Add(parameter);
}
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
} }
// ExternalSystemGateway-020: a JSON number that does not fit in Int64 must // ExternalSystemGateway-020: a JSON number that does not fit in Int64 must
@@ -1,3 +1,6 @@
using System.Data.Common;
using System.IO;
using System.Net.Sockets;
using Microsoft.Data.SqlClient; using Microsoft.Data.SqlClient;
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway; namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
@@ -84,6 +87,60 @@ public static class SqlErrorClassifier
return IsTransient(exception.Number); return IsTransient(exception.Number);
} }
/// <summary>
/// Determines whether an arbitrary <see cref="Exception"/> represents a
/// transient database failure — the SQL-path parallel of
/// <see cref="ErrorClassifier.IsTransient(System.Exception)"/> on the HTTP path.
/// </summary>
/// <remarks>
/// <para>
/// A live DB outage does not always surface as a <see cref="SqlException"/>:
/// once the underlying connection / socket is torn down, the driver raises
/// transport-level exceptions instead. These are <b>retryable</b> — a retry
/// can succeed once the server is reachable again — so they are classified
/// transient (buffered to store-and-forward) rather than escaping unclassified
/// to crash the calling Script Execution Actor. The transient set:
/// </para>
/// <list type="bullet">
/// <item><see cref="InvalidOperationException"/> — connection-state error (e.g. "the connection is not open" / pooled connection broken).</item>
/// <item><see cref="IOException"/> — transport read/write failure mid-session.</item>
/// <item><see cref="SocketException"/> — TCP-level failure (connection refused/reset/timed out).</item>
/// <item><see cref="TimeoutException"/> — command / connection timeout surfaced as a CLR <see cref="TimeoutException"/>.</item>
/// <item><see cref="TaskCanceledException"/> — driver-level cancellation/timeout NOT tied to a caller token (the caller-token case is handled before classification — see the gateway's ordered catches).</item>
/// <item>Any <see cref="DbException"/> that is NOT a <see cref="SqlException"/> — a provider/driver transport error (a real <see cref="SqlException"/> is classified by error number via the overloads above, never here).</item>
/// </list>
/// <para>
/// <b>Everything else is NOT transient</b> and must propagate, exactly as the
/// HTTP path lets genuinely-unexpected exceptions escape past its
/// <c>catch (Exception ex) when (ErrorClassifier.IsTransient(ex))</c> filter.
/// Authoring bugs (<see cref="ArgumentException"/>, <see cref="NullReferenceException"/>,
/// etc.) are loud, fixable failures — silently buffering and retrying them
/// forever would hide the bug.
/// </para>
/// </remarks>
/// <param name="exception">The exception to classify.</param>
/// <returns><see langword="true"/> for a transport/connection/timeout/driver exception; otherwise <see langword="false"/>.</returns>
public static bool IsTransient(Exception exception)
{
ArgumentNullException.ThrowIfNull(exception);
// A real SqlException is classified by its error number (the overloads
// above), never by type — fall back to the number-based policy so an
// unknown SqlException stays permanent (fail-fast) rather than being
// swept up as transient by the DbException catch-all below.
if (exception is SqlException sql)
{
return IsTransient(sql);
}
return exception is InvalidOperationException
or IOException
or SocketException
or TimeoutException
or TaskCanceledException
or DbException; // any non-SqlException DbException (SqlException handled above)
}
/// <summary> /// <summary>
/// Classifies a <see cref="SqlException"/> and rethrows it as the matching /// Classifies a <see cref="SqlException"/> and rethrows it as the matching
/// strongly-typed failure: <see cref="TransientDatabaseException"/> for a /// strongly-typed failure: <see cref="TransientDatabaseException"/> for a
@@ -347,6 +347,132 @@ public class DatabaseGatewayTests
Assert.False(delivered); // permanent — the S&F engine parks the message Assert.False(delivered); // permanent — the S&F engine parks the message
} }
// ── M2.3 (#7) code-review fix: ExecuteWriteAsync must classify NON-SqlException
// DB outages as transient (buffer+retry) and propagate cancellation —
// mirroring the HTTP path's ordered catches in InvokeHttpAsync. The pre-fix
// code only caught SqlException, so a live outage surfacing as
// InvalidOperationException / SocketException / IOException / TimeoutException
// escaped unclassified and crashed the Script Execution Actor instead of
// buffering. These tests drive the RAW execution seam (RunSqlAsync) so the
// PRODUCTION classification in ExecuteWriteAsync runs end-to-end. ──
public static IEnumerable<object[]> TransientNonSqlOutages()
{
// A live DB outage that surfaces as a non-SqlException: connection-state,
// socket, IO, and timeout failures are all retryable transport errors.
yield return new object[] { new InvalidOperationException("The connection is not open.") };
yield return new object[] { new System.Net.Sockets.SocketException(10061 /* connection refused */) };
yield return new object[] { new System.IO.IOException("Unable to read data from the transport connection.") };
yield return new object[] { new TimeoutException("The operation has timed out.") };
}
[Theory]
[MemberData(nameof(TransientNonSqlOutages))]
public async Task CachedWrite_NonSqlOutage_ClassifiedTransient_BuffersNotCrash(Exception outage)
{
// [1] A live outage that is NOT a SqlException must be classified TRANSIENT
// (buffered for retry), NOT escape unclassified to crash the script actor,
// and NOT be returned as a permanent Failed result.
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test")
{
Id = 1,
MaxRetries = 5,
RetryDelay = TimeSpan.FromSeconds(12),
};
StubConnection(conn);
var (sf, connStr, keepAlive) = NewStoreAndForward();
using var _ = keepAlive;
// RawExecuteStubGateway routes the raw throw through the PRODUCTION
// ExecuteWriteAsync classification (the seam under test), unlike
// ExecuteStubGateway which throws an already-classified exception.
var gateway = new RawExecuteStubGateway(_repository, sf, onRunSql: () => throw outage);
var result = await gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)");
Assert.True(result.Success); // accepted for delivery, not a crash
Assert.True(result.WasBuffered); // handed to S&F as transient
Assert.Null(result.ErrorMessage); // not a permanent Failed result
Assert.Equal(1, ReadBufferDepth(connStr));
}
[Fact]
public async Task CachedWrite_CancellationRequested_PropagatesOperationCanceled_NotReclassified()
{
// [2] OperationCanceledException raised while the caller's token is
// cancelled must propagate UNCHANGED — never reclassified as a transient
// DB error and never buffered. Mirrors the HTTP path's first catch:
// `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) throw;`
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
StubConnection(conn);
var (sf, connStr, keepAlive) = NewStoreAndForward();
using var _ = keepAlive;
using var cts = new CancellationTokenSource();
cts.Cancel();
var gateway = new RawExecuteStubGateway(
_repository, sf, onRunSql: () => throw new OperationCanceledException(cts.Token));
await Assert.ThrowsAsync<OperationCanceledException>(
() => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)", cancellationToken: cts.Token));
// Cancellation is not a transient failure — nothing must have been buffered.
Assert.Equal(0, ReadBufferDepth(connStr));
}
[Fact]
public async Task CachedWrite_UnexpectedException_Propagates_NotClassifiedTransient()
{
// An exception type outside the transient transport set (e.g.
// ArgumentException) is NOT a DB outage — it must propagate, exactly as
// the HTTP path lets genuinely-unexpected exceptions escape past
// `catch (Exception ex) when (ErrorClassifier.IsTransient(ex))`.
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
StubConnection(conn);
var (sf, connStr, keepAlive) = NewStoreAndForward();
using var _ = keepAlive;
var gateway = new RawExecuteStubGateway(
_repository, sf, onRunSql: () => throw new ArgumentException("authoring bug"));
await Assert.ThrowsAsync<ArgumentException>(
() => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)"));
Assert.Equal(0, ReadBufferDepth(connStr));
}
[Fact]
public async Task DeliverBuffered_NonSqlOutage_RethrowsAsTransient_SoEngineRetries()
{
// [1] on the RETRY path: a non-SqlException outage during delivery must be
// classified transient and propagate (as TransientDatabaseException) so
// the S&F engine schedules another retry — it must NOT crash/park.
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
StubConnection(conn);
var gateway = new RawExecuteStubGateway(
_repository,
storeAndForward: null,
onRunSql: () => throw new InvalidOperationException("The connection is not open."));
var message = new ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardMessage
{
Id = Guid.NewGuid().ToString("N"),
Category = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.CachedDbWrite,
Target = "testDb",
PayloadJson =
"""{"ConnectionName":"testDb","Sql":"INSERT INTO t VALUES (1)","Parameters":null}""",
};
await Assert.ThrowsAsync<TransientDatabaseException>(
() => gateway.DeliverBufferedAsync(message));
}
/// <summary> /// <summary>
/// Reads the current buffered-message count off the S&amp;F SQLite DB by /// Reads the current buffered-message count off the S&amp;F SQLite DB by
/// counting <c>sf_messages</c> rows (the engine's persistence table). /// counting <c>sf_messages</c> rows (the engine's persistence table).
@@ -391,6 +517,38 @@ public class DatabaseGatewayTests
} }
} }
/// <summary>
/// Test gateway that substitutes the INNER SQL-execution seam
/// (<c>RunSqlAsync</c>) so a test can throw RAW exceptions (a real outage
/// shape: <see cref="InvalidOperationException"/>, <see cref="System.Net.Sockets.SocketException"/>,
/// etc.) and have them flow through the PRODUCTION
/// <c>ExecuteWriteAsync</c> classification (the catch ordering under test) —
/// unlike <see cref="ExecuteStubGateway"/>, which throws an
/// already-classified <see cref="TransientDatabaseException"/> /
/// <see cref="PermanentDatabaseException"/> and so bypasses the catches.
/// </summary>
private sealed class RawExecuteStubGateway : DatabaseGateway
{
private readonly Action _onRunSql;
public RawExecuteStubGateway(
IExternalSystemRepository repository,
ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardService? storeAndForward,
Action onRunSql)
: base(repository, NullLogger<DatabaseGateway>.Instance, storeAndForward)
=> _onRunSql = onRunSql;
internal override Task RunSqlAsync(
string connectionString,
string sql,
IReadOnlyDictionary<string, object?> parameters,
CancellationToken cancellationToken)
{
_onRunSql();
return Task.CompletedTask;
}
}
private static (int MaxRetries, long RetryIntervalMs, Guid? ExecutionId, string? SourceScript) private static (int MaxRetries, long RetryIntervalMs, Guid? ExecutionId, string? SourceScript)
ReadBufferedRetrySettings(string connStr) ReadBufferedRetrySettings(string connStr)
{ {
@@ -1,3 +1,5 @@
using System.Data.Common;
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests; namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests;
/// <summary> /// <summary>
@@ -62,4 +64,42 @@ public class SqlErrorClassifierTests
// be silently retried forever. Unknown => permanent => false. // be silently retried forever. Unknown => permanent => false.
Assert.False(SqlErrorClassifier.IsTransient(errorNumber)); Assert.False(SqlErrorClassifier.IsTransient(errorNumber));
} }
// ── M2.3 (#7) code-review fix: IsTransient(Exception) — a live DB outage does
// not always surface as a SqlException. Transport/connection/timeout/driver
// exception types are transient (buffer+retry), mirroring the HTTP path's
// ErrorClassifier.IsTransient(Exception). ──
public static IEnumerable<object[]> TransientExceptionTypes()
{
yield return new object[] { new InvalidOperationException("connection not open") };
yield return new object[] { new System.IO.IOException("transport reset") };
yield return new object[] { new System.Net.Sockets.SocketException(10060) };
yield return new object[] { new TimeoutException("timed out") };
yield return new object[] { new TaskCanceledException("driver-level cancellation") };
// Any DbException that is NOT a SqlException is a driver/transport error.
yield return new object[] { new NonSqlDbException("provider transport error") };
}
[Theory]
[MemberData(nameof(TransientExceptionTypes))]
public void IsTransient_Exception_TrueForTransportTypes(Exception ex)
{
Assert.True(SqlErrorClassifier.IsTransient(ex));
}
[Fact]
public void IsTransient_Exception_FalseForUnexpectedType()
{
// Authoring bugs are NOT a DB outage — they must propagate, exactly as the
// HTTP path lets genuinely-unexpected exceptions escape its IsTransient filter.
Assert.False(SqlErrorClassifier.IsTransient(new ArgumentException("authoring bug")));
Assert.False(SqlErrorClassifier.IsTransient(new NullReferenceException()));
}
/// <summary>A concrete <see cref="DbException"/> that is not a SqlException, for the classifier unit test.</summary>
private sealed class NonSqlDbException : DbException
{
public NonSqlDbException(string message) : base(message) { }
}
} }