fix(db): classify non-SqlException DB outages as transient; propagate cancellation (#7)

ExecuteWriteAsync only caught SqlException, so a live outage surfacing as
InvalidOperationException/SocketException/IOException/TimeoutException escaped
unclassified and crashed the script actor instead of buffering. Mirror the HTTP
path: propagate OperationCanceledException on cancellation, classify transport
exceptions as transient (buffer+retry), let unexpected exceptions propagate.
This commit is contained in:
Joseph Doherty
2026-06-15 14:03:25 -04:00
parent d05270640d
commit de375ff7ea
4 changed files with 335 additions and 21 deletions
@@ -347,6 +347,132 @@ public class DatabaseGatewayTests
Assert.False(delivered); // permanent — the S&F engine parks the message
}
// ── M2.3 (#7) code-review fix: ExecuteWriteAsync must classify NON-SqlException
// DB outages as transient (buffer+retry) and propagate cancellation —
// mirroring the HTTP path's ordered catches in InvokeHttpAsync. The pre-fix
// code only caught SqlException, so a live outage surfacing as
// InvalidOperationException / SocketException / IOException / TimeoutException
// escaped unclassified and crashed the Script Execution Actor instead of
// buffering. These tests drive the RAW execution seam (RunSqlAsync) so the
// PRODUCTION classification in ExecuteWriteAsync runs end-to-end. ──
public static IEnumerable<object[]> TransientNonSqlOutages()
{
// A live DB outage that surfaces as a non-SqlException: connection-state,
// socket, IO, and timeout failures are all retryable transport errors.
yield return new object[] { new InvalidOperationException("The connection is not open.") };
yield return new object[] { new System.Net.Sockets.SocketException(10061 /* connection refused */) };
yield return new object[] { new System.IO.IOException("Unable to read data from the transport connection.") };
yield return new object[] { new TimeoutException("The operation has timed out.") };
}
[Theory]
[MemberData(nameof(TransientNonSqlOutages))]
public async Task CachedWrite_NonSqlOutage_ClassifiedTransient_BuffersNotCrash(Exception outage)
{
// [1] A live outage that is NOT a SqlException must be classified TRANSIENT
// (buffered for retry), NOT escape unclassified to crash the script actor,
// and NOT be returned as a permanent Failed result.
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test")
{
Id = 1,
MaxRetries = 5,
RetryDelay = TimeSpan.FromSeconds(12),
};
StubConnection(conn);
var (sf, connStr, keepAlive) = NewStoreAndForward();
using var _ = keepAlive;
// RawExecuteStubGateway routes the raw throw through the PRODUCTION
// ExecuteWriteAsync classification (the seam under test), unlike
// ExecuteStubGateway which throws an already-classified exception.
var gateway = new RawExecuteStubGateway(_repository, sf, onRunSql: () => throw outage);
var result = await gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)");
Assert.True(result.Success); // accepted for delivery, not a crash
Assert.True(result.WasBuffered); // handed to S&F as transient
Assert.Null(result.ErrorMessage); // not a permanent Failed result
Assert.Equal(1, ReadBufferDepth(connStr));
}
[Fact]
public async Task CachedWrite_CancellationRequested_PropagatesOperationCanceled_NotReclassified()
{
// [2] OperationCanceledException raised while the caller's token is
// cancelled must propagate UNCHANGED — never reclassified as a transient
// DB error and never buffered. Mirrors the HTTP path's first catch:
// `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) throw;`
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
StubConnection(conn);
var (sf, connStr, keepAlive) = NewStoreAndForward();
using var _ = keepAlive;
using var cts = new CancellationTokenSource();
cts.Cancel();
var gateway = new RawExecuteStubGateway(
_repository, sf, onRunSql: () => throw new OperationCanceledException(cts.Token));
await Assert.ThrowsAsync<OperationCanceledException>(
() => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)", cancellationToken: cts.Token));
// Cancellation is not a transient failure — nothing must have been buffered.
Assert.Equal(0, ReadBufferDepth(connStr));
}
[Fact]
public async Task CachedWrite_UnexpectedException_Propagates_NotClassifiedTransient()
{
// An exception type outside the transient transport set (e.g.
// ArgumentException) is NOT a DB outage — it must propagate, exactly as
// the HTTP path lets genuinely-unexpected exceptions escape past
// `catch (Exception ex) when (ErrorClassifier.IsTransient(ex))`.
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
StubConnection(conn);
var (sf, connStr, keepAlive) = NewStoreAndForward();
using var _ = keepAlive;
var gateway = new RawExecuteStubGateway(
_repository, sf, onRunSql: () => throw new ArgumentException("authoring bug"));
await Assert.ThrowsAsync<ArgumentException>(
() => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)"));
Assert.Equal(0, ReadBufferDepth(connStr));
}
[Fact]
public async Task DeliverBuffered_NonSqlOutage_RethrowsAsTransient_SoEngineRetries()
{
// [1] on the RETRY path: a non-SqlException outage during delivery must be
// classified transient and propagate (as TransientDatabaseException) so
// the S&F engine schedules another retry — it must NOT crash/park.
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
StubConnection(conn);
var gateway = new RawExecuteStubGateway(
_repository,
storeAndForward: null,
onRunSql: () => throw new InvalidOperationException("The connection is not open."));
var message = new ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardMessage
{
Id = Guid.NewGuid().ToString("N"),
Category = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.CachedDbWrite,
Target = "testDb",
PayloadJson =
"""{"ConnectionName":"testDb","Sql":"INSERT INTO t VALUES (1)","Parameters":null}""",
};
await Assert.ThrowsAsync<TransientDatabaseException>(
() => gateway.DeliverBufferedAsync(message));
}
/// <summary>
/// Reads the current buffered-message count off the S&amp;F SQLite DB by
/// counting <c>sf_messages</c> rows (the engine's persistence table).
@@ -391,6 +517,38 @@ public class DatabaseGatewayTests
}
}
/// <summary>
/// Test gateway that substitutes the INNER SQL-execution seam
/// (<c>RunSqlAsync</c>) so a test can throw RAW exceptions (a real outage
/// shape: <see cref="InvalidOperationException"/>, <see cref="System.Net.Sockets.SocketException"/>,
/// etc.) and have them flow through the PRODUCTION
/// <c>ExecuteWriteAsync</c> classification (the catch ordering under test) —
/// unlike <see cref="ExecuteStubGateway"/>, which throws an
/// already-classified <see cref="TransientDatabaseException"/> /
/// <see cref="PermanentDatabaseException"/> and so bypasses the catches.
/// </summary>
private sealed class RawExecuteStubGateway : DatabaseGateway
{
private readonly Action _onRunSql;
public RawExecuteStubGateway(
IExternalSystemRepository repository,
ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardService? storeAndForward,
Action onRunSql)
: base(repository, NullLogger<DatabaseGateway>.Instance, storeAndForward)
=> _onRunSql = onRunSql;
internal override Task RunSqlAsync(
string connectionString,
string sql,
IReadOnlyDictionary<string, object?> parameters,
CancellationToken cancellationToken)
{
_onRunSql();
return Task.CompletedTask;
}
}
private static (int MaxRetries, long RetryIntervalMs, Guid? ExecutionId, string? SourceScript)
ReadBufferedRetrySettings(string connStr)
{
@@ -1,3 +1,5 @@
using System.Data.Common;
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests;
/// <summary>
@@ -62,4 +64,42 @@ public class SqlErrorClassifierTests
// be silently retried forever. Unknown => permanent => false.
Assert.False(SqlErrorClassifier.IsTransient(errorNumber));
}
// ── M2.3 (#7) code-review fix: IsTransient(Exception) — a live DB outage does
// not always surface as a SqlException. Transport/connection/timeout/driver
// exception types are transient (buffer+retry), mirroring the HTTP path's
// ErrorClassifier.IsTransient(Exception). ──
public static IEnumerable<object[]> TransientExceptionTypes()
{
yield return new object[] { new InvalidOperationException("connection not open") };
yield return new object[] { new System.IO.IOException("transport reset") };
yield return new object[] { new System.Net.Sockets.SocketException(10060) };
yield return new object[] { new TimeoutException("timed out") };
yield return new object[] { new TaskCanceledException("driver-level cancellation") };
// Any DbException that is NOT a SqlException is a driver/transport error.
yield return new object[] { new NonSqlDbException("provider transport error") };
}
[Theory]
[MemberData(nameof(TransientExceptionTypes))]
public void IsTransient_Exception_TrueForTransportTypes(Exception ex)
{
Assert.True(SqlErrorClassifier.IsTransient(ex));
}
[Fact]
public void IsTransient_Exception_FalseForUnexpectedType()
{
// Authoring bugs are NOT a DB outage — they must propagate, exactly as the
// HTTP path lets genuinely-unexpected exceptions escape its IsTransient filter.
Assert.False(SqlErrorClassifier.IsTransient(new ArgumentException("authoring bug")));
Assert.False(SqlErrorClassifier.IsTransient(new NullReferenceException()));
}
/// <summary>A concrete <see cref="DbException"/> that is not a SqlException, for the classifier unit test.</summary>
private sealed class NonSqlDbException : DbException
{
public NonSqlDbException(string message) : base(message) { }
}
}