fix(db): classify non-SqlException DB outages as transient; propagate cancellation (#7)
ExecuteWriteAsync only caught SqlException, so a live outage surfacing as InvalidOperationException/SocketException/IOException/TimeoutException escaped unclassified and crashed the script actor instead of buffering. Mirror the HTTP path: propagate OperationCanceledException on cancellation, classify transport exceptions as transient (buffer+retry), let unexpected exceptions propagate.
This commit is contained in:
@@ -347,6 +347,132 @@ public class DatabaseGatewayTests
|
||||
Assert.False(delivered); // permanent — the S&F engine parks the message
|
||||
}
|
||||
|
||||
// ── M2.3 (#7) code-review fix: ExecuteWriteAsync must classify NON-SqlException
|
||||
// DB outages as transient (buffer+retry) and propagate cancellation —
|
||||
// mirroring the HTTP path's ordered catches in InvokeHttpAsync. The pre-fix
|
||||
// code only caught SqlException, so a live outage surfacing as
|
||||
// InvalidOperationException / SocketException / IOException / TimeoutException
|
||||
// escaped unclassified and crashed the Script Execution Actor instead of
|
||||
// buffering. These tests drive the RAW execution seam (RunSqlAsync) so the
|
||||
// PRODUCTION classification in ExecuteWriteAsync runs end-to-end. ──
|
||||
|
||||
public static IEnumerable<object[]> TransientNonSqlOutages()
|
||||
{
|
||||
// A live DB outage that surfaces as a non-SqlException: connection-state,
|
||||
// socket, IO, and timeout failures are all retryable transport errors.
|
||||
yield return new object[] { new InvalidOperationException("The connection is not open.") };
|
||||
yield return new object[] { new System.Net.Sockets.SocketException(10061 /* connection refused */) };
|
||||
yield return new object[] { new System.IO.IOException("Unable to read data from the transport connection.") };
|
||||
yield return new object[] { new TimeoutException("The operation has timed out.") };
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[MemberData(nameof(TransientNonSqlOutages))]
|
||||
public async Task CachedWrite_NonSqlOutage_ClassifiedTransient_BuffersNotCrash(Exception outage)
|
||||
{
|
||||
// [1] A live outage that is NOT a SqlException must be classified TRANSIENT
|
||||
// (buffered for retry), NOT escape unclassified to crash the script actor,
|
||||
// and NOT be returned as a permanent Failed result.
|
||||
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test")
|
||||
{
|
||||
Id = 1,
|
||||
MaxRetries = 5,
|
||||
RetryDelay = TimeSpan.FromSeconds(12),
|
||||
};
|
||||
StubConnection(conn);
|
||||
|
||||
var (sf, connStr, keepAlive) = NewStoreAndForward();
|
||||
using var _ = keepAlive;
|
||||
|
||||
// RawExecuteStubGateway routes the raw throw through the PRODUCTION
|
||||
// ExecuteWriteAsync classification (the seam under test), unlike
|
||||
// ExecuteStubGateway which throws an already-classified exception.
|
||||
var gateway = new RawExecuteStubGateway(_repository, sf, onRunSql: () => throw outage);
|
||||
|
||||
var result = await gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)");
|
||||
|
||||
Assert.True(result.Success); // accepted for delivery, not a crash
|
||||
Assert.True(result.WasBuffered); // handed to S&F as transient
|
||||
Assert.Null(result.ErrorMessage); // not a permanent Failed result
|
||||
|
||||
Assert.Equal(1, ReadBufferDepth(connStr));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CachedWrite_CancellationRequested_PropagatesOperationCanceled_NotReclassified()
|
||||
{
|
||||
// [2] OperationCanceledException raised while the caller's token is
|
||||
// cancelled must propagate UNCHANGED — never reclassified as a transient
|
||||
// DB error and never buffered. Mirrors the HTTP path's first catch:
|
||||
// `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) throw;`
|
||||
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
|
||||
StubConnection(conn);
|
||||
|
||||
var (sf, connStr, keepAlive) = NewStoreAndForward();
|
||||
using var _ = keepAlive;
|
||||
|
||||
using var cts = new CancellationTokenSource();
|
||||
cts.Cancel();
|
||||
|
||||
var gateway = new RawExecuteStubGateway(
|
||||
_repository, sf, onRunSql: () => throw new OperationCanceledException(cts.Token));
|
||||
|
||||
await Assert.ThrowsAsync<OperationCanceledException>(
|
||||
() => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)", cancellationToken: cts.Token));
|
||||
|
||||
// Cancellation is not a transient failure — nothing must have been buffered.
|
||||
Assert.Equal(0, ReadBufferDepth(connStr));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CachedWrite_UnexpectedException_Propagates_NotClassifiedTransient()
|
||||
{
|
||||
// An exception type outside the transient transport set (e.g.
|
||||
// ArgumentException) is NOT a DB outage — it must propagate, exactly as
|
||||
// the HTTP path lets genuinely-unexpected exceptions escape past
|
||||
// `catch (Exception ex) when (ErrorClassifier.IsTransient(ex))`.
|
||||
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
|
||||
StubConnection(conn);
|
||||
|
||||
var (sf, connStr, keepAlive) = NewStoreAndForward();
|
||||
using var _ = keepAlive;
|
||||
|
||||
var gateway = new RawExecuteStubGateway(
|
||||
_repository, sf, onRunSql: () => throw new ArgumentException("authoring bug"));
|
||||
|
||||
await Assert.ThrowsAsync<ArgumentException>(
|
||||
() => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)"));
|
||||
|
||||
Assert.Equal(0, ReadBufferDepth(connStr));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeliverBuffered_NonSqlOutage_RethrowsAsTransient_SoEngineRetries()
|
||||
{
|
||||
// [1] on the RETRY path: a non-SqlException outage during delivery must be
|
||||
// classified transient and propagate (as TransientDatabaseException) so
|
||||
// the S&F engine schedules another retry — it must NOT crash/park.
|
||||
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
|
||||
StubConnection(conn);
|
||||
|
||||
var gateway = new RawExecuteStubGateway(
|
||||
_repository,
|
||||
storeAndForward: null,
|
||||
onRunSql: () => throw new InvalidOperationException("The connection is not open."));
|
||||
|
||||
var message = new ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardMessage
|
||||
{
|
||||
Id = Guid.NewGuid().ToString("N"),
|
||||
Category = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.CachedDbWrite,
|
||||
Target = "testDb",
|
||||
PayloadJson =
|
||||
"""{"ConnectionName":"testDb","Sql":"INSERT INTO t VALUES (1)","Parameters":null}""",
|
||||
};
|
||||
|
||||
await Assert.ThrowsAsync<TransientDatabaseException>(
|
||||
() => gateway.DeliverBufferedAsync(message));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reads the current buffered-message count off the S&F SQLite DB by
|
||||
/// counting <c>sf_messages</c> rows (the engine's persistence table).
|
||||
@@ -391,6 +517,38 @@ public class DatabaseGatewayTests
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test gateway that substitutes the INNER SQL-execution seam
|
||||
/// (<c>RunSqlAsync</c>) so a test can throw RAW exceptions (a real outage
|
||||
/// shape: <see cref="InvalidOperationException"/>, <see cref="System.Net.Sockets.SocketException"/>,
|
||||
/// etc.) and have them flow through the PRODUCTION
|
||||
/// <c>ExecuteWriteAsync</c> classification (the catch ordering under test) —
|
||||
/// unlike <see cref="ExecuteStubGateway"/>, which throws an
|
||||
/// already-classified <see cref="TransientDatabaseException"/> /
|
||||
/// <see cref="PermanentDatabaseException"/> and so bypasses the catches.
|
||||
/// </summary>
|
||||
private sealed class RawExecuteStubGateway : DatabaseGateway
|
||||
{
|
||||
private readonly Action _onRunSql;
|
||||
|
||||
public RawExecuteStubGateway(
|
||||
IExternalSystemRepository repository,
|
||||
ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardService? storeAndForward,
|
||||
Action onRunSql)
|
||||
: base(repository, NullLogger<DatabaseGateway>.Instance, storeAndForward)
|
||||
=> _onRunSql = onRunSql;
|
||||
|
||||
internal override Task RunSqlAsync(
|
||||
string connectionString,
|
||||
string sql,
|
||||
IReadOnlyDictionary<string, object?> parameters,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
_onRunSql();
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
private static (int MaxRetries, long RetryIntervalMs, Guid? ExecutionId, string? SourceScript)
|
||||
ReadBufferedRetrySettings(string connStr)
|
||||
{
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
using System.Data.Common;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests;
|
||||
|
||||
/// <summary>
|
||||
@@ -62,4 +64,42 @@ public class SqlErrorClassifierTests
|
||||
// be silently retried forever. Unknown => permanent => false.
|
||||
Assert.False(SqlErrorClassifier.IsTransient(errorNumber));
|
||||
}
|
||||
|
||||
// ── M2.3 (#7) code-review fix: IsTransient(Exception) — a live DB outage does
|
||||
// not always surface as a SqlException. Transport/connection/timeout/driver
|
||||
// exception types are transient (buffer+retry), mirroring the HTTP path's
|
||||
// ErrorClassifier.IsTransient(Exception). ──
|
||||
|
||||
public static IEnumerable<object[]> TransientExceptionTypes()
|
||||
{
|
||||
yield return new object[] { new InvalidOperationException("connection not open") };
|
||||
yield return new object[] { new System.IO.IOException("transport reset") };
|
||||
yield return new object[] { new System.Net.Sockets.SocketException(10060) };
|
||||
yield return new object[] { new TimeoutException("timed out") };
|
||||
yield return new object[] { new TaskCanceledException("driver-level cancellation") };
|
||||
// Any DbException that is NOT a SqlException is a driver/transport error.
|
||||
yield return new object[] { new NonSqlDbException("provider transport error") };
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[MemberData(nameof(TransientExceptionTypes))]
|
||||
public void IsTransient_Exception_TrueForTransportTypes(Exception ex)
|
||||
{
|
||||
Assert.True(SqlErrorClassifier.IsTransient(ex));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void IsTransient_Exception_FalseForUnexpectedType()
|
||||
{
|
||||
// Authoring bugs are NOT a DB outage — they must propagate, exactly as the
|
||||
// HTTP path lets genuinely-unexpected exceptions escape its IsTransient filter.
|
||||
Assert.False(SqlErrorClassifier.IsTransient(new ArgumentException("authoring bug")));
|
||||
Assert.False(SqlErrorClassifier.IsTransient(new NullReferenceException()));
|
||||
}
|
||||
|
||||
/// <summary>A concrete <see cref="DbException"/> that is not a SqlException, for the classifier unit test.</summary>
|
||||
private sealed class NonSqlDbException : DbException
|
||||
{
|
||||
public NonSqlDbException(string message) : base(message) { }
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user