From de375ff7ea46f9412317665da4ee794309dda37b Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 14:03:25 -0400 Subject: [PATCH] fix(db): classify non-SqlException DB outages as transient; propagate cancellation (#7) ExecuteWriteAsync only caught SqlException, so a live outage surfacing as InvalidOperationException/SocketException/IOException/TimeoutException escaped unclassified and crashed the script actor instead of buffering. Mirror the HTTP path: propagate OperationCanceledException on cancellation, classify transport exceptions as transient (buffer+retry), let unexpected exceptions propagate. --- .../DatabaseGateway.cs | 101 ++++++++--- .../SqlErrorClassifier.cs | 57 +++++++ .../DatabaseGatewayTests.cs | 158 ++++++++++++++++++ .../SqlErrorClassifierTests.cs | 40 +++++ 4 files changed, 335 insertions(+), 21 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs b/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs index 0096fc57..058cd67c 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/DatabaseGateway.cs @@ -281,15 +281,21 @@ public class DatabaseGateway : IDatabaseGateway /// /// M2.3 (#7): executes a parameterised SQL write against the given connection - /// string and classifies any into - /// / - /// via . This is the single SQL-execution seam - /// shared by the immediate attempt and the + /// string and classifies the outcome into + /// / , + /// mirroring the ordered catches of + /// on the API path: + /// caller-requested cancellation propagates unchanged; a + /// is classified by error number via ; a + /// non- transport/connection outage is classified + /// transient via ; + /// genuinely-unexpected exceptions propagate. This is the single classification + /// seam shared by the immediate attempt and the /// retry path. Marked internal virtual - /// so tests can substitute success / transient / permanent outcomes without a - /// real SQL Server (and without fabricating a , which - /// has no public constructor). Mirrors the role of - /// on the API path. + /// so tests can substitute already-classified outcomes; the raw I/O lives in + /// the inner seam so tests can also drive raw outage + /// exceptions through this classification (without fabricating a + /// , which has no public constructor). /// /// The human-readable connection name, used only for the classified error message (never the connection string — that would leak credentials into logs / script-visible errors). /// The ADO.NET connection string to write through. @@ -297,7 +303,8 @@ public class DatabaseGateway : IDatabaseGateway /// Materialised CLR parameter values (may be empty). /// Cancellation token for the write. /// A task that completes when the write succeeds. - /// Thrown for a transient SQL error number. + /// Rethrown unchanged when the caller's requested cancellation. + /// Thrown for a transient SQL error number or a non-Sql transport/connection outage. /// Thrown for a permanent (or unknown) SQL error number. internal virtual async Task ExecuteWriteAsync( string connectionName, @@ -306,20 +313,28 @@ public class DatabaseGateway : IDatabaseGateway IReadOnlyDictionary parameters, CancellationToken cancellationToken) { + // M2.3 (#7) code-review fix: the catch ordering MIRRORS + // ExternalSystemClient.InvokeHttpAsync exactly so the SQL path classifies + // a live outage the same way the HTTP path does: + // 1. caller-requested cancellation propagates UNCHANGED (never a "DB error"); + // 2. a SqlException is classified by error number (transient/permanent); + // 3. a NON-SqlException transport/connection failure (InvalidOperationException + // "connection not open", IOException, SocketException, TimeoutException, + // a non-Sql DbException, …) is TRANSIENT — buffered + retried, because a + // retry can succeed once the server is reachable. The pre-fix code only + // caught SqlException, so these escaped unclassified and crashed the + // Script Execution Actor instead of buffering; + // 4. genuinely-unexpected exceptions (e.g. an authoring ArgumentException) + // propagate — same as the HTTP path lets unexpected exceptions escape. try { - await using var connection = new SqlConnection(connectionString); - await connection.OpenAsync(cancellationToken).ConfigureAwait(false); - using var command = connection.CreateCommand(); - command.CommandText = sql; - foreach (var (key, value) in parameters) - { - var parameter = command.CreateParameter(); - parameter.ParameterName = key.StartsWith('@') ? key : "@" + key; - parameter.Value = value ?? DBNull.Value; - command.Parameters.Add(parameter); - } - await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false); + await RunSqlAsync(connectionString, sql, parameters, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // [2] The caller asked to abandon the work — propagate the cancellation + // unchanged; it must never be reclassified as a transient DB error. + throw; } catch (SqlException ex) { @@ -328,6 +343,50 @@ public class DatabaseGateway : IDatabaseGateway // is the connection NAME, never the connection string. throw SqlErrorClassifier.Throw(connectionName, ex); } + catch (Exception ex) when (SqlErrorClassifier.IsTransient(ex)) + { + // [1] A live outage that did not surface as a SqlException — treat as + // transient so the caller buffers + retries. The message uses the + // connection NAME, never the connection string (credential safety). + throw new TransientDatabaseException( + $"Transient database error on {connectionName}: {ex.Message}", + errorNumber: null, + ex); + } + } + + /// + /// M2.3 (#7): the raw ADO.NET write — opens the connection, builds the + /// command, and executes it. Marked internal virtual so tests can throw + /// RAW outage-shaped exceptions (e.g. , + /// ) through the PRODUCTION + /// classification in . This is the SQL parallel + /// of client.SendAsync inside : + /// the actual I/O, wrapped by the ordered classification catches in the caller. + /// + /// The ADO.NET connection string to write through. + /// The SQL statement to execute. + /// Materialised CLR parameter values (may be empty). + /// Cancellation token for the write. + /// A task that completes when the write succeeds. + internal virtual async Task RunSqlAsync( + string connectionString, + string sql, + IReadOnlyDictionary parameters, + CancellationToken cancellationToken) + { + await using var connection = new SqlConnection(connectionString); + await connection.OpenAsync(cancellationToken).ConfigureAwait(false); + using var command = connection.CreateCommand(); + command.CommandText = sql; + foreach (var (key, value) in parameters) + { + var parameter = command.CreateParameter(); + parameter.ParameterName = key.StartsWith('@') ? key : "@" + key; + parameter.Value = value ?? DBNull.Value; + command.Parameters.Add(parameter); + } + await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false); } // ExternalSystemGateway-020: a JSON number that does not fit in Int64 must diff --git a/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/SqlErrorClassifier.cs b/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/SqlErrorClassifier.cs index f1e7a89f..bdba8298 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/SqlErrorClassifier.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/SqlErrorClassifier.cs @@ -1,3 +1,6 @@ +using System.Data.Common; +using System.IO; +using System.Net.Sockets; using Microsoft.Data.SqlClient; namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway; @@ -84,6 +87,60 @@ public static class SqlErrorClassifier return IsTransient(exception.Number); } + /// + /// Determines whether an arbitrary represents a + /// transient database failure — the SQL-path parallel of + /// on the HTTP path. + /// + /// + /// + /// A live DB outage does not always surface as a : + /// once the underlying connection / socket is torn down, the driver raises + /// transport-level exceptions instead. These are retryable — a retry + /// can succeed once the server is reachable again — so they are classified + /// transient (buffered to store-and-forward) rather than escaping unclassified + /// to crash the calling Script Execution Actor. The transient set: + /// + /// + /// — connection-state error (e.g. "the connection is not open" / pooled connection broken). + /// — transport read/write failure mid-session. + /// — TCP-level failure (connection refused/reset/timed out). + /// — command / connection timeout surfaced as a CLR . + /// — driver-level cancellation/timeout NOT tied to a caller token (the caller-token case is handled before classification — see the gateway's ordered catches). + /// Any that is NOT a — a provider/driver transport error (a real is classified by error number via the overloads above, never here). + /// + /// + /// Everything else is NOT transient and must propagate, exactly as the + /// HTTP path lets genuinely-unexpected exceptions escape past its + /// catch (Exception ex) when (ErrorClassifier.IsTransient(ex)) filter. + /// Authoring bugs (, , + /// etc.) are loud, fixable failures — silently buffering and retrying them + /// forever would hide the bug. + /// + /// + /// The exception to classify. + /// for a transport/connection/timeout/driver exception; otherwise . + public static bool IsTransient(Exception exception) + { + ArgumentNullException.ThrowIfNull(exception); + + // A real SqlException is classified by its error number (the overloads + // above), never by type — fall back to the number-based policy so an + // unknown SqlException stays permanent (fail-fast) rather than being + // swept up as transient by the DbException catch-all below. + if (exception is SqlException sql) + { + return IsTransient(sql); + } + + return exception is InvalidOperationException + or IOException + or SocketException + or TimeoutException + or TaskCanceledException + or DbException; // any non-SqlException DbException (SqlException handled above) + } + /// /// Classifies a and rethrows it as the matching /// strongly-typed failure: for a diff --git a/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs index 46d18014..c77da622 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/DatabaseGatewayTests.cs @@ -347,6 +347,132 @@ public class DatabaseGatewayTests Assert.False(delivered); // permanent — the S&F engine parks the message } + // ── M2.3 (#7) code-review fix: ExecuteWriteAsync must classify NON-SqlException + // DB outages as transient (buffer+retry) and propagate cancellation — + // mirroring the HTTP path's ordered catches in InvokeHttpAsync. The pre-fix + // code only caught SqlException, so a live outage surfacing as + // InvalidOperationException / SocketException / IOException / TimeoutException + // escaped unclassified and crashed the Script Execution Actor instead of + // buffering. These tests drive the RAW execution seam (RunSqlAsync) so the + // PRODUCTION classification in ExecuteWriteAsync runs end-to-end. ── + + public static IEnumerable TransientNonSqlOutages() + { + // A live DB outage that surfaces as a non-SqlException: connection-state, + // socket, IO, and timeout failures are all retryable transport errors. + yield return new object[] { new InvalidOperationException("The connection is not open.") }; + yield return new object[] { new System.Net.Sockets.SocketException(10061 /* connection refused */) }; + yield return new object[] { new System.IO.IOException("Unable to read data from the transport connection.") }; + yield return new object[] { new TimeoutException("The operation has timed out.") }; + } + + [Theory] + [MemberData(nameof(TransientNonSqlOutages))] + public async Task CachedWrite_NonSqlOutage_ClassifiedTransient_BuffersNotCrash(Exception outage) + { + // [1] A live outage that is NOT a SqlException must be classified TRANSIENT + // (buffered for retry), NOT escape unclassified to crash the script actor, + // and NOT be returned as a permanent Failed result. + var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") + { + Id = 1, + MaxRetries = 5, + RetryDelay = TimeSpan.FromSeconds(12), + }; + StubConnection(conn); + + var (sf, connStr, keepAlive) = NewStoreAndForward(); + using var _ = keepAlive; + + // RawExecuteStubGateway routes the raw throw through the PRODUCTION + // ExecuteWriteAsync classification (the seam under test), unlike + // ExecuteStubGateway which throws an already-classified exception. + var gateway = new RawExecuteStubGateway(_repository, sf, onRunSql: () => throw outage); + + var result = await gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)"); + + Assert.True(result.Success); // accepted for delivery, not a crash + Assert.True(result.WasBuffered); // handed to S&F as transient + Assert.Null(result.ErrorMessage); // not a permanent Failed result + + Assert.Equal(1, ReadBufferDepth(connStr)); + } + + [Fact] + public async Task CachedWrite_CancellationRequested_PropagatesOperationCanceled_NotReclassified() + { + // [2] OperationCanceledException raised while the caller's token is + // cancelled must propagate UNCHANGED — never reclassified as a transient + // DB error and never buffered. Mirrors the HTTP path's first catch: + // `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) throw;` + var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 }; + StubConnection(conn); + + var (sf, connStr, keepAlive) = NewStoreAndForward(); + using var _ = keepAlive; + + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + var gateway = new RawExecuteStubGateway( + _repository, sf, onRunSql: () => throw new OperationCanceledException(cts.Token)); + + await Assert.ThrowsAsync( + () => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)", cancellationToken: cts.Token)); + + // Cancellation is not a transient failure — nothing must have been buffered. + Assert.Equal(0, ReadBufferDepth(connStr)); + } + + [Fact] + public async Task CachedWrite_UnexpectedException_Propagates_NotClassifiedTransient() + { + // An exception type outside the transient transport set (e.g. + // ArgumentException) is NOT a DB outage — it must propagate, exactly as + // the HTTP path lets genuinely-unexpected exceptions escape past + // `catch (Exception ex) when (ErrorClassifier.IsTransient(ex))`. + var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 }; + StubConnection(conn); + + var (sf, connStr, keepAlive) = NewStoreAndForward(); + using var _ = keepAlive; + + var gateway = new RawExecuteStubGateway( + _repository, sf, onRunSql: () => throw new ArgumentException("authoring bug")); + + await Assert.ThrowsAsync( + () => gateway.CachedWriteAsync("testDb", "INSERT INTO t VALUES (1)")); + + Assert.Equal(0, ReadBufferDepth(connStr)); + } + + [Fact] + public async Task DeliverBuffered_NonSqlOutage_RethrowsAsTransient_SoEngineRetries() + { + // [1] on the RETRY path: a non-SqlException outage during delivery must be + // classified transient and propagate (as TransientDatabaseException) so + // the S&F engine schedules another retry — it must NOT crash/park. + var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 }; + StubConnection(conn); + + var gateway = new RawExecuteStubGateway( + _repository, + storeAndForward: null, + onRunSql: () => throw new InvalidOperationException("The connection is not open.")); + + var message = new ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardMessage + { + Id = Guid.NewGuid().ToString("N"), + Category = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.CachedDbWrite, + Target = "testDb", + PayloadJson = + """{"ConnectionName":"testDb","Sql":"INSERT INTO t VALUES (1)","Parameters":null}""", + }; + + await Assert.ThrowsAsync( + () => gateway.DeliverBufferedAsync(message)); + } + /// /// Reads the current buffered-message count off the S&F SQLite DB by /// counting sf_messages rows (the engine's persistence table). @@ -391,6 +517,38 @@ public class DatabaseGatewayTests } } + /// + /// Test gateway that substitutes the INNER SQL-execution seam + /// (RunSqlAsync) so a test can throw RAW exceptions (a real outage + /// shape: , , + /// etc.) and have them flow through the PRODUCTION + /// ExecuteWriteAsync classification (the catch ordering under test) — + /// unlike , which throws an + /// already-classified / + /// and so bypasses the catches. + /// + private sealed class RawExecuteStubGateway : DatabaseGateway + { + private readonly Action _onRunSql; + + public RawExecuteStubGateway( + IExternalSystemRepository repository, + ZB.MOM.WW.ScadaBridge.StoreAndForward.StoreAndForwardService? storeAndForward, + Action onRunSql) + : base(repository, NullLogger.Instance, storeAndForward) + => _onRunSql = onRunSql; + + internal override Task RunSqlAsync( + string connectionString, + string sql, + IReadOnlyDictionary parameters, + CancellationToken cancellationToken) + { + _onRunSql(); + return Task.CompletedTask; + } + } + private static (int MaxRetries, long RetryIntervalMs, Guid? ExecutionId, string? SourceScript) ReadBufferedRetrySettings(string connStr) { diff --git a/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/SqlErrorClassifierTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/SqlErrorClassifierTests.cs index 406013d6..4664661e 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/SqlErrorClassifierTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests/SqlErrorClassifierTests.cs @@ -1,3 +1,5 @@ +using System.Data.Common; + namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway.Tests; /// @@ -62,4 +64,42 @@ public class SqlErrorClassifierTests // be silently retried forever. Unknown => permanent => false. Assert.False(SqlErrorClassifier.IsTransient(errorNumber)); } + + // ── M2.3 (#7) code-review fix: IsTransient(Exception) — a live DB outage does + // not always surface as a SqlException. Transport/connection/timeout/driver + // exception types are transient (buffer+retry), mirroring the HTTP path's + // ErrorClassifier.IsTransient(Exception). ── + + public static IEnumerable TransientExceptionTypes() + { + yield return new object[] { new InvalidOperationException("connection not open") }; + yield return new object[] { new System.IO.IOException("transport reset") }; + yield return new object[] { new System.Net.Sockets.SocketException(10060) }; + yield return new object[] { new TimeoutException("timed out") }; + yield return new object[] { new TaskCanceledException("driver-level cancellation") }; + // Any DbException that is NOT a SqlException is a driver/transport error. + yield return new object[] { new NonSqlDbException("provider transport error") }; + } + + [Theory] + [MemberData(nameof(TransientExceptionTypes))] + public void IsTransient_Exception_TrueForTransportTypes(Exception ex) + { + Assert.True(SqlErrorClassifier.IsTransient(ex)); + } + + [Fact] + public void IsTransient_Exception_FalseForUnexpectedType() + { + // Authoring bugs are NOT a DB outage — they must propagate, exactly as the + // HTTP path lets genuinely-unexpected exceptions escape its IsTransient filter. + Assert.False(SqlErrorClassifier.IsTransient(new ArgumentException("authoring bug"))); + Assert.False(SqlErrorClassifier.IsTransient(new NullReferenceException())); + } + + /// A concrete that is not a SqlException, for the classifier unit test. + private sealed class NonSqlDbException : DbException + { + public NonSqlDbException(string message) : base(message) { } + } }