fix(db): classify transient vs permanent SQL errors in Database.CachedWrite (#7)

CachedWrite buffered ALL write failures and retried forever, never returning a
synchronous failure to the script — permanent SQL errors (constraint/syntax/
permission) were treated as transient. Mirror the External-System API path:
attempt immediately, return Failed synchronously on permanent SQL errors (no
buffering), buffer only transient errors; the S&F retry path parks permanent
failures instead of retrying forever. New SqlErrorClassifier + PermanentDatabaseException.
This commit is contained in:
Joseph Doherty
2026-06-15 13:53:15 -04:00
parent 198770f578
commit d05270640d
7 changed files with 907 additions and 29 deletions
@@ -0,0 +1,160 @@
using Microsoft.Data.SqlClient;
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
/// <summary>
/// M2.3 (#7): classifies a SQL Server failure as transient (a brief wait /
/// retry may succeed — buffer to store-and-forward) or permanent (the identical
/// statement cannot succeed — return to the script / park the buffered message).
/// </summary>
/// <remarks>
/// <para>
/// This is the database-side parallel of <see cref="ErrorClassifier"/> (the
/// HTTP path). The two are kept separate because the inputs differ: HTTP keys
/// off status codes / exception types, SQL keys off
/// <see cref="SqlException.Number"/>.
/// </para>
/// <para>
/// <b>Transient set.</b> Only connection-loss, timeout, deadlock, and Azure SQL
/// throttle/availability error numbers are transient — failures whose cause is
/// external to the statement and may clear on its own:
/// <list type="bullet">
/// <item><c>-2</c> — query / command timeout expired.</item>
/// <item><c>-1</c> — a connection-level error (general SqlClient connection failure).</item>
/// <item><c>2</c> — SQL Server / network instance not found or not accessible.</item>
/// <item><c>53</c> — network path to the server was not found.</item>
/// <item><c>64</c> — connection terminated mid-session (transport error).</item>
/// <item><c>233</c> — no process on the other end of the named pipe.</item>
/// <item><c>1205</c> — the session was chosen as a deadlock victim.</item>
/// <item><c>10053</c> — transport-level abort (software caused connection abort).</item>
/// <item><c>10054</c> — connection reset by peer.</item>
/// <item><c>10060</c> — connection attempt timed out.</item>
/// <item><c>40197</c> — Azure SQL service error processing the request; retry.</item>
/// <item><c>40501</c> — Azure SQL service is busy.</item>
/// <item><c>40613</c> — Azure SQL database is currently unavailable.</item>
/// <item><c>49918</c> / <c>49919</c> / <c>49920</c> — Azure SQL throttling (too many requests / operations).</item>
/// </list>
/// </para>
/// <para>
/// <b>Everything else is permanent.</b> Constraint violations (547, 2627, 2601),
/// syntax errors (102, 156, 207, 208), and permission errors (229, 230, 262) are
/// the obvious permanent cases, but the policy is broader: <b>any error number not
/// in the transient set — including unknown / undocumented / ambiguous numbers —
/// is treated as permanent.</b> Fail-fast is the safer default: silently
/// retrying an unrecognised error forever (the pre-M2.3 behaviour) hides
/// authoring bugs and can replay duplicate side effects. A genuinely transient
/// number we have not enumerated will, at worst, surface to the script as a
/// permanent failure — a loud, fixable outcome — rather than spin in an
/// unbounded retry loop.
/// </para>
/// </remarks>
public static class SqlErrorClassifier
{
/// <summary>
/// The complete set of SQL Server error numbers treated as transient. See the
/// type-level remarks for the per-number rationale. Anything outside this set
/// is permanent.
/// </summary>
private static readonly HashSet<int> TransientErrorNumbers = new()
{
-2, -1, 2, 53, 64, 233, 1205,
10053, 10054, 10060,
40197, 40501, 40613,
49918, 49919, 49920,
};
/// <summary>
/// Determines whether a SQL Server error number represents a transient
/// failure. Unknown / undocumented numbers default to permanent
/// (<see langword="false"/>) — see the type-level remarks.
/// </summary>
/// <param name="errorNumber">The SQL Server error number (e.g. <see cref="SqlException.Number"/>).</param>
/// <returns><see langword="true"/> if the number is in the transient set; otherwise <see langword="false"/>.</returns>
public static bool IsTransient(int errorNumber) => TransientErrorNumbers.Contains(errorNumber);
/// <summary>
/// Determines whether a <see cref="SqlException"/> represents a transient
/// failure by classifying its top-level <see cref="SqlException.Number"/>.
/// </summary>
/// <param name="exception">The SQL exception to classify.</param>
/// <returns><see langword="true"/> if the exception's error number is transient; otherwise <see langword="false"/>.</returns>
public static bool IsTransient(SqlException exception)
{
ArgumentNullException.ThrowIfNull(exception);
return IsTransient(exception.Number);
}
/// <summary>
/// Classifies a <see cref="SqlException"/> and rethrows it as the matching
/// strongly-typed failure: <see cref="TransientDatabaseException"/> for a
/// transient error number, <see cref="PermanentDatabaseException"/> otherwise.
/// Mirrors <see cref="ErrorClassifier.AsTransient(string, System.Exception?)"/>
/// + the throw of <see cref="PermanentExternalSystemException"/> on the HTTP
/// path — the callers then branch on the typed exception rather than on the
/// raw <see cref="SqlException"/>.
/// </summary>
/// <param name="context">A short human-readable description of the failing operation (e.g. the connection name).</param>
/// <param name="exception">The SQL exception to classify and wrap.</param>
/// <returns>This method never returns normally — it always throws.</returns>
/// <exception cref="TransientDatabaseException">Thrown when the error number is transient.</exception>
/// <exception cref="PermanentDatabaseException">Thrown when the error number is permanent (the default).</exception>
public static Exception Throw(string context, SqlException exception)
{
ArgumentNullException.ThrowIfNull(exception);
if (IsTransient(exception))
{
throw new TransientDatabaseException(
$"Transient SQL error {exception.Number} on {context}: {exception.Message}",
exception.Number,
exception);
}
throw new PermanentDatabaseException(
$"Permanent SQL error {exception.Number} on {context}: {exception.Message}",
exception.Number,
exception);
}
}
/// <summary>
/// Signals a transient database failure suitable for store-and-forward retry —
/// the SQL-path parallel of <see cref="TransientExternalSystemException"/>.
/// </summary>
public class TransientDatabaseException : Exception
{
/// <summary>Gets the SQL Server error number that caused the failure, if known.</summary>
public int? SqlErrorNumber { get; }
/// <summary>Initializes a new <see cref="TransientDatabaseException"/>.</summary>
/// <param name="message">The error message.</param>
/// <param name="errorNumber">The SQL Server error number, if available.</param>
/// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>).</param>
public TransientDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
: base(message, innerException)
{
SqlErrorNumber = errorNumber;
}
}
/// <summary>
/// Signals a permanent database failure that must not be retried — the SQL-path
/// parallel of <see cref="PermanentExternalSystemException"/>. Returned
/// synchronously to the calling script on the immediate attempt and parks the
/// message on the store-and-forward retry path.
/// </summary>
public class PermanentDatabaseException : Exception
{
/// <summary>Gets the SQL Server error number that caused the failure, if known.</summary>
public int? SqlErrorNumber { get; }
/// <summary>Initializes a new <see cref="PermanentDatabaseException"/>.</summary>
/// <param name="message">The error message.</param>
/// <param name="errorNumber">The SQL Server error number, if available.</param>
/// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>).</param>
public PermanentDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
: base(message, innerException)
{
SqlErrorNumber = errorNumber;
}
}