fix(db): classify transient vs permanent SQL errors in Database.CachedWrite (#7)
CachedWrite buffered ALL write failures and retried forever, never returning a synchronous failure to the script — permanent SQL errors (constraint/syntax/ permission) were treated as transient. Mirror the External-System API path: attempt immediately, return Failed synchronously on permanent SQL errors (no buffering), buffer only transient errors; the S&F retry path parks permanent failures instead of retrying forever. New SqlErrorClassifier + PermanentDatabaseException.
This commit is contained in:
@@ -0,0 +1,160 @@
|
||||
using Microsoft.Data.SqlClient;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
|
||||
|
||||
/// <summary>
|
||||
/// M2.3 (#7): classifies a SQL Server failure as transient (a brief wait /
|
||||
/// retry may succeed — buffer to store-and-forward) or permanent (the identical
|
||||
/// statement cannot succeed — return to the script / park the buffered message).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// This is the database-side parallel of <see cref="ErrorClassifier"/> (the
|
||||
/// HTTP path). The two are kept separate because the inputs differ: HTTP keys
|
||||
/// off status codes / exception types, SQL keys off
|
||||
/// <see cref="SqlException.Number"/>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Transient set.</b> Only connection-loss, timeout, deadlock, and Azure SQL
|
||||
/// throttle/availability error numbers are transient — failures whose cause is
|
||||
/// external to the statement and may clear on its own:
|
||||
/// <list type="bullet">
|
||||
/// <item><c>-2</c> — query / command timeout expired.</item>
|
||||
/// <item><c>-1</c> — a connection-level error (general SqlClient connection failure).</item>
|
||||
/// <item><c>2</c> — SQL Server / network instance not found or not accessible.</item>
|
||||
/// <item><c>53</c> — network path to the server was not found.</item>
|
||||
/// <item><c>64</c> — connection terminated mid-session (transport error).</item>
|
||||
/// <item><c>233</c> — no process on the other end of the named pipe.</item>
|
||||
/// <item><c>1205</c> — the session was chosen as a deadlock victim.</item>
|
||||
/// <item><c>10053</c> — transport-level abort (software caused connection abort).</item>
|
||||
/// <item><c>10054</c> — connection reset by peer.</item>
|
||||
/// <item><c>10060</c> — connection attempt timed out.</item>
|
||||
/// <item><c>40197</c> — Azure SQL service error processing the request; retry.</item>
|
||||
/// <item><c>40501</c> — Azure SQL service is busy.</item>
|
||||
/// <item><c>40613</c> — Azure SQL database is currently unavailable.</item>
|
||||
/// <item><c>49918</c> / <c>49919</c> / <c>49920</c> — Azure SQL throttling (too many requests / operations).</item>
|
||||
/// </list>
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Everything else is permanent.</b> Constraint violations (547, 2627, 2601),
|
||||
/// syntax errors (102, 156, 207, 208), and permission errors (229, 230, 262) are
|
||||
/// the obvious permanent cases, but the policy is broader: <b>any error number not
|
||||
/// in the transient set — including unknown / undocumented / ambiguous numbers —
|
||||
/// is treated as permanent.</b> Fail-fast is the safer default: silently
|
||||
/// retrying an unrecognised error forever (the pre-M2.3 behaviour) hides
|
||||
/// authoring bugs and can replay duplicate side effects. A genuinely transient
|
||||
/// number we have not enumerated will, at worst, surface to the script as a
|
||||
/// permanent failure — a loud, fixable outcome — rather than spin in an
|
||||
/// unbounded retry loop.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public static class SqlErrorClassifier
|
||||
{
|
||||
/// <summary>
|
||||
/// The complete set of SQL Server error numbers treated as transient. See the
|
||||
/// type-level remarks for the per-number rationale. Anything outside this set
|
||||
/// is permanent.
|
||||
/// </summary>
|
||||
private static readonly HashSet<int> TransientErrorNumbers = new()
|
||||
{
|
||||
-2, -1, 2, 53, 64, 233, 1205,
|
||||
10053, 10054, 10060,
|
||||
40197, 40501, 40613,
|
||||
49918, 49919, 49920,
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Determines whether a SQL Server error number represents a transient
|
||||
/// failure. Unknown / undocumented numbers default to permanent
|
||||
/// (<see langword="false"/>) — see the type-level remarks.
|
||||
/// </summary>
|
||||
/// <param name="errorNumber">The SQL Server error number (e.g. <see cref="SqlException.Number"/>).</param>
|
||||
/// <returns><see langword="true"/> if the number is in the transient set; otherwise <see langword="false"/>.</returns>
|
||||
public static bool IsTransient(int errorNumber) => TransientErrorNumbers.Contains(errorNumber);
|
||||
|
||||
/// <summary>
|
||||
/// Determines whether a <see cref="SqlException"/> represents a transient
|
||||
/// failure by classifying its top-level <see cref="SqlException.Number"/>.
|
||||
/// </summary>
|
||||
/// <param name="exception">The SQL exception to classify.</param>
|
||||
/// <returns><see langword="true"/> if the exception's error number is transient; otherwise <see langword="false"/>.</returns>
|
||||
public static bool IsTransient(SqlException exception)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(exception);
|
||||
return IsTransient(exception.Number);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Classifies a <see cref="SqlException"/> and rethrows it as the matching
|
||||
/// strongly-typed failure: <see cref="TransientDatabaseException"/> for a
|
||||
/// transient error number, <see cref="PermanentDatabaseException"/> otherwise.
|
||||
/// Mirrors <see cref="ErrorClassifier.AsTransient(string, System.Exception?)"/>
|
||||
/// + the throw of <see cref="PermanentExternalSystemException"/> on the HTTP
|
||||
/// path — the callers then branch on the typed exception rather than on the
|
||||
/// raw <see cref="SqlException"/>.
|
||||
/// </summary>
|
||||
/// <param name="context">A short human-readable description of the failing operation (e.g. the connection name).</param>
|
||||
/// <param name="exception">The SQL exception to classify and wrap.</param>
|
||||
/// <returns>This method never returns normally — it always throws.</returns>
|
||||
/// <exception cref="TransientDatabaseException">Thrown when the error number is transient.</exception>
|
||||
/// <exception cref="PermanentDatabaseException">Thrown when the error number is permanent (the default).</exception>
|
||||
public static Exception Throw(string context, SqlException exception)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(exception);
|
||||
|
||||
if (IsTransient(exception))
|
||||
{
|
||||
throw new TransientDatabaseException(
|
||||
$"Transient SQL error {exception.Number} on {context}: {exception.Message}",
|
||||
exception.Number,
|
||||
exception);
|
||||
}
|
||||
|
||||
throw new PermanentDatabaseException(
|
||||
$"Permanent SQL error {exception.Number} on {context}: {exception.Message}",
|
||||
exception.Number,
|
||||
exception);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Signals a transient database failure suitable for store-and-forward retry —
|
||||
/// the SQL-path parallel of <see cref="TransientExternalSystemException"/>.
|
||||
/// </summary>
|
||||
public class TransientDatabaseException : Exception
|
||||
{
|
||||
/// <summary>Gets the SQL Server error number that caused the failure, if known.</summary>
|
||||
public int? SqlErrorNumber { get; }
|
||||
|
||||
/// <summary>Initializes a new <see cref="TransientDatabaseException"/>.</summary>
|
||||
/// <param name="message">The error message.</param>
|
||||
/// <param name="errorNumber">The SQL Server error number, if available.</param>
|
||||
/// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>).</param>
|
||||
public TransientDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
|
||||
: base(message, innerException)
|
||||
{
|
||||
SqlErrorNumber = errorNumber;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Signals a permanent database failure that must not be retried — the SQL-path
|
||||
/// parallel of <see cref="PermanentExternalSystemException"/>. Returned
|
||||
/// synchronously to the calling script on the immediate attempt and parks the
|
||||
/// message on the store-and-forward retry path.
|
||||
/// </summary>
|
||||
public class PermanentDatabaseException : Exception
|
||||
{
|
||||
/// <summary>Gets the SQL Server error number that caused the failure, if known.</summary>
|
||||
public int? SqlErrorNumber { get; }
|
||||
|
||||
/// <summary>Initializes a new <see cref="PermanentDatabaseException"/>.</summary>
|
||||
/// <param name="message">The error message.</param>
|
||||
/// <param name="errorNumber">The SQL Server error number, if available.</param>
|
||||
/// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>).</param>
|
||||
public PermanentDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
|
||||
: base(message, innerException)
|
||||
{
|
||||
SqlErrorNumber = errorNumber;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user