Files
ScadaBridge/src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/SqlErrorClassifier.cs
T
Joseph Doherty de375ff7ea fix(db): classify non-SqlException DB outages as transient; propagate cancellation (#7)
ExecuteWriteAsync only caught SqlException, so a live outage surfacing as
InvalidOperationException/SocketException/IOException/TimeoutException escaped
unclassified and crashed the script actor instead of buffering. Mirror the HTTP
path: propagate OperationCanceledException on cancellation, classify transport
exceptions as transient (buffer+retry), let unexpected exceptions propagate.
2026-06-15 14:03:25 -04:00

218 lines
11 KiB
C#

using System.Data.Common;
using System.IO;
using System.Net.Sockets;
using Microsoft.Data.SqlClient;
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
/// <summary>
/// M2.3 (#7): classifies a SQL Server failure as transient (a brief wait /
/// retry may succeed — buffer to store-and-forward) or permanent (the identical
/// statement cannot succeed — return to the script / park the buffered message).
/// </summary>
/// <remarks>
/// <para>
/// This is the database-side parallel of <see cref="ErrorClassifier"/> (the
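/// HTTP path). The two are kept separate because the inputs differ: HTTP keys
/// off status codes / exception types, SQL keys primarily off
/// <see cref="SqlException.Number"/>, falling back to the exception type only
/// for failures that are not a <see cref="SqlException"/>.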
/// </para>
/// <para>
/// <b>Transient set.</b> Only connection-loss, timeout, deadlock, and Azure SQL
/// throttle/availability error numbers are transient — failures whose cause is
/// external to the statement and may clear on its own:
/// <list type="bullet">
/// <item><c>-2</c> — query / command timeout expired.</item>
/// <item><c>-1</c> — a connection-level error (general SqlClient connection failure).</item>
/// <item><c>2</c> — SQL Server / network instance not found or not accessible.</item>
/// <item><c>53</c> — network path to the server was not found.</item>
/// <item><c>64</c> — connection terminated mid-session (transport error).</item>
/// <item><c>233</c> — no process on the other end of the named pipe.</item>
/// <item><c>1205</c> — the session was chosen as a deadlock victim.</item>
/// <item><c>10053</c> — transport-level abort (software caused connection abort).</item>
/// <item><c>10054</c> — connection reset by peer.</item>
/// <item><c>10060</c> — connection attempt timed out.</item>
/// <item><c>40197</c> — Azure SQL service error processing the request; retry.</item>
/// <item><c>40501</c> — Azure SQL service is busy.</item>
/// <item><c>40613</c> — Azure SQL database is currently unavailable.</item>
/// <item><c>49918</c> / <c>49919</c> / <c>49920</c> — Azure SQL throttling (too many requests / operations).</item>
/// </list>
/// </para>
/// <para>
/// <b>Everything else is permanent.</b> Constraint violations (547, 2627, 2601),
/// syntax errors (102, 156, 207, 208), and permission errors (229, 230, 262) are
/// the obvious permanent cases, but the policy is broader: <b>any error number not
/// in the transient set — including unknown / undocumented / ambiguous numbers —
/// is treated as permanent.</b> Fail-fast is the safer default: silently
/// retrying an unrecognised error forever (the pre-M2.3 behaviour) hides
/// authoring bugs and can replay duplicate side effects. A genuinely transient
/// number we have not enumerated will, at worst, surface to the script as a
/// permanent failure — a loud, fixable outcome — rather than spin in an
/// unbounded retry loop.
/// </para>
/// </remarks>
public static class SqlErrorClassifier
{
    /// <summary>
    /// Every SQL Server error number that is classified as transient. A number
    /// absent from this set is permanent; the per-number rationale lives in the
    /// type-level remarks.
    /// </summary>
    private static readonly HashSet<int> TransientErrorNumbers = new()
    {
        // Timeout, connection-level and network-path failures.
        -2, -1, 2, 53, 64, 233,

        // Deadlock victim.
        1205,

        // Transport-level abort / reset / connection attempt timed out.
        10053, 10054, 10060,

        // Azure SQL service error, busy, database unavailable.
        40197, 40501, 40613,

        // Azure SQL throttling.
        49918, 49919, 49920,
    };

    /// <summary>
    /// Reports whether a raw SQL Server error number belongs to the transient
    /// set. A number that is unknown or undocumented is answered with
    /// <see langword="false"/> (permanent) — see the type-level remarks.
    /// </summary>
    /// <param name="errorNumber">The SQL Server error number (e.g. <see cref="SqlException.Number"/>).</param>
    /// <returns><see langword="true"/> when the number is in the transient set; otherwise <see langword="false"/>.</returns>
    public static bool IsTransient(int errorNumber)
    {
        return TransientErrorNumbers.Contains(errorNumber);
    }

    /// <summary>
    /// Reports whether a <see cref="SqlException"/> is transient, judged solely
    /// by its top-level <see cref="SqlException.Number"/>.
    /// </summary>
    /// <param name="exception">The SQL exception to classify.</param>
    /// <returns><see langword="true"/> when the exception's error number is transient; otherwise <see langword="false"/>.</returns>
    public static bool IsTransient(SqlException exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        int topLevelNumber = exception.Number;
        return IsTransient(topLevelNumber);
    }

    /// <summary>
    /// Reports whether an arbitrary <see cref="Exception"/> is a transient
    /// database failure — the SQL-path counterpart of
    /// <see cref="ErrorClassifier.IsTransient(System.Exception)"/> on the HTTP path.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A live DB outage is not guaranteed to arrive as a <see cref="SqlException"/>:
    /// after the underlying connection / socket has been torn down the driver
    /// raises transport-level exceptions instead. Those are <b>retryable</b> — a
    /// later attempt can succeed once the server is reachable again — so they
    /// are reported as transient (buffered to store-and-forward) instead of
    /// escaping unclassified and crashing the calling Script Execution Actor.
    /// The types treated as transient are:
    /// </para>
    /// <list type="bullet">
    /// <item><see cref="InvalidOperationException"/> — connection-state error (e.g. "the connection is not open" / pooled connection broken).</item>
    /// <item><see cref="IOException"/> — transport read/write failure mid-session.</item>
    /// <item><see cref="SocketException"/> — TCP-level failure (connection refused/reset/timed out).</item>
    /// <item><see cref="TimeoutException"/> — command / connection timeout surfaced as a CLR <see cref="TimeoutException"/>.</item>
    /// <item><see cref="TaskCanceledException"/> — driver-level cancellation/timeout NOT tied to a caller token (the caller-token case is handled before classification — see the gateway's ordered catches).</item>
    /// <item>Any <see cref="DbException"/> other than a <see cref="SqlException"/> — a provider/driver transport error. A real <see cref="SqlException"/> is always judged by its error number, never by its type.</item>
    /// </list>
    /// <para>
    /// <b>Any other exception type is NOT transient</b> and is expected to
    /// propagate, just as the HTTP path lets genuinely-unexpected exceptions
    /// escape past its
    /// <c>catch (Exception ex) when (ErrorClassifier.IsTransient(ex))</c> filter.
    /// Authoring bugs (<see cref="ArgumentException"/>, <see cref="NullReferenceException"/>,
    /// etc.) are loud, fixable failures; buffering and retrying them forever
    /// would only hide the bug.
    /// </para>
    /// </remarks>
    /// <param name="exception">The exception to classify.</param>
    /// <returns><see langword="true"/> for a transport/connection/timeout/driver exception; otherwise <see langword="false"/>.</returns>
    public static bool IsTransient(Exception exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        return exception switch
        {
            // Must stay the first arm: SqlException derives from DbException, and
            // an unrecognised SqlException has to remain permanent (fail-fast)
            // under the number-based policy rather than match the DbException
            // catch-all below.
            SqlException sqlFailure => IsTransient(sqlFailure),

            InvalidOperationException
                or IOException
                or SocketException
                or TimeoutException
                or TaskCanceledException
                or DbException => true, // DbException here is always a non-SqlException one

            _ => false,
        };
    }

    /// <summary>
    /// Wraps a <see cref="SqlException"/> in the strongly-typed failure that
    /// matches its classification and throws it:
    /// <see cref="TransientDatabaseException"/> when the error number is
    /// transient, <see cref="PermanentDatabaseException"/> in every other case.
    /// This mirrors <see cref="ErrorClassifier.AsTransient(string, System.Exception?)"/>
    /// + the throw of <see cref="PermanentExternalSystemException"/> on the HTTP
    /// path, so callers branch on the typed exception instead of on the raw
    /// <see cref="SqlException"/>.
    /// </summary>
    /// <param name="context">A short human-readable description of the failing operation (e.g. the connection name).</param>
    /// <param name="exception">The SQL exception to classify and wrap.</param>
    /// <returns>Never returns normally — every path throws.</returns>
    /// <exception cref="TransientDatabaseException">Thrown when the error number is transient.</exception>
    /// <exception cref="PermanentDatabaseException">Thrown when the error number is permanent (the default).</exception>
    public static Exception Throw(string context, SqlException exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        int errorNumber = exception.Number;

        // Permanent is the default classification, so it is handled as the guard.
        if (!IsTransient(exception))
        {
            throw new PermanentDatabaseException(
                $"Permanent SQL error {errorNumber} on {context}: {exception.Message}",
                errorNumber,
                exception);
        }

        throw new TransientDatabaseException(
            $"Transient SQL error {errorNumber} on {context}: {exception.Message}",
            errorNumber,
            exception);
    }
}
/// <summary>
/// Raised for a database failure that is transient and therefore eligible for
/// store-and-forward retry. It is the SQL-path counterpart of
/// <see cref="TransientExternalSystemException"/>.
/// </summary>
public class TransientDatabaseException : Exception
{
    /// <summary>
    /// Creates a new <see cref="TransientDatabaseException"/>.
    /// </summary>
    /// <param name="message">The error message.</param>
    /// <param name="errorNumber">The SQL Server error number, when one is available.</param>
    /// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>).</param>
    public TransientDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
        : base(message, innerException)
        => SqlErrorNumber = errorNumber;

    /// <summary>
    /// Gets the SQL Server error number behind the failure, or
    /// <see langword="null"/> when none was supplied.
    /// </summary>
    public int? SqlErrorNumber { get; }
}
/// <summary>
/// Raised for a database failure that is permanent and must not be retried. It
/// is the SQL-path counterpart of <see cref="PermanentExternalSystemException"/>.
/// On the immediate attempt it is returned synchronously to the calling script;
/// on the store-and-forward retry path it parks the message.
/// </summary>
public class PermanentDatabaseException : Exception
{
    /// <summary>
    /// Creates a new <see cref="PermanentDatabaseException"/>.
    /// </summary>
    /// <param name="message">The error message.</param>
    /// <param name="errorNumber">The SQL Server error number, when one is available.</param>
    /// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>).</param>
    public PermanentDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
        : base(message, innerException)
        => SqlErrorNumber = errorNumber;

    /// <summary>
    /// Gets the SQL Server error number behind the failure, or
    /// <see langword="null"/> when none was supplied.
    /// </summary>
    public int? SqlErrorNumber { get; }
}