using System.Data.Common;
using System.IO;
using System.Net.Sockets;
using Microsoft.Data.SqlClient;
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
/// <summary>
/// M2.3 (#7): classifies a SQL Server failure as <b>transient</b> (a brief wait /
/// retry may succeed — buffer to store-and-forward) or <b>permanent</b> (the identical
/// statement cannot succeed — return to the script / park the buffered message).
/// </summary>
/// <remarks>
/// <para>
/// This is the database-side parallel of <c>ErrorClassifier</c> (the HTTP path).
/// The two are kept separate because the inputs differ: HTTP keys off status
/// codes / exception types, SQL keys off <see cref="SqlException.Number"/>.
/// </para>
/// <para>
/// <b>Transient set.</b> Only connection-loss, timeout, deadlock, and Azure SQL
/// throttle/availability error numbers are transient — failures whose cause is
/// external to the statement and may clear on its own:
/// </para>
/// <list type="bullet">
/// <item><description><c>-2</c> — query / command timeout expired.</description></item>
/// <item><description><c>-1</c> — a connection-level error (general SqlClient connection failure).</description></item>
/// <item><description><c>2</c> — SQL Server / network instance not found or not accessible.</description></item>
/// <item><description><c>53</c> — network path to the server was not found.</description></item>
/// <item><description><c>64</c> — connection terminated mid-session (transport error).</description></item>
/// <item><description><c>233</c> — no process on the other end of the named pipe.</description></item>
/// <item><description><c>1205</c> — the session was chosen as a deadlock victim.</description></item>
/// <item><description><c>10053</c> — transport-level abort (software caused connection abort).</description></item>
/// <item><description><c>10054</c> — connection reset by peer.</description></item>
/// <item><description><c>10060</c> — connection attempt timed out.</description></item>
/// <item><description><c>40197</c> — Azure SQL service error processing the request; retry.</description></item>
/// <item><description><c>40501</c> — Azure SQL service is busy.</description></item>
/// <item><description><c>40613</c> — Azure SQL database is currently unavailable.</description></item>
/// <item><description><c>49918</c> / <c>49919</c> / <c>49920</c> — Azure SQL throttling (too many requests / operations).</description></item>
/// </list>
/// <para>
/// <b>Everything else is permanent.</b> Constraint violations (547, 2627, 2601),
/// syntax errors (102, 156, 207, 208), and permission errors (229, 230, 262) are
/// the obvious permanent cases, but the policy is broader: any error number not
/// in the transient set — including unknown / undocumented / ambiguous numbers —
/// is treated as permanent. Fail-fast is the safer default: silently
/// retrying an unrecognised error forever (the pre-M2.3 behaviour) hides
/// authoring bugs and can replay duplicate side effects. A genuinely transient
/// number we have not enumerated will, at worst, surface to the script as a
/// permanent failure — a loud, fixable outcome — rather than spin in an
/// unbounded retry loop.
/// </para>
/// </remarks>
public static class SqlErrorClassifier
{
    /// <summary>
    /// The complete set of SQL Server error numbers treated as transient. See the
    /// type-level remarks for the per-number rationale. Anything outside this set
    /// is permanent.
    /// </summary>
    private static readonly HashSet<int> TransientErrorNumbers = new()
    {
        -2, -1, 2, 53, 64, 233, 1205,
        10053, 10054, 10060,
        40197, 40501, 40613,
        49918, 49919, 49920,
    };

    /// <summary>
    /// Determines whether a SQL Server error number represents a transient
    /// failure. Unknown / undocumented numbers default to permanent
    /// (<see langword="false"/>) — see the type-level remarks.
    /// </summary>
    /// <param name="errorNumber">The SQL Server error number (e.g. <c>1205</c>).</param>
    /// <returns>
    /// <see langword="true"/> if the number is in the transient set; otherwise
    /// <see langword="false"/>.
    /// </returns>
    public static bool IsTransient(int errorNumber) => TransientErrorNumbers.Contains(errorNumber);

    /// <summary>
    /// Determines whether a <see cref="SqlException"/> represents a transient
    /// failure by classifying its top-level <see cref="SqlException.Number"/>.
    /// </summary>
    /// <param name="exception">The SQL exception to classify.</param>
    /// <returns>
    /// <see langword="true"/> if the exception's error number is transient; otherwise
    /// <see langword="false"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="exception"/> is <see langword="null"/>.</exception>
    public static bool IsTransient(SqlException exception)
    {
        ArgumentNullException.ThrowIfNull(exception);
        return IsTransient(exception.Number);
    }

    /// <summary>
    /// Determines whether an arbitrary <see cref="Exception"/> represents a
    /// transient database failure — the SQL-path parallel of
    /// <c>ErrorClassifier.IsTransient</c> on the HTTP path.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A live DB outage does not always surface as a <see cref="SqlException"/>:
    /// once the underlying connection / socket is torn down, the driver raises
    /// transport-level exceptions instead. These are <b>retryable</b> — a retry
    /// can succeed once the server is reachable again — so they are classified
    /// transient (buffered to store-and-forward) rather than escaping unclassified
    /// to crash the calling Script Execution Actor. The transient set:
    /// </para>
    /// <list type="bullet">
    /// <item><description><see cref="InvalidOperationException"/> — connection-state error (e.g. "the connection is not open" / pooled connection broken).</description></item>
    /// <item><description><see cref="IOException"/> — transport read/write failure mid-session.</description></item>
    /// <item><description><see cref="SocketException"/> — TCP-level failure (connection refused/reset/timed out).</description></item>
    /// <item><description><see cref="TimeoutException"/> — command / connection timeout surfaced as a CLR <see cref="TimeoutException"/>.</description></item>
    /// <item><description><see cref="TaskCanceledException"/> — driver-level cancellation/timeout NOT tied to a caller token (the caller-token case is handled before classification — see the gateway's ordered catches).</description></item>
    /// <item><description>Any <see cref="DbException"/> that is NOT a <see cref="SqlException"/> — a provider/driver transport error (a real <see cref="SqlException"/> is classified by error number via the overloads above, never here).</description></item>
    /// </list>
    /// <para>
    /// Everything else is <b>NOT</b> transient and must propagate, exactly as the
    /// HTTP path lets genuinely-unexpected exceptions escape past its
    /// <c>catch (Exception ex) when (ErrorClassifier.IsTransient(ex))</c> filter.
    /// Authoring bugs (<see cref="ArgumentException"/>, <see cref="NullReferenceException"/>,
    /// etc.) are loud, fixable failures — silently buffering and retrying them
    /// forever would hide the bug.
    /// </para>
    /// </remarks>
    /// <param name="exception">The exception to classify.</param>
    /// <returns>
    /// <see langword="true"/> for a transport/connection/timeout/driver exception; otherwise
    /// <see langword="false"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="exception"/> is <see langword="null"/>.</exception>
    public static bool IsTransient(Exception exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        // A real SqlException is classified by its error number (the overloads
        // above), never by type — fall back to the number-based policy so an
        // unknown SqlException stays permanent (fail-fast) rather than being
        // swept up as transient by the DbException catch-all below.
        if (exception is SqlException sql)
        {
            return IsTransient(sql);
        }

        // Only the exception itself is inspected; InnerException chains and
        // AggregateException contents are not unwrapped here.
        // NOTE(review): ObjectDisposedException derives from
        // InvalidOperationException, so it is classified transient by the first
        // arm below — confirm that is intended rather than an authoring bug
        // being retried.
        return exception is InvalidOperationException
            or IOException
            or SocketException
            or TimeoutException
            or TaskCanceledException
            or DbException; // any non-SqlException DbException (SqlException handled above)
    }

    /// <summary>
    /// Classifies a <see cref="SqlException"/> and rethrows it as the matching
    /// strongly-typed failure: <see cref="TransientDatabaseException"/> for a
    /// transient error number, <see cref="PermanentDatabaseException"/> otherwise.
    /// Mirrors <c>ErrorClassifier.IsTransient</c> plus the throw of the
    /// corresponding typed exception on the HTTP path — the callers then branch
    /// on the typed exception rather than on the raw <see cref="SqlException"/>.
    /// </summary>
    /// <param name="context">A short human-readable description of the failing operation (e.g. the connection name).</param>
    /// <param name="exception">The SQL exception to classify and wrap.</param>
    /// <returns>
    /// This method never returns normally — it always throws. The
    /// <see cref="Exception"/> return type only lets call sites write
    /// <c>throw SqlErrorClassifier.Throw(...)</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="exception"/> is <see langword="null"/>.</exception>
    /// <exception cref="TransientDatabaseException">Thrown when the error number is transient.</exception>
    /// <exception cref="PermanentDatabaseException">Thrown when the error number is permanent (the default).</exception>
    [System.Diagnostics.CodeAnalysis.DoesNotReturn]
    public static Exception Throw(string context, SqlException exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        if (IsTransient(exception))
        {
            throw new TransientDatabaseException(
                $"Transient SQL error {exception.Number} on {context}: {exception.Message}",
                exception.Number,
                exception);
        }

        // Anything not explicitly enumerated as transient fails fast.
        throw new PermanentDatabaseException(
            $"Permanent SQL error {exception.Number} on {context}: {exception.Message}",
            exception.Number,
            exception);
    }
}
/// <summary>
/// Raised for a database failure that is expected to clear on its own and is
/// therefore eligible for store-and-forward retry. This is the SQL-path
/// counterpart of the transient failure exception used on the HTTP path.
/// </summary>
public class TransientDatabaseException : Exception
{
    /// <summary>
    /// Creates a new <see cref="TransientDatabaseException"/>.
    /// </summary>
    /// <param name="message">The error message.</param>
    /// <param name="errorNumber">The SQL Server error number, or <see langword="null"/> when none is available.</param>
    /// <param name="innerException">Optional inner exception (typically the original driver exception).</param>
    public TransientDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
        : base(message, innerException)
        => SqlErrorNumber = errorNumber;

    /// <summary>
    /// The SQL Server error number behind the failure, or <see langword="null"/>
    /// when it is not known.
    /// </summary>
    public int? SqlErrorNumber { get; }
}
/// <summary>
/// Raised for a database failure that must not be retried. This is the SQL-path
/// counterpart of the permanent failure exception used on the HTTP path. It is
/// returned synchronously to the calling script on the immediate attempt and
/// parks the message on the store-and-forward retry path.
/// </summary>
public class PermanentDatabaseException : Exception
{
    /// <summary>
    /// Creates a new <see cref="PermanentDatabaseException"/>.
    /// </summary>
    /// <param name="message">The error message.</param>
    /// <param name="errorNumber">The SQL Server error number, or <see langword="null"/> when none is available.</param>
    /// <param name="innerException">Optional inner exception (typically the original driver exception).</param>
    public PermanentDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
        : base(message, innerException)
        => SqlErrorNumber = errorNumber;

    /// <summary>
    /// The SQL Server error number behind the failure, or <see langword="null"/>
    /// when it is not known.
    /// </summary>
    public int? SqlErrorNumber { get; }
}