de375ff7ea
ExecuteWriteAsync only caught SqlException, so a live outage surfacing as InvalidOperationException/SocketException/IOException/TimeoutException escaped unclassified and crashed the script actor instead of buffering. Mirror the HTTP path: propagate OperationCanceledException on cancellation, classify transport exceptions as transient (buffer+retry), let unexpected exceptions propagate.
218 lines
11 KiB
C#
218 lines
11 KiB
C#
using System.Data.Common;
|
|
using System.IO;
|
|
using System.Net.Sockets;
|
|
using Microsoft.Data.SqlClient;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
|
|
|
|
/// <summary>
/// M2.3 (#7): decides whether a SQL Server failure is <b>transient</b> (waiting
/// briefly and retrying may succeed, so the write is buffered to
/// store-and-forward) or <b>permanent</b> (re-running the identical statement
/// cannot succeed, so the failure is returned to the script / the buffered
/// message is parked).
/// </summary>
/// <remarks>
/// <para>
/// Database-side counterpart of <see cref="ErrorClassifier"/> (the HTTP path).
/// They are separate types because their inputs differ: the HTTP classifier
/// looks at status codes and exception types, whereas this one looks at
/// <see cref="SqlException.Number"/>.
/// </para>
/// <para>
/// <b>Transient set.</b> Only error numbers whose cause lies outside the
/// statement and may clear by itself — connection loss, timeouts, deadlocks and
/// Azure SQL throttling / availability — are transient:
/// <list type="bullet">
/// <item><c>-2</c> — query / command timeout expired.</item>
/// <item><c>-1</c> — a connection-level error (general SqlClient connection failure).</item>
/// <item><c>2</c> — SQL Server / network instance not found or not accessible.</item>
/// <item><c>53</c> — network path to the server was not found.</item>
/// <item><c>64</c> — connection terminated mid-session (transport error).</item>
/// <item><c>233</c> — no process on the other end of the named pipe.</item>
/// <item><c>1205</c> — the session was chosen as a deadlock victim.</item>
/// <item><c>10053</c> — transport-level abort (software caused connection abort).</item>
/// <item><c>10054</c> — connection reset by peer.</item>
/// <item><c>10060</c> — connection attempt timed out.</item>
/// <item><c>40197</c> — Azure SQL service error processing the request; retry.</item>
/// <item><c>40501</c> — Azure SQL service is busy.</item>
/// <item><c>40613</c> — Azure SQL database is currently unavailable.</item>
/// <item><c>49918</c> / <c>49919</c> / <c>49920</c> — Azure SQL throttling (too many requests / operations).</item>
/// </list>
/// </para>
/// <para>
/// <b>Every other number is permanent.</b> The obvious members of that group are
/// constraint violations (547, 2627, 2601), syntax errors (102, 156, 207, 208)
/// and permission errors (229, 230, 262), but the rule is wider than those:
/// <b>a number that is absent from the transient set — unknown, undocumented or
/// ambiguous ones included — is permanent.</b> Failing fast is the safer
/// default, because retrying an unrecognised error forever without surfacing it
/// (the pre-M2.3 behaviour) conceals authoring bugs and can replay duplicate
/// side effects. The worst outcome for a truly transient number that is missing
/// from the list is that the script sees a permanent failure — loud and fixable
/// — instead of an unbounded retry loop.
/// </para>
/// </remarks>
public static class SqlErrorClassifier
{
    /// <summary>
    /// Every SQL Server error number that counts as transient; a number that is
    /// not listed here is permanent. The type-level remarks carry the policy
    /// behind the list.
    /// </summary>
    private static readonly HashSet<int> TransientErrorNumbers = new HashSet<int>
    {
        -2,     // query / command timeout expired
        -1,     // connection-level error (general SqlClient connection failure)
        2,      // SQL Server / network instance not found or not accessible
        53,     // network path to the server was not found
        64,     // connection terminated mid-session (transport error)
        233,    // no process on the other end of the named pipe
        1205,   // session chosen as a deadlock victim
        10053,  // transport-level abort (software caused connection abort)
        10054,  // connection reset by peer
        10060,  // connection attempt timed out
        40197,  // Azure SQL service error processing the request; retry
        40501,  // Azure SQL service is busy
        40613,  // Azure SQL database is currently unavailable
        49918,  // Azure SQL throttling
        49919,  // Azure SQL throttling
        49920,  // Azure SQL throttling
    };

    /// <summary>
    /// Reports whether a SQL Server error number belongs to the transient set.
    /// A number that is not recognised is reported as permanent
    /// (<see langword="false"/>), per the policy in the type-level remarks.
    /// </summary>
    /// <param name="errorNumber">The SQL Server error number (e.g. <see cref="SqlException.Number"/>).</param>
    /// <returns><see langword="true"/> when the number is listed as transient; <see langword="false"/> for any other number.</returns>
    public static bool IsTransient(int errorNumber)
    {
        return TransientErrorNumbers.Contains(errorNumber);
    }

    /// <summary>
    /// Reports whether a <see cref="SqlException"/> is transient, judged solely
    /// by its top-level <see cref="SqlException.Number"/>.
    /// </summary>
    /// <param name="exception">The SQL exception to classify.</param>
    /// <returns><see langword="true"/> when the exception's number is listed as transient; otherwise <see langword="false"/>.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="exception"/> is <see langword="null"/>.</exception>
    public static bool IsTransient(SqlException exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        int topLevelNumber = exception.Number;
        return TransientErrorNumbers.Contains(topLevelNumber);
    }

    /// <summary>
    /// Reports whether an arbitrary <see cref="Exception"/> is a transient
    /// database failure. This is the SQL-path counterpart of
    /// <see cref="ErrorClassifier.IsTransient(System.Exception)"/> on the HTTP path.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A live database outage is not always reported as a
    /// <see cref="SqlException"/>: after the connection / socket underneath has
    /// been torn down the driver raises transport-level exceptions instead.
    /// Those are <b>retryable</b> — the retry can succeed as soon as the server
    /// is reachable again — so they are classified transient (buffered to
    /// store-and-forward) instead of escaping unclassified and crashing the
    /// calling Script Execution Actor. The exception types treated as transient:
    /// </para>
    /// <list type="bullet">
    /// <item><see cref="InvalidOperationException"/> — connection-state error (e.g. "the connection is not open" / pooled connection broken).</item>
    /// <item><see cref="IOException"/> — transport read/write failure mid-session.</item>
    /// <item><see cref="SocketException"/> — TCP-level failure (connection refused/reset/timed out).</item>
    /// <item><see cref="TimeoutException"/> — command / connection timeout surfaced as a CLR <see cref="TimeoutException"/>.</item>
    /// <item><see cref="TaskCanceledException"/> — driver-level cancellation/timeout NOT tied to a caller token (the caller-token case is handled before classification — see the gateway's ordered catches).</item>
    /// <item>Any <see cref="DbException"/> that is NOT a <see cref="SqlException"/> — a provider/driver transport error (a real <see cref="SqlException"/> is always classified by error number, never by type).</item>
    /// </list>
    /// <para>
    /// <b>No other exception type is transient</b>; such exceptions must
    /// propagate, just as the HTTP path lets genuinely unexpected exceptions
    /// escape past its
    /// <c>catch (Exception ex) when (ErrorClassifier.IsTransient(ex))</c> filter.
    /// Authoring bugs (<see cref="ArgumentException"/>,
    /// <see cref="NullReferenceException"/>, etc.) are loud, fixable failures —
    /// buffering and retrying them forever would only hide the bug.
    /// </para>
    /// </remarks>
    /// <param name="exception">The exception to classify.</param>
    /// <returns><see langword="true"/> for a transport/connection/timeout/driver exception; otherwise <see langword="false"/>.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="exception"/> is <see langword="null"/>.</exception>
    public static bool IsTransient(Exception exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        return exception switch
        {
            // This arm must stay first: SqlException is itself a DbException, and
            // matching it here routes it through the error-number policy so that an
            // unknown SqlException remains permanent (fail-fast) instead of being
            // caught by the DbException arm below.
            SqlException sqlFailure => IsTransient(sqlFailure),

            // Transport / connection / timeout / driver failures, including any
            // DbException that is not a SqlException.
            InvalidOperationException
                or IOException
                or SocketException
                or TimeoutException
                or TaskCanceledException
                or DbException => true,

            // Anything unexpected is not transient and must propagate.
            _ => false,
        };
    }

    /// <summary>
    /// Wraps a <see cref="SqlException"/> in the strongly-typed failure that
    /// matches its classification and throws it:
    /// <see cref="TransientDatabaseException"/> when the error number is
    /// transient, <see cref="PermanentDatabaseException"/> in every other case.
    /// This mirrors <see cref="ErrorClassifier.AsTransient(string, System.Exception?)"/>
    /// plus the throw of <see cref="PermanentExternalSystemException"/> on the
    /// HTTP path, so that callers branch on the typed exception and not on the
    /// raw <see cref="SqlException"/>.
    /// </summary>
    /// <param name="context">A short human-readable description of the failing operation (e.g. the connection name).</param>
    /// <param name="exception">The SQL exception to classify and wrap.</param>
    /// <returns>Nothing — the method always exits by throwing.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="exception"/> is <see langword="null"/>.</exception>
    /// <exception cref="TransientDatabaseException">Thrown when the error number is transient.</exception>
    /// <exception cref="PermanentDatabaseException">Thrown when the error number is permanent (the default).</exception>
    public static Exception Throw(string context, SqlException exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        // Permanent is the default outcome, so it is handled first as the guard.
        if (!IsTransient(exception))
        {
            throw new PermanentDatabaseException(
                $"Permanent SQL error {exception.Number} on {context}: {exception.Message}",
                exception.Number,
                exception);
        }

        throw new TransientDatabaseException(
            $"Transient SQL error {exception.Number} on {context}: {exception.Message}",
            exception.Number,
            exception);
    }
}
|
|
|
|
/// <summary>
/// Raised for a transient database failure, i.e. one that is suitable for a
/// store-and-forward retry. It is the SQL-path counterpart of
/// <see cref="TransientExternalSystemException"/>.
/// </summary>
public class TransientDatabaseException : Exception
{
    /// <summary>Creates a new <see cref="TransientDatabaseException"/>.</summary>
    /// <param name="message">The error message.</param>
    /// <param name="errorNumber">The SQL Server error number, if available; defaults to <see langword="null"/>.</param>
    /// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>); defaults to <see langword="null"/>.</param>
    public TransientDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
        : base(message, innerException)
        => SqlErrorNumber = errorNumber;

    /// <summary>
    /// Gets the SQL Server error number behind the failure, or
    /// <see langword="null"/> when none was supplied.
    /// </summary>
    public int? SqlErrorNumber { get; }
}
|
|
|
|
/// <summary>
/// Raised for a permanent database failure — one that must not be retried. It
/// is the SQL-path counterpart of <see cref="PermanentExternalSystemException"/>.
/// On the immediate attempt it is returned synchronously to the calling script;
/// on the store-and-forward retry path it causes the message to be parked.
/// </summary>
public class PermanentDatabaseException : Exception
{
    /// <summary>Creates a new <see cref="PermanentDatabaseException"/>.</summary>
    /// <param name="message">The error message.</param>
    /// <param name="errorNumber">The SQL Server error number, if available; defaults to <see langword="null"/>.</param>
    /// <param name="innerException">Optional inner exception (typically the original <see cref="SqlException"/>); defaults to <see langword="null"/>.</param>
    public PermanentDatabaseException(string message, int? errorNumber = null, Exception? innerException = null)
        : base(message, innerException)
        => SqlErrorNumber = errorNumber;

    /// <summary>
    /// Gets the SQL Server error number behind the failure, or
    /// <see langword="null"/> when none was supplied.
    /// </summary>
    public int? SqlErrorNumber { get; }
}
|