fix(notification-service): resolve NotificationService-014..018 — classify OAuth2 failures, fail on bad auth config, wire NotificationOptions fallback, disposable concurrency limiter

This commit is contained in:
Joseph Doherty
2026-05-17 03:18:33 -04:00
parent bf6bd8de5a
commit f5199e9da9
6 changed files with 454 additions and 41 deletions

View File

@@ -3,6 +3,7 @@ using System.Text.Json;
using MailKit;
using MailKit.Net.Smtp;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using MimeKit;
using ScadaLink.Commons.Entities.Notifications;
using ScadaLink.Commons.Interfaces.Repositories;
@@ -18,26 +19,31 @@ namespace ScadaLink.NotificationService;
/// Transient: connection refused, timeout, SMTP 4xx → hand to S&F.
/// Permanent: SMTP 5xx → returned to script.
/// </summary>
public class NotificationDeliveryService : INotificationDeliveryService
public class NotificationDeliveryService : INotificationDeliveryService, IDisposable
{
private readonly INotificationRepository _repository;
private readonly Func<ISmtpClientWrapper> _smtpClientFactory;
private readonly OAuth2TokenService? _tokenService;
private readonly StoreAndForwardService? _storeAndForward;
private readonly ILogger<NotificationDeliveryService> _logger;
private readonly NotificationOptions _options;
/// <summary>
/// Creates the notification delivery service.
/// </summary>
/// <param name="repository">Repository used to look up notification lists by name.</param>
/// <param name="smtpClientFactory">Factory that produces an <see cref="ISmtpClientWrapper"/> for a delivery.</param>
/// <param name="logger">Logger for delivery diagnostics.</param>
/// <param name="tokenService">Optional OAuth2 token service; <c>null</c> when none is registered.</param>
/// <param name="storeAndForward">Optional store-and-forward service; <c>null</c> when none is registered.</param>
/// <param name="options">
/// Optional fallback settings. When omitted (or <c>null</c>), a default-constructed
/// <see cref="NotificationOptions"/> is used, so existing callers that pass only the
/// earlier parameters keep working.
/// </param>
public NotificationDeliveryService(
    INotificationRepository repository,
    Func<ISmtpClientWrapper> smtpClientFactory,
    ILogger<NotificationDeliveryService> logger,
    OAuth2TokenService? tokenService = null,
    StoreAndForwardService? storeAndForward = null,
    IOptions<NotificationOptions>? options = null)
{
    _repository = repository;
    _smtpClientFactory = smtpClientFactory;
    _logger = logger;
    _tokenService = tokenService;
    _storeAndForward = storeAndForward;

    // NS-017: NotificationOptions supplies the documented fallback values used
    // when a deployed SmtpConfiguration row leaves a field unset (non-positive).
    _options = options?.Value ?? new NotificationOptions();
}
/// <summary>
@@ -50,6 +56,8 @@ public class NotificationDeliveryService : INotificationDeliveryService
string? originInstanceName = null,
CancellationToken cancellationToken = default)
{
ObjectDisposedException.ThrowIf(_disposed, this);
var list = await _repository.GetListByNameAsync(listName, cancellationToken);
if (list == null)
{
@@ -146,6 +154,24 @@ public class NotificationDeliveryService : INotificationDeliveryService
return new NotificationResult(true, null, WasBuffered: true);
}
catch (Exception ex)
{
// NS-015: a failure that ClassifySmtpError does not recognise (Unknown) —
// most importantly an OAuth2 token-fetch failure (HttpRequestException
// from EnsureSuccessStatusCode, or InvalidOperationException from a
// malformed credential triple) — used to fall through all the catch
// clauses above and escape SendAsync as a raw exception to the calling
// script, which the INotificationDeliveryService contract never
// advertises. Convert any otherwise-unhandled exception into a clean,
// credential-scrubbed permanent NotificationResult: returning control to
// the script is the safe default. (A caller-requested cancellation is
// already re-thrown by the filter above and never reaches here.)
var detail = CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials);
_logger.LogError(
"Unclassified failure sending to list {List} ({ExceptionType}): {Detail}",
listName, ex.GetType().Name, detail);
return new NotificationResult(false, $"Notification delivery failed: {detail}");
}
}
/// <summary>
@@ -224,36 +250,103 @@ public class NotificationDeliveryService : INotificationDeliveryService
payload.ListName, CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials));
return false;
}
// Transient SMTP errors propagate out of DeliverAsync — the S&F engine retries.
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// A handler shutdown cancellation is neither a delivery success nor a
// permanent failure — let it propagate so the engine does not park.
throw;
}
catch (Exception ex) when (IsTransientSmtpError(ex, cancellationToken))
{
// A typed transient SMTP error: re-throw so the S&F engine retries.
throw;
}
catch (Exception ex)
{
// NS-014: an exception ClassifySmtpError does not recognise (Unknown) —
// chiefly an OAuth2 token-fetch failure — used to escape this handler.
// The S&F engine treats ANY thrown exception as transient, so a
// permanently-broken config (bad client secret, malformed credential
// triple) was retried on every sweep until MaxRetries, burning token
// endpoint calls. Decide deliberately rather than letting it leak:
// - an HttpRequestException with a 5xx token-endpoint status is a
// transient outage → re-throw so the engine retries;
// - everything else (a 4xx/401 token rejection, a malformed credential
// InvalidOperationException, any other unclassified fault) is not
// fixable by retrying → return false so the message is parked.
if (ex is HttpRequestException { StatusCode: { } status } && (int)status is >= 500 and < 600)
{
_logger.LogWarning(
"Buffered notification to list '{List}' hit a transient OAuth2 token-endpoint error ({Status}); will retry.",
payload.ListName, (int)status);
throw;
}
_logger.LogError(
"Buffered notification to list '{List}' failed with a non-retryable error ({ExceptionType}: {Detail}); parking.",
payload.ListName, ex.GetType().Name,
CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials));
return false;
}
}
private sealed record BufferedNotification(string ListName, string Subject, string Message);
/// <summary>
/// NS-007: throttles concurrent SMTP deliveries to the configured
/// <c>MaxConcurrentConnections</c>. One SMTP config is deployed per site, so the
/// limit is a stable per-site invariant; it is captured lazily on first use,
/// from the first SMTP config seen.
/// NS-018: a <see cref="Lazy{T}"/> replaces the hand-rolled double-checked
/// init — its publication is correctly synchronised (no lock-free read of a
/// non-volatile field) and it is disposed in <see cref="Dispose"/>.
/// </summary>
private SemaphoreSlim? _concurrencyLimiter;
private Lazy<SemaphoreSlim>? _concurrencyLimiter;
private readonly object _limiterLock = new();
private bool _disposed;
/// <summary>
/// Returns the limiter that caps concurrent SMTP deliveries, creating it on
/// first use.
/// </summary>
/// <param name="config">
/// SMTP configuration whose <c>MaxConcurrentConnections</c> sizes the limiter
/// when it is first created. Once a limiter exists it is reused and this value
/// no longer affects its size.
/// </param>
/// <returns>The single <see cref="SemaphoreSlim"/> held by this service instance.</returns>
/// <exception cref="ObjectDisposedException">The service has already been disposed.</exception>
private SemaphoreSlim GetConcurrencyLimiter(SmtpConfiguration config)
{
    // Design-doc default, used when neither the deployed row nor
    // NotificationOptions supplies a positive value. A non-positive size
    // would make the SemaphoreSlim constructor throw (NS-007).
    const int DefaultMaxConcurrentConnections = 5;

    // NS-018: the limiter is sized once; capture the size now so the Lazy
    // factory does not close over a value that could change between calls.
    int configured;
    if (config.MaxConcurrentConnections > 0)
    {
        configured = config.MaxConcurrentConnections;
    }
    else if (_options.MaxConcurrentConnections > 0)
    {
        // NS-017: the deployed row leaves the value unset; fall back to the
        // NotificationOptions value.
        configured = _options.MaxConcurrentConnections;
    }
    else
    {
        configured = DefaultMaxConcurrentConnections;
    }

    lock (_limiterLock)
    {
        // Checked under the same lock Dispose takes, so a limiter is never
        // created or handed out after disposal has started.
        ObjectDisposedException.ThrowIf(_disposed, this);

        _concurrencyLimiter ??= new Lazy<SemaphoreSlim>(
            () => new SemaphoreSlim(configured, configured));
        return _concurrencyLimiter.Value;
    }
}
/// <summary>
/// NS-018: marks the service as disposed and releases the concurrency limiter
/// if one was ever materialised. Per the NS-018 note the service is registered
/// as a scoped DI service, so an undisposed <see cref="SemaphoreSlim"/> would
/// otherwise be left behind for every scope. Calling this more than once is
/// safe: only the first call has any effect.
/// </summary>
public void Dispose()
{
    bool firstCall;

    lock (_limiterLock)
    {
        firstCall = !_disposed;
        if (firstCall)
        {
            _disposed = true;

            // Only touch Value when the Lazy has already produced the
            // semaphore; reading it otherwise would create one just to
            // dispose it.
            var holder = _concurrencyLimiter;
            if (holder is not null && holder.IsValueCreated)
            {
                holder.Value.Dispose();
            }
        }
    }

    if (firstCall)
    {
        GC.SuppressFinalize(this);
    }
}
/// <summary>
/// NS-008: Validates the sender and recipient email addresses, returning a
/// human-readable error string if any is malformed, or null if all parse.
@@ -300,8 +393,13 @@ public class NotificationDeliveryService : INotificationDeliveryService
try
{
// NS-005/NS-007: explicit TLS mode and the configured connection timeout.
// NS-017: when the deployed SmtpConfiguration row leaves the timeout
// unset (non-positive), fall back to the NotificationOptions value.
var timeoutSeconds = config.ConnectionTimeoutSeconds > 0
? config.ConnectionTimeoutSeconds
: _options.ConnectionTimeoutSeconds;
await smtp.ConnectAsync(
config.Host, config.Port, tlsMode, config.ConnectionTimeoutSeconds, cancellationToken);
config.Host, config.Port, tlsMode, timeoutSeconds, cancellationToken);
// Resolve credentials (OAuth2 token fetched/cached by the token service).
var credentials = config.Credentials;