fix(notifications): close OAuth2 SMTP + dispatcher resilience gaps (5 findings)

NS-021/NO-001: thread FromAddress into XOAUTH2 so M365 stops rejecting
sends with 535 5.7.3. Added an additive oauth2UserName parameter on
ISmtpClientWrapper.AuthenticateAsync; both NotificationService and
NotificationOutbox now pass config.FromAddress.

NO-002: clamp non-positive SmtpConfiguration.MaxRetries/RetryDelay to the
10-attempt / 1-min fallback with a Warning so a misconfigured row no
longer parks transient failures on the first attempt or burn-loops.

NO-003: route a lifecycle-scoped CancellationToken from the
NotificationOutboxActor through the dispatch sweep into the adapter so
in-flight SMTP sends abort on PostStop instead of blocking
CoordinatedShutdown for the full SMTP timeout per row.

NO-004: await the central audit writer inside the existing try/catch
instead of fire-and-forget so the audit task can't outlive the per-sweep
DI scope and writer faults reach the operator log instead of being
silently dropped.

Two AuditLog integration tests seeded RetryDelay = TimeSpan.Zero to force
immediate re-claim on the second tick; updated them to 1 ms so they keep
the same intent without tripping the NO-002 clamp.
This commit is contained in:
Joseph Doherty
2026-05-28 03:54:43 -04:00
parent e536178323
commit 291274ae76
13 changed files with 370 additions and 61 deletions
@@ -365,11 +365,13 @@ public class AuditWriteFailureSafetyTests : TestKit, IClassFixture<MsSqlMigratio
private async Task SeedSmtpConfigAsync()
{
await using var ctx = CreateContext();
// NO-002: dispatcher clamps non-positive RetryDelay to the 1-minute fallback;
// use 1 ms so a transient outcome's NextAttemptAt is still effectively due.
ctx.SmtpConfigurations.Add(new SmtpConfiguration(
"smtp.example.com", "Basic", "noreply@example.com")
{
MaxRetries = 5,
RetryDelay = TimeSpan.Zero,
RetryDelay = TimeSpan.FromMilliseconds(1),
});
await ctx.SaveChangesAsync();
}
@@ -130,9 +130,12 @@ public class NotifyDispatcherAuditTrailTests : TestKit, IClassFixture<MsSqlMigra
/// <summary>
/// Inserts a single SMTP configuration row so the dispatcher's
/// <c>ResolveRetryPolicyAsync</c> sees a real (maxRetries, retryDelay)
/// pair rather than the conservative fallback. RetryDelay of 0 means a
/// transient outcome's <c>NextAttemptAt</c> is immediately due — useful so
/// the SECOND DispatchTick re-claims the row without waiting.
/// pair rather than the conservative fallback. A tiny positive RetryDelay
/// means a transient outcome's <c>NextAttemptAt</c> is immediately due —
/// useful so the SECOND DispatchTick re-claims the row without waiting.
/// NO-002: the dispatcher now clamps a non-positive RetryDelay to the
/// 1-minute fallback to avoid burn-looping on transient failures, so this
/// must be a strictly positive value (1 ms is fine for tests).
/// </summary>
private async Task SeedSmtpConfigAsync(int maxRetries = 5)
{
@@ -141,7 +144,7 @@ public class NotifyDispatcherAuditTrailTests : TestKit, IClassFixture<MsSqlMigra
"smtp.example.com", "Basic", "noreply@example.com")
{
MaxRetries = maxRetries,
RetryDelay = TimeSpan.Zero,
RetryDelay = TimeSpan.FromMilliseconds(1),
});
await ctx.SaveChangesAsync();
}
@@ -270,6 +270,131 @@ public class NotificationOutboxActorDispatchTests : TestKit
Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>()));
}
[Fact]
public void TransientFailure_WithZeroMaxRetries_RetriesUsingFallback_DoesNotParkImmediately()
{
    // NO-002 regression guard. A configured MaxRetries of 0 previously made the
    // "1 >= 0" exhaustion check true on the first transient failure, so the row
    // was parked without ever being retried. ResolveRetryPolicyAsync now clamps a
    // non-positive MaxRetries to FallbackMaxRetries (10), so the first transient
    // failure must leave the row in Retrying instead.
    SetupSmtpRetryPolicy(maxRetries: 0, retryDelay: TimeSpan.FromMinutes(1));

    var pending = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { pending });

    // Every delivery attempt reports a transient SMTP failure.
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout"));
    var outbox = CreateActor([failingAdapter]);

    outbox.Tell(InternalMessages.DispatchTick.Instance);

    AwaitAssert(() =>
    {
        // Exactly one update: scheduled for retry, attempt counter bumped, error recorded.
        _outboxRepository.Received(1).UpdateAsync(
            Arg.Is<Notification>(updated =>
                updated.Status == NotificationStatus.Retrying &&
                updated.RetryCount == 1 &&
                updated.NextAttemptAt != null &&
                updated.LastError == "smtp timeout"),
            Arg.Any<CancellationToken>());
    });
}
[Fact]
public void TransientFailure_WithNegativeMaxRetries_RetriesUsingFallback_DoesNotParkImmediately()
{
    // NO-002: a negative MaxRetries (here -1) hits the same park-immediately bug
    // as zero. ResolveRetryPolicyAsync clamps it to FallbackMaxRetries, so the
    // first transient failure must schedule a retry rather than park the row.
    SetupSmtpRetryPolicy(maxRetries: -1, retryDelay: TimeSpan.FromMinutes(1));

    var pending = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { pending });

    // Every delivery attempt reports a transient SMTP failure.
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout"));
    var outbox = CreateActor([failingAdapter]);

    outbox.Tell(InternalMessages.DispatchTick.Instance);

    AwaitAssert(() =>
    {
        // Exactly one update: scheduled for retry, attempt counter bumped, error recorded.
        _outboxRepository.Received(1).UpdateAsync(
            Arg.Is<Notification>(updated =>
                updated.Status == NotificationStatus.Retrying &&
                updated.RetryCount == 1 &&
                updated.NextAttemptAt != null &&
                updated.LastError == "smtp timeout"),
            Arg.Any<CancellationToken>());
    });
}
[Theory]
[InlineData(0)]    // boundary: NextAttemptAt would equal "now"
[InlineData(-30)]  // negative: NextAttemptAt would land in the past
public void TransientFailure_WithNonPositiveRetryDelay_UsesFallbackDelay_NotZero(int configuredDelaySeconds)
{
    // NO-002: a non-positive RetryDelay would burn-loop the dispatcher because
    // NextAttemptAt would be at or before "now", making the row due again on the
    // very next tick. The dispatcher clamps such values to FallbackRetryDelay
    // (1 min) so the schedule actually advances. Both the zero boundary and a
    // strictly negative delay are exercised so the "non-positive" contract in the
    // test name is covered in full.
    SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromSeconds(configuredDelaySeconds));

    // Captured before the tick so the lower bound below is conservative.
    var before = DateTimeOffset.UtcNow;

    var notification = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { notification });

    var adapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout"));
    var actor = CreateActor([adapter]);

    actor.Tell(InternalMessages.DispatchTick.Instance);

    AwaitAssert(() =>
    {
        // 30 s is half the 1-minute fallback: far above any unclamped
        // non-positive delay, with slack so scheduling jitter cannot flake the test.
        _outboxRepository.Received(1).UpdateAsync(
            Arg.Is<Notification>(n =>
                n.Status == NotificationStatus.Retrying &&
                n.NextAttemptAt != null &&
                n.NextAttemptAt > before + TimeSpan.FromSeconds(30)),
            Arg.Any<CancellationToken>());
    });
}
[Fact]
public void PostStop_CancelsInFlightDelivery_LeavesRowNonTerminal()
{
// NO-003: the dispatcher used to drop the CancellationToken on its way into
// the channel adapter, so a coordinated shutdown had to wait the full SMTP
// connect/auth/send timeout per in-flight notification before the sweep
// finished. The actor now passes a lifecycle-scoped token; cancelling it on
// PostStop must abort the in-flight Task.Delay (standing in for an SMTP
// send) and the row must NOT be updated to a terminal state — the next
// active node picks it back up.
SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1));
var notification = MakeNotification();
_outboxRepository.GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
.Returns(new[] { notification });
// The long delay simulates a slow SMTP send. The test stops the actor well
// before the 30 s would naturally elapse, so the delivery can only end early
// if the lifecycle token was wired through to the adapter.
var adapter = new StubAdapter(
() => DeliveryOutcome.Success("ops@example.com"),
delay: TimeSpan.FromSeconds(30));
var actor = CreateActor([adapter]);
actor.Tell(InternalMessages.DispatchTick.Instance);
// Wait until the adapter is actually in flight before stopping.
AwaitAssert(() => Assert.Equal(1, adapter.CallCount));
// Timestamp taken immediately before the stop; used for the promptness check below.
var start = DateTimeOffset.UtcNow;
Sys.Stop(actor);
// The sweep should observe cancellation promptly (well under the 30s delay).
// NOTE(review): AwaitAssert returns as soon as its assertion first passes, and
// DidNotReceive already holds while the adapter is still inside its delay. This
// block and the elapsed-time check below would therefore presumably also pass
// if the token were NOT wired through. Consider asserting that the adapter
// observed cancellation, or waiting for the sweep to finish before checking —
// TODO confirm against StubAdapter and the actor's shutdown path.
AwaitAssert(
() =>
{
// No UpdateAsync was issued — the row is untouched and will be re-claimed
// by the next active node.
_outboxRepository.DidNotReceive().UpdateAsync(
Arg.Any<Notification>(), Arg.Any<CancellationToken>());
},
duration: TimeSpan.FromSeconds(5));
// Promptness guard: the time from Sys.Stop to this point must stay under 5 s.
Assert.True(DateTimeOffset.UtcNow - start < TimeSpan.FromSeconds(5),
"PostStop did not cancel the in-flight delivery promptly.");
}
[Fact]
public void OverlappingTicks_WhileDispatchInFlight_DoNotClaimConcurrently()
{
@@ -1,3 +1,6 @@
using System.Text;
using MailKit.Security;
namespace ScadaLink.NotificationService.Tests;
/// <summary>
@@ -5,6 +8,8 @@ namespace ScadaLink.NotificationService.Tests;
/// silently skip authentication for a misconfigured SMTP config — a missing
/// credential, an unrecognised auth type, or an unparseable Basic credential
/// must be a hard, surfaced error rather than an unauthenticated send.
/// NS-021: the OAuth2 (XOAUTH2) branch must carry a non-empty user identity
/// (the SMTP From address) — an empty user is rejected by M365 with `535 5.7.3`.
/// </summary>
public class MailKitSmtpClientWrapperTests
{
@@ -42,4 +47,35 @@ public class MailKitSmtpClientWrapperTests
await Assert.ThrowsAsync<SmtpPermanentException>(
() => wrapper.AuthenticateAsync("basic", "nocolon"));
}
[Fact]
public async Task Authenticate_OAuth2WithoutUserName_Throws()
{
    // NS-021: an OAuth2 access token with no user identity (FromAddress) used to
    // end up as `new SaslMechanismOAuth2("", credentials)`, which M365 rejects
    // with `535 5.7.3`. The wrapper now fails fast so the caller gets a clear
    // configuration error instead of a confusing server-side rejection.
    var wrapper = new MailKitSmtpClientWrapper();

    // Both "not supplied" and "supplied but empty" must be refused, in that order.
    foreach (var missingUser in new string?[] { null, "" })
    {
        await Assert.ThrowsAsync<SmtpPermanentException>(
            () => wrapper.AuthenticateAsync("oauth2", "access-token", oauth2UserName: missingUser));
    }
}
[Fact]
public void XOAuth2InitialResponse_CarriesUserAndBearer()
{
    // NS-021 regression guard, independent of the wrapper: MailKit's
    // SaslMechanismOAuth2 must place `user=<userName>` in the initial-response
    // bytes, which is why passing `FromAddress` from the wrapper is enough to fix
    // the M365 handshake. A change in MailKit's framing will fail this test.
    var mechanism = new SaslMechanismOAuth2("noreply@example.com", "tok-xyz");

    // Challenge returns the response base64-encoded; decode it before inspecting.
    var decodedResponse = Encoding.UTF8.GetString(
        Convert.FromBase64String(mechanism.Challenge(string.Empty)));

    Assert.Contains("user=noreply@example.com", decodedResponse);
    Assert.Contains("auth=Bearer tok-xyz", decodedResponse);
}
}
@@ -115,7 +115,8 @@ public class NotificationDeliveryServiceTests
await _smtpClient.Received().ConnectAsync(
"smtp.example.com", 587, SmtpTlsMode.StartTls, Arg.Any<int>(), Arg.Any<CancellationToken>());
await _smtpClient.Received().AuthenticateAsync("basic", "user:pass", Arg.Any<CancellationToken>());
await _smtpClient.Received().AuthenticateAsync(
"basic", "user:pass", Arg.Any<string?>(), Arg.Any<CancellationToken>());
await _smtpClient.Received().SendAsync(
"noreply@example.com",
Arg.Is<IEnumerable<string>>(bcc => bcc.Count() == 2),
@@ -370,7 +371,7 @@ public class NotificationDeliveryServiceTests
public bool Disposed { get; private set; }
public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default)
public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
public Task SendAsync(string from, IEnumerable<string> bccRecipients, string subject, string body, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
@@ -435,7 +436,7 @@ public class NotificationDeliveryServiceTests
public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default)
public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default)
=> _failOnAuthenticate != null ? Task.FromException(_failOnAuthenticate()) : Task.CompletedTask;
public Task SendAsync(string from, IEnumerable<string> bccRecipients, string subject, string body, CancellationToken cancellationToken = default)
@@ -496,7 +497,7 @@ public class NotificationDeliveryServiceTests
ConnectionTimeoutSeconds = connectionTimeoutSeconds;
return Task.CompletedTask;
}
public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default)
public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
public Task SendAsync(string from, IEnumerable<string> bccRecipients, string subject, string body, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
@@ -643,7 +644,7 @@ public class NotificationDeliveryServiceTests
public BlockingSmtpClient(Func<Task> onSend) => _onSend = onSend;
public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default)
public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
public Task SendAsync(string from, IEnumerable<string> bccRecipients, string subject, string body, CancellationToken cancellationToken = default)
=> _onSend();
@@ -721,17 +722,19 @@ public class NotificationDeliveryServiceTests
// ── NotificationService-012: OAuth2 delivery path coverage ──
/// <summary>An SMTP wrapper that records the auth type and credentials it received.</summary>
/// <summary>An SMTP wrapper that records the auth type, credentials, and OAuth2 user identity it received.</summary>
private sealed class RecordingAuthClient : ISmtpClientWrapper
{
public string? AuthType { get; private set; }
public string? Credentials { get; private set; }
public string? OAuth2UserName { get; private set; }
public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default)
=> Task.CompletedTask;
public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default)
public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default)
{
AuthType = authType;
Credentials = credentials;
OAuth2UserName = oauth2UserName;
return Task.CompletedTask;
}
public Task SendAsync(string from, IEnumerable<string> bccRecipients, string subject, string body, CancellationToken cancellationToken = default)
@@ -790,6 +793,9 @@ public class NotificationDeliveryServiceTests
Assert.True(result.Success);
Assert.Equal("oauth2", recording.AuthType);
Assert.Equal("oauth2-access-token-xyz", recording.Credentials);
// NS-021: OAuth2 SASL must carry the FromAddress as the user identity so
// the M365 XOAUTH2 handshake's `user=` field matches the token's mailbox.
Assert.Equal("noreply@example.com", recording.OAuth2UserName);
}
// ── NotificationService-015: unclassified exceptions must not escape SendAsync ──