fix(notifications): close OAuth2 SMTP + dispatcher resilience gaps (5 findings)

NS-021/NO-001: thread FromAddress into XOAUTH2 so M365 stops rejecting
sends with 535 5.7.3. Added an additive oauth2UserName parameter on
ISmtpClientWrapper.AuthenticateAsync; both NotificationService and
NotificationOutbox now pass config.FromAddress.

NO-002: clamp non-positive SmtpConfiguration.MaxRetries/RetryDelay to the
10-attempt / 1-min fallback and log a Warning, so a misconfigured row no
longer parks transient failures on the first attempt or burn-loops.

NO-003: route a lifecycle-scoped CancellationToken from the
NotificationOutboxActor through the dispatch sweep into the adapter so
in-flight SMTP sends abort on PostStop instead of blocking
CoordinatedShutdown for the full SMTP timeout per row.

NO-004: await the central audit writer inside the existing try/catch
instead of fire-and-forget so the audit task can't outlive the per-sweep
DI scope and writer faults reach the operator log instead of being
silently dropped.

Two AuditLog integration tests seeded RetryDelay = TimeSpan.Zero to force
immediate re-claim on the second tick; updated them to 1 ms so they keep
the same intent without tripping the NO-002 clamp.
This commit is contained in:
Joseph Doherty
2026-05-28 03:54:43 -04:00
parent e536178323
commit 291274ae76
13 changed files with 370 additions and 61 deletions
@@ -270,6 +270,131 @@ public class NotificationOutboxActorDispatchTests : TestKit
Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>()));
}
[Fact]
public void TransientFailure_WithZeroMaxRetries_RetriesUsingFallback_DoesNotParkImmediately()
{
    // NO-002 regression. With SmtpConfiguration.MaxRetries = 0 the very first transient
    // failure used to satisfy "1 >= 0" and park the row without a single retry.
    // ResolveRetryPolicyAsync now substitutes FallbackMaxRetries (10) for a non-positive
    // value, so the row is expected to move to Retrying rather than being parked.
    const string transientError = "smtp timeout";

    // Arrange: zero retries configured, one fresh row due, an adapter that fails transiently.
    SetupSmtpRetryPolicy(maxRetries: 0, retryDelay: TimeSpan.FromMinutes(1));
    var dueRow = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { dueRow });
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient(transientError));
    var outboxActor = CreateActor([failingAdapter]);

    // Act: a single dispatch sweep.
    outboxActor.Tell(InternalMessages.DispatchTick.Instance);

    // Assert: exactly one update, and it schedules a retry carrying the adapter's error.
    AwaitAssert(VerifyRetryScheduled);

    void VerifyRetryScheduled()
    {
        // Argument matchers must stay inline in the call for NSubstitute to pick them up.
        _outboxRepository.Received(1).UpdateAsync(
            Arg.Is<Notification>(n =>
                n.Status == NotificationStatus.Retrying &&
                n.RetryCount == 1 &&
                n.NextAttemptAt != null &&
                n.LastError == transientError),
            Arg.Any<CancellationToken>());
    }
}
[Fact]
public void TransientFailure_WithNegativeMaxRetries_RetriesUsingFallback_DoesNotParkImmediately()
{
    // NO-002 regression. A negative MaxRetries (here -1) triggers the same
    // park-immediately bug as zero; it is expected to be clamped to FallbackMaxRetries
    // so the first transient failure schedules a retry.
    const string transientError = "smtp timeout";

    // Arrange: negative retries configured, one fresh row due, an adapter that fails transiently.
    SetupSmtpRetryPolicy(maxRetries: -1, retryDelay: TimeSpan.FromMinutes(1));
    var dueRow = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { dueRow });
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient(transientError));
    var outboxActor = CreateActor([failingAdapter]);

    // Act: a single dispatch sweep.
    outboxActor.Tell(InternalMessages.DispatchTick.Instance);

    // Assert: exactly one update, and it schedules a retry carrying the adapter's error.
    AwaitAssert(VerifyRetryScheduled);

    void VerifyRetryScheduled()
    {
        // Argument matchers must stay inline in the call for NSubstitute to pick them up.
        _outboxRepository.Received(1).UpdateAsync(
            Arg.Is<Notification>(n =>
                n.Status == NotificationStatus.Retrying &&
                n.RetryCount == 1 &&
                n.NextAttemptAt != null &&
                n.LastError == transientError),
            Arg.Any<CancellationToken>());
    }
}
[Fact]
public void TransientFailure_WithNonPositiveRetryDelay_UsesFallbackDelay_NotZero()
{
    // NO-002 regression. A non-positive RetryDelay would make NextAttemptAt equal to
    // "now" and burn-loop the dispatcher. It is expected to be clamped to
    // FallbackRetryDelay (1 min) so the schedule actually moves forward.
    SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.Zero);

    // Anything later than 30 s after this instant shows the zero delay was not used,
    // while leaving plenty of slack below the 1-minute fallback.
    var before = DateTimeOffset.UtcNow;
    var earliestAcceptable = before + TimeSpan.FromSeconds(30);

    var dueRow = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { dueRow });
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout"));
    var outboxActor = CreateActor([failingAdapter]);

    // Act: a single dispatch sweep.
    outboxActor.Tell(InternalMessages.DispatchTick.Instance);

    // Assert: the retry was scheduled well into the future.
    AwaitAssert(VerifyRetryPushedOut);

    void VerifyRetryPushedOut()
    {
        // Argument matchers must stay inline in the call for NSubstitute to pick them up.
        _outboxRepository.Received(1).UpdateAsync(
            Arg.Is<Notification>(n =>
                n.Status == NotificationStatus.Retrying &&
                n.NextAttemptAt != null &&
                n.NextAttemptAt > earliestAcceptable),
            Arg.Any<CancellationToken>());
    }
}
[Fact]
public void PostStop_CancelsInFlightDelivery_LeavesRowNonTerminal()
{
    // NO-003: the dispatcher used to drop the CancellationToken on its way into
    // the channel adapter, so a coordinated shutdown had to wait the full SMTP
    // connect/auth/send timeout per in-flight notification before the sweep
    // finished. The actor now passes a lifecycle-scoped token; cancelling it on
    // PostStop must abort the in-flight delay (standing in for an SMTP send) and
    // the row must NOT be updated to a terminal state — the next active node
    // picks it back up.

    // Single budget for both the polling window and the elapsed-time check so the
    // two limits cannot drift apart.
    var shutdownBudget = TimeSpan.FromSeconds(5);

    SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1));
    var notification = MakeNotification();
    _outboxRepository.GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { notification });

    // Long delay simulates a slow SMTP send. The test stops the actor long before
    // the delay would elapse on its own, so the send can only end early if the
    // lifecycle token is wired through to the adapter.
    var adapter = new StubAdapter(
        () => DeliveryOutcome.Success("ops@example.com"),
        delay: TimeSpan.FromSeconds(30));
    var actor = CreateActor([adapter]);
    actor.Tell(InternalMessages.DispatchTick.Instance);

    // Wait until the adapter is actually in flight before stopping.
    AwaitAssert(() => Assert.Equal(1, adapter.CallCount));

    // Stopwatch is monotonic; subtracting two wall-clock readings can be skewed by
    // a system clock adjustment during the test run.
    var sinceStop = System.Diagnostics.Stopwatch.StartNew();
    Sys.Stop(actor);

    // The sweep should observe cancellation promptly (well under the 30s delay).
    // NOTE(review): AwaitAssert appears to return on the first passing poll, and
    // "no UpdateAsync" presumably already holds at the moment of Sys.Stop, so this
    // check may not be able to tell a wired token from an unwired one inside the
    // budget. Consider having the stub record that it observed cancellation — TODO confirm.
    AwaitAssert(
        () =>
        {
            // No UpdateAsync was issued — the row is untouched and will be re-claimed
            // by the next active node.
            _outboxRepository.DidNotReceive().UpdateAsync(
                Arg.Any<Notification>(), Arg.Any<CancellationToken>());
        },
        duration: shutdownBudget);

    Assert.True(sinceStop.Elapsed < shutdownBudget,
        "PostStop did not cancel the in-flight delivery promptly.");
}
[Fact]
public void OverlappingTicks_WhileDispatchInFlight_DoNotClaimConcurrently()
{