fix(notifications): close OAuth2 SMTP + dispatcher resilience gaps (5 findings)
NS-021/NO-001: thread FromAddress into XOAUTH2 so M365 stops rejecting sends with 535 5.7.3. Added an additive oauth2UserName parameter on ISmtpClientWrapper.AuthenticateAsync; both NotificationService and NotificationOutbox now pass config.FromAddress. NO-002: clamp non-positive SmtpConfiguration.MaxRetries/RetryDelay to the 10-attempt / 1-min fallback, logging a Warning, so a misconfigured row no longer parks transient failures on the first attempt or burn-loops the dispatcher. NO-003: route a lifecycle-scoped CancellationToken from the NotificationOutboxActor through the dispatch sweep into the adapter so in-flight SMTP sends abort on PostStop instead of blocking CoordinatedShutdown for the full SMTP timeout per row. NO-004: await the central audit writer inside the existing try/catch instead of fire-and-forget, so the audit task can't outlive the per-sweep DI scope and writer faults reach the operator log instead of being silently dropped. Two AuditLog integration tests seeded RetryDelay = TimeSpan.Zero to force immediate re-claim on the second tick; they now use 1 ms so they keep the same intent without tripping the NO-002 clamp.
This commit is contained in:
@@ -270,6 +270,131 @@ public class NotificationOutboxActorDispatchTests : TestKit
|
||||
Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>()));
|
||||
}
|
||||
|
||||
[Fact]
public void TransientFailure_WithZeroMaxRetries_RetriesUsingFallback_DoesNotParkImmediately()
{
    // NO-002 regression guard. With SmtpConfiguration.MaxRetries = 0 the check
    // "1 >= 0" used to be true on the very first transient failure, so the row
    // was parked without ever being retried. ResolveRetryPolicyAsync is expected
    // to clamp a non-positive MaxRetries to FallbackMaxRetries (10), which means
    // the first transient failure must leave the row in Retrying.
    var configuredDelay = TimeSpan.FromMinutes(1);
    SetupSmtpRetryPolicy(maxRetries: 0, retryDelay: configuredDelay);

    var pending = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { pending });

    // Every delivery attempt reports a transient SMTP failure.
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout"));
    var outboxActor = CreateActor([failingAdapter]);

    outboxActor.Tell(InternalMessages.DispatchTick.Instance);

    // Exactly one update: the row is rescheduled (Retrying) rather than parked.
    AwaitAssert(() => _outboxRepository.Received(1).UpdateAsync(
        Arg.Is<Notification>(n =>
            n.Status == NotificationStatus.Retrying &&
            n.RetryCount == 1 &&
            n.NextAttemptAt != null &&
            n.LastError == "smtp timeout"),
        Arg.Any<CancellationToken>()));
}
|
||||
|
||||
[Fact]
public void TransientFailure_WithNegativeMaxRetries_RetriesUsingFallback_DoesNotParkImmediately()
{
    // NO-002 regression guard, negative variant. A MaxRetries of -1 arrives at
    // ResolveRetryPolicyAsync unchanged and would trigger the same
    // park-on-first-failure bug as zero; it must be clamped to FallbackMaxRetries.
    var configuredDelay = TimeSpan.FromMinutes(1);
    SetupSmtpRetryPolicy(maxRetries: -1, retryDelay: configuredDelay);

    var pending = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { pending });

    // Every delivery attempt reports a transient SMTP failure.
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout"));
    var outboxActor = CreateActor([failingAdapter]);

    outboxActor.Tell(InternalMessages.DispatchTick.Instance);

    // Exactly one update: the row is rescheduled (Retrying) rather than parked.
    AwaitAssert(() => _outboxRepository.Received(1).UpdateAsync(
        Arg.Is<Notification>(n =>
            n.Status == NotificationStatus.Retrying &&
            n.RetryCount == 1 &&
            n.NextAttemptAt != null &&
            n.LastError == "smtp timeout"),
        Arg.Any<CancellationToken>()));
}
|
||||
|
||||
[Fact]
public void TransientFailure_WithNonPositiveRetryDelay_UsesFallbackDelay_NotZero()
{
    // NO-002 regression guard for the delay half of the policy. A non-positive
    // RetryDelay would make NextAttemptAt equal to "now", so the dispatcher would
    // burn-loop on the same row. The delay is expected to be clamped to
    // FallbackRetryDelay (1 min) so the schedule really moves forward.
    SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.Zero);

    // Captured before the tick; anything scheduled later than this plus 30 s
    // cannot have come from a zero delay.
    var before = DateTimeOffset.UtcNow;
    var earliestAcceptable = before + TimeSpan.FromSeconds(30);

    var pending = MakeNotification(retryCount: 0);
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { pending });

    // Every delivery attempt reports a transient SMTP failure.
    var failingAdapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout"));
    var outboxActor = CreateActor([failingAdapter]);

    outboxActor.Tell(InternalMessages.DispatchTick.Instance);

    // The single update must push the next attempt well into the future.
    AwaitAssert(() => _outboxRepository.Received(1).UpdateAsync(
        Arg.Is<Notification>(n =>
            n.Status == NotificationStatus.Retrying &&
            n.NextAttemptAt != null &&
            n.NextAttemptAt > earliestAcceptable),
        Arg.Any<CancellationToken>()));
}
|
||||
|
||||
[Fact]
public void PostStop_CancelsInFlightDelivery_LeavesRowNonTerminal()
{
    // NO-003 regression guard. The dispatcher previously lost the
    // CancellationToken before it reached the channel adapter, so coordinated
    // shutdown had to sit through the full SMTP connect/auth/send timeout for
    // every in-flight notification. The actor is now expected to hand the
    // adapter a lifecycle-scoped token: cancelling it in PostStop should abort
    // the in-flight Task.Delay (our stand-in for an SMTP send) and the row must
    // NOT be moved to a terminal state, so the next active node can pick it up.
    SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1));

    var pending = MakeNotification();
    _outboxRepository
        .GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
        .Returns(new[] { pending });

    // The 30 s delay models a slow SMTP send. The actor is stopped long before
    // it would elapse on its own, so the delivery can only finish early if the
    // token was wired through.
    var slowSend = TimeSpan.FromSeconds(30);
    var slowAdapter = new StubAdapter(
        () => DeliveryOutcome.Success("ops@example.com"),
        delay: slowSend);
    var outboxActor = CreateActor([slowAdapter]);

    outboxActor.Tell(InternalMessages.DispatchTick.Instance);

    // Do not stop the actor until the adapter call is actually in flight.
    AwaitAssert(() => Assert.Equal(1, slowAdapter.CallCount));

    var promptnessBudget = TimeSpan.FromSeconds(5);
    var stopRequestedAt = DateTimeOffset.UtcNow;
    Sys.Stop(outboxActor);

    // The sweep should see the cancellation quickly (far sooner than 30 s) and
    // must not issue any UpdateAsync: the row stays untouched for the next
    // active node to re-claim.
    // NOTE(review): AwaitAssert returns as soon as its assertion first passes,
    // so this DidNotReceive check can succeed immediately and the elapsed-time
    // assertion below may then be trivially true — confirm this test can
    // actually fail when the token is not wired through.
    AwaitAssert(
        () => _outboxRepository.DidNotReceive().UpdateAsync(
            Arg.Any<Notification>(), Arg.Any<CancellationToken>()),
        duration: promptnessBudget);

    var elapsedSinceStop = DateTimeOffset.UtcNow - stopRequestedAt;
    Assert.True(elapsedSinceStop < promptnessBudget,
        "PostStop did not cancel the in-flight delivery promptly.");
}
|
||||
|
||||
[Fact]
|
||||
public void OverlappingTicks_WhileDispatchInFlight_DoNotClaimConcurrently()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user