test(sms): de-race NotificationOutbox load-flake under parallel-suite CPU oversubscription (S11)
This commit is contained in:
+85
-23
@@ -108,6 +108,29 @@ public class NotificationOutboxActorDispatchTests : TestKit
|
|||||||
.Returns(new[] { config });
|
.Returns(new[] { config });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Drives a sweep to completion by re-telling <see cref="InternalMessages.DispatchTick"/>
|
||||||
|
/// on every poll iteration until <paramref name="assertion"/> holds. This is the durable
|
||||||
|
/// barrier for chained-sweep tests: the in-flight guard (<c>_dispatching</c>) is only
|
||||||
|
/// lowered when a sweep's <see cref="InternalMessages.DispatchComplete"/> round-trips back
|
||||||
|
/// to the actor — AFTER the side-effect the assertion observes — so a single one-shot tick
|
||||||
|
/// can race the still-raised guard and be silently dropped (the dispatcher's intended
|
||||||
|
/// overlap protection), permanently stalling the count. A dropped tick is never retried, so
|
||||||
|
/// merely widening the wait cannot recover it. Re-telling each poll is idempotent (a tick
|
||||||
|
/// landing while the guard is up is harmlessly dropped) and the assertion's exact-count
|
||||||
|
/// strength is preserved unchanged.
|
||||||
|
/// </summary>
|
||||||
|
private void DrivePollingTick(IActorRef actor, Action assertion)
|
||||||
|
{
|
||||||
|
AwaitAssert(
|
||||||
|
() =>
|
||||||
|
{
|
||||||
|
actor.Tell(InternalMessages.DispatchTick.Instance);
|
||||||
|
assertion();
|
||||||
|
},
|
||||||
|
duration: TimeSpan.FromSeconds(10));
|
||||||
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void DispatchTick_ClaimsDueNotifications_AndInvokesAdapter()
|
public void DispatchTick_ClaimsDueNotifications_AndInvokesAdapter()
|
||||||
{
|
{
|
||||||
@@ -254,20 +277,48 @@ public class NotificationOutboxActorDispatchTests : TestKit
|
|||||||
SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1));
|
SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1));
|
||||||
// GetDueAsync throws on every call: the dispatch pass's task could fault if the
|
// GetDueAsync throws on every call: the dispatch pass's task could fault if the
|
||||||
// failure were not handled, which would leave _dispatching stuck true forever.
|
// failure were not handled, which would leave _dispatching stuck true forever.
|
||||||
|
// Count every claim via a thread-safe counter so the assertions can reason about
|
||||||
|
// the number of sweeps that actually ran without depending on NSubstitute's
|
||||||
|
// exact-count matchers (see the de-race note below).
|
||||||
|
var claimAttempts = 0;
|
||||||
_outboxRepository.GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
|
_outboxRepository.GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
|
||||||
.Returns<IReadOnlyList<Notification>>(_ => throw new InvalidOperationException("db down"));
|
.Returns<IReadOnlyList<Notification>>(_ =>
|
||||||
|
{
|
||||||
|
Interlocked.Increment(ref claimAttempts);
|
||||||
|
throw new InvalidOperationException("db down");
|
||||||
|
});
|
||||||
var actor = CreateActor([]);
|
var actor = CreateActor([]);
|
||||||
|
|
||||||
// First tick: the pass faults internally but must still clear the in-flight guard.
|
// First tick: the pass faults internally but must still clear the in-flight guard.
|
||||||
actor.Tell(InternalMessages.DispatchTick.Instance);
|
actor.Tell(InternalMessages.DispatchTick.Instance);
|
||||||
AwaitAssert(() => _outboxRepository.Received(1).GetDueAsync(
|
AwaitAssert(
|
||||||
Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>()));
|
() => Assert.True(Volatile.Read(ref claimAttempts) >= 1),
|
||||||
|
duration: TimeSpan.FromSeconds(10));
|
||||||
|
|
||||||
// Second tick after the first completes: if the guard had wedged, this would be
|
// The guard must clear after the faulted sweep so a fresh tick runs another sweep.
|
||||||
// dropped and GetDueAsync would still show only one call.
|
//
|
||||||
actor.Tell(InternalMessages.DispatchTick.Instance);
|
// De-race (S11): the in-flight guard (_dispatching) is only lowered when the
|
||||||
AwaitAssert(() => _outboxRepository.Received(2).GetDueAsync(
|
// faulted sweep's DispatchComplete round-trips back to the actor — which happens
|
||||||
Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>()));
|
// AFTER the GetDueAsync side-effect the barrier above observes. So a single second
|
||||||
|
// tick sent right after that barrier can race the guard while it is still up and
|
||||||
|
// be silently dropped (the dispatcher's intended overlap protection), leaving the
|
||||||
|
// claim count stuck at one forever — a dropped tick is never retried, so widening
|
||||||
|
// the timeout cannot recover it. Re-tell the tick on every poll iteration instead:
|
||||||
|
// a tick that lands while the guard is still up is harmlessly dropped, and the
|
||||||
|
// first tick that lands after the guard lowers runs a fresh sweep. The assertion
|
||||||
|
// is "at least two sweeps ran" rather than "exactly two": that is strictly the
|
||||||
|
// wedge invariant under test (a wedged guard would pin the count at one no matter
|
||||||
|
// how many ticks arrive), and an at-least assertion is immune to the surplus ticks
|
||||||
|
// the re-tell driver may land under CPU starvation.
|
||||||
|
AwaitAssert(
|
||||||
|
() =>
|
||||||
|
{
|
||||||
|
actor.Tell(InternalMessages.DispatchTick.Instance);
|
||||||
|
Assert.True(
|
||||||
|
Volatile.Read(ref claimAttempts) >= 2,
|
||||||
|
"the in-flight guard wedged: a second dispatch tick never ran a fresh sweep");
|
||||||
|
},
|
||||||
|
duration: TimeSpan.FromSeconds(10));
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
@@ -410,8 +461,21 @@ public class NotificationOutboxActorDispatchTests : TestKit
|
|||||||
// so we don't mutate the shared _outboxRepository field that other tests in
|
// so we don't mutate the shared _outboxRepository field that other tests in
|
||||||
// this class configure differently.
|
// this class configure differently.
|
||||||
var outboxRepository = Substitute.For<INotificationOutboxRepository>();
|
var outboxRepository = Substitute.For<INotificationOutboxRepository>();
|
||||||
|
// De-race (S11): hand out a fresh due notification for the FIRST THREE claims, then
|
||||||
|
// an empty batch forever. This caps the deliverable work — and therefore the
|
||||||
|
// UpdateAsync count — at exactly three, no matter how many dispatch ticks the
|
||||||
|
// driver below fires. Without this ceiling the exact-count barrier is doubly
|
||||||
|
// brittle: a single one-shot tick that races the in-flight guard is silently
|
||||||
|
// dropped (stalling the count below three), while a tick-per-poll driver lets
|
||||||
|
// queued ticks pile up under CPU starvation and overshoot three. Capping the
|
||||||
|
// claimable batch lets the driver re-tell ticks idempotently — surplus sweeps
|
||||||
|
// claim nothing and never call UpdateAsync — so Received(3) is a stable target
|
||||||
|
// that, once reached, cannot be exceeded.
|
||||||
|
var dueClaims = 0;
|
||||||
outboxRepository.GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
|
outboxRepository.GetDueAsync(Arg.Any<DateTimeOffset>(), Arg.Any<int>(), Arg.Any<CancellationToken>())
|
||||||
.Returns(_ => new[] { MakeNotification() });
|
.Returns(_ => Interlocked.Increment(ref dueClaims) <= 3
|
||||||
|
? new[] { MakeNotification() }
|
||||||
|
: Array.Empty<Notification>());
|
||||||
|
|
||||||
// Counting factory: increments each time the DI container resolves an
|
// Counting factory: increments each time the DI container resolves an
|
||||||
// INotificationDeliveryAdapter. Pre-fix this would have ticked once per
|
// INotificationDeliveryAdapter. Pre-fix this would have ticked once per
|
||||||
@@ -433,22 +497,20 @@ public class NotificationOutboxActorDispatchTests : TestKit
|
|||||||
new NoOpCentralAuditWriter(),
|
new NoOpCentralAuditWriter(),
|
||||||
NullLogger<NotificationOutboxActor>.Instance)));
|
NullLogger<NotificationOutboxActor>.Instance)));
|
||||||
|
|
||||||
// Fire three sweeps end-to-end. Each waits on the previous via the
|
// Drive dispatch sweeps until all three claimable notifications have been
|
||||||
// in-flight guard, so the UpdateAsync count climbs monotonically.
|
// delivered (Received(3).UpdateAsync). DrivePollingTick re-tells the tick on
|
||||||
actor.Tell(InternalMessages.DispatchTick.Instance);
|
// every poll iteration; the GetDueAsync cap above makes that idempotent (a
|
||||||
AwaitAssert(() => outboxRepository.Received(1).UpdateAsync(
|
// surplus sweep claims an empty batch and calls UpdateAsync zero times), so
|
||||||
|
// the count climbs to exactly three and then holds there — Received(3) is a
|
||||||
|
// stable ceiling rather than a fragile one-shot target. This proves multiple
|
||||||
|
// dispatch sweeps ran across the actor's lifetime, which is the precondition
|
||||||
|
// for the resolve-adapters-once assertion that follows.
|
||||||
|
DrivePollingTick(actor, () => outboxRepository.Received(3).UpdateAsync(
|
||||||
Arg.Any<Notification>(), Arg.Any<CancellationToken>()));
|
Arg.Any<Notification>(), Arg.Any<CancellationToken>()));
|
||||||
|
|
||||||
actor.Tell(InternalMessages.DispatchTick.Instance);
|
// The adapter resolution must have happened EXACTLY ONCE despite the multiple
|
||||||
AwaitAssert(() => outboxRepository.Received(2).UpdateAsync(
|
// dispatch sweeps just driven. Pre-fix this would have been once per sweep.
|
||||||
Arg.Any<Notification>(), Arg.Any<CancellationToken>()));
|
// This is the real invariant under test (NotificationOutbox-006).
|
||||||
|
|
||||||
actor.Tell(InternalMessages.DispatchTick.Instance);
|
|
||||||
AwaitAssert(() => outboxRepository.Received(3).UpdateAsync(
|
|
||||||
Arg.Any<Notification>(), Arg.Any<CancellationToken>()));
|
|
||||||
|
|
||||||
// The adapter resolution must have happened EXACTLY ONCE despite three
|
|
||||||
// dispatch sweeps. Pre-fix this would have been 3 (or more).
|
|
||||||
Assert.Equal(1, resolutionCount);
|
Assert.Equal(1, resolutionCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user