diff --git a/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs b/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs index 58f99f1..5a2b969 100644 --- a/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs +++ b/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs @@ -139,52 +139,72 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers _dispatching = true; var now = DateTimeOffset.UtcNow; - // RunDispatchPass swallows its own errors; the completion message is sent for both - // success and failure so the guard is always cleared. - RunDispatchPass(now).PipeTo(Self, success: () => InternalMessages.DispatchComplete.Instance); + // RunDispatchPass swallows its own errors, but the failure projection is kept as a + // belt-and-braces guard so even a faulted task still lowers the in-flight guard — + // otherwise the dispatcher would wedge permanently. + RunDispatchPass(now).PipeTo( + Self, + success: () => InternalMessages.DispatchComplete.Instance, + failure: ex => + { + _logger.LogError(ex, "Dispatch sweep faulted unexpectedly."); + return InternalMessages.DispatchComplete.Instance; + }); } /// /// Runs a single dispatch sweep: claims the due batch, resolves the retry policy, and /// delivers each notification sequentially. Per-notification failures are caught and - /// logged so one bad row never aborts the rest of the batch. + /// logged so one bad row never aborts the rest of the batch. The whole body is wrapped + /// in a try/catch so the returned task never faults — scope creation, service resolution, + /// and retry-policy resolution can all throw, and a faulted task would otherwise leave + /// the dispatcher's in-flight guard stuck and wedge the loop permanently. /// private async Task RunDispatchPass(DateTimeOffset now) { - using var scope = _serviceProvider.CreateScope(); - var outboxRepository = scope.ServiceProvider.GetRequiredService(); - var notificationRepository = scope.ServiceProvider.GetRequiredService(); - - IReadOnlyList due; try { - due = await outboxRepository.GetDueAsync(now, _options.DispatchBatchSize); - } - catch (Exception ex) - { - _logger.LogError(ex, "Dispatch sweep failed to claim due notifications."); - return; - } + using var scope = _serviceProvider.CreateScope(); + var outboxRepository = scope.ServiceProvider.GetRequiredService(); + var notificationRepository = scope.ServiceProvider.GetRequiredService(); - if (due.Count == 0) - { - return; - } - - var (maxRetries, retryDelay) = await ResolveRetryPolicyAsync(notificationRepository); - - foreach (var notification in due) - { + IReadOnlyList due; try { - await DeliverOneAsync(notification, now, maxRetries, retryDelay, outboxRepository); + due = await outboxRepository.GetDueAsync(now, _options.DispatchBatchSize); } catch (Exception ex) { - // Isolate per-notification failures so the remainder of the batch still runs. - _logger.LogError( - ex, "Dispatch failed for notification {NotificationId}.", notification.NotificationId); + _logger.LogError(ex, "Dispatch sweep failed to claim due notifications."); + return; } + + if (due.Count == 0) + { + return; + } + + var (maxRetries, retryDelay) = await ResolveRetryPolicyAsync(notificationRepository); + + foreach (var notification in due) + { + try + { + await DeliverOneAsync(notification, now, maxRetries, retryDelay, outboxRepository); + } + catch (Exception ex) + { + // Isolate per-notification failures so the remainder of the batch still runs. + _logger.LogError( + ex, "Dispatch failed for notification {NotificationId}.", notification.NotificationId); + } + } + } + catch (Exception ex) + { + // Scope/service resolution or retry-policy resolution faulted; swallow and log so + // the returned task completes normally and the in-flight guard is always cleared. + _logger.LogError(ex, "Dispatch sweep failed unexpectedly."); } } diff --git a/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs b/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs index dedfc79..0c73dd5 100644 --- a/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs +++ b/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs @@ -254,6 +254,28 @@ public class NotificationOutboxActorDispatchTests : TestKit }); } + [Fact] + public void FaultedDispatchPass_ClearsInFlightGuard_SoNextTickStillRuns() + { + SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1)); + // GetDueAsync throws on every call: the dispatch pass's task could fault if the + // failure were not handled, which would leave _dispatching stuck true forever. + _outboxRepository.GetDueAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns>(_ => throw new InvalidOperationException("db down")); + var actor = CreateActor(new Dictionary()); + + // First tick: the pass faults internally but must still clear the in-flight guard. + actor.Tell(InternalMessages.DispatchTick.Instance); + AwaitAssert(() => _outboxRepository.Received(1).GetDueAsync( + Arg.Any(), Arg.Any(), Arg.Any())); + + // Second tick after the first completes: if the guard had wedged, this would be + // dropped and GetDueAsync would still show only one call. + actor.Tell(InternalMessages.DispatchTick.Instance); + AwaitAssert(() => _outboxRepository.Received(2).GetDueAsync( + Arg.Any(), Arg.Any(), Arg.Any())); + } + [Fact] public void OverlappingTicks_WhileDispatchInFlight_DoNotClaimConcurrently() {