diff --git a/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs b/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs
index 58f99f1..5a2b969 100644
--- a/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs
+++ b/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs
@@ -139,52 +139,72 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
_dispatching = true;
var now = DateTimeOffset.UtcNow;
- // RunDispatchPass swallows its own errors; the completion message is sent for both
- // success and failure so the guard is always cleared.
- RunDispatchPass(now).PipeTo(Self, success: () => InternalMessages.DispatchComplete.Instance);
+ // RunDispatchPass swallows its own errors, but the failure projection is kept as a
+ // belt-and-braces guard so even a faulted task still lowers the in-flight guard —
+ // otherwise the dispatcher would wedge permanently.
+ RunDispatchPass(now).PipeTo(
+ Self,
+ success: () => InternalMessages.DispatchComplete.Instance,
+ failure: ex =>
+ {
+ _logger.LogError(ex, "Dispatch sweep faulted unexpectedly.");
+ return InternalMessages.DispatchComplete.Instance;
+ });
}
///
/// Runs a single dispatch sweep: claims the due batch, resolves the retry policy, and
/// delivers each notification sequentially. Per-notification failures are caught and
- /// logged so one bad row never aborts the rest of the batch.
+ /// logged so one bad row never aborts the rest of the batch. The whole body is wrapped
+ /// in a try/catch so the returned task never faults — scope creation, service resolution,
+ /// and retry-policy resolution can all throw, and a faulted task would otherwise leave
+ /// the dispatcher's in-flight guard stuck and wedge the loop permanently.
///
private async Task RunDispatchPass(DateTimeOffset now)
{
- using var scope = _serviceProvider.CreateScope();
- var outboxRepository = scope.ServiceProvider.GetRequiredService();
- var notificationRepository = scope.ServiceProvider.GetRequiredService();
-
- IReadOnlyList due;
try
{
- due = await outboxRepository.GetDueAsync(now, _options.DispatchBatchSize);
- }
- catch (Exception ex)
- {
- _logger.LogError(ex, "Dispatch sweep failed to claim due notifications.");
- return;
- }
+ using var scope = _serviceProvider.CreateScope();
+ var outboxRepository = scope.ServiceProvider.GetRequiredService();
+ var notificationRepository = scope.ServiceProvider.GetRequiredService();
- if (due.Count == 0)
- {
- return;
- }
-
- var (maxRetries, retryDelay) = await ResolveRetryPolicyAsync(notificationRepository);
-
- foreach (var notification in due)
- {
+ IReadOnlyList due;
try
{
- await DeliverOneAsync(notification, now, maxRetries, retryDelay, outboxRepository);
+ due = await outboxRepository.GetDueAsync(now, _options.DispatchBatchSize);
}
catch (Exception ex)
{
- // Isolate per-notification failures so the remainder of the batch still runs.
- _logger.LogError(
- ex, "Dispatch failed for notification {NotificationId}.", notification.NotificationId);
+ _logger.LogError(ex, "Dispatch sweep failed to claim due notifications.");
+ return;
}
+
+ if (due.Count == 0)
+ {
+ return;
+ }
+
+ var (maxRetries, retryDelay) = await ResolveRetryPolicyAsync(notificationRepository);
+
+ foreach (var notification in due)
+ {
+ try
+ {
+ await DeliverOneAsync(notification, now, maxRetries, retryDelay, outboxRepository);
+ }
+ catch (Exception ex)
+ {
+ // Isolate per-notification failures so the remainder of the batch still runs.
+ _logger.LogError(
+ ex, "Dispatch failed for notification {NotificationId}.", notification.NotificationId);
+ }
+ }
+ }
+ catch (Exception ex)
+ {
+ // Scope/service resolution or retry-policy resolution faulted; swallow and log so
+ // the returned task completes normally and the in-flight guard is always cleared.
+ _logger.LogError(ex, "Dispatch sweep failed unexpectedly.");
}
}
diff --git a/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs b/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs
index dedfc79..0c73dd5 100644
--- a/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs
+++ b/tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs
@@ -254,6 +254,28 @@ public class NotificationOutboxActorDispatchTests : TestKit
});
}
+ [Fact]
+ public void FaultedDispatchPass_ClearsInFlightGuard_SoNextTickStillRuns()
+ {
+ SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1));
+ // GetDueAsync throws on every call: the dispatch pass's task could fault if the
+ // failure were not handled, which would leave _dispatching stuck true forever.
+ _outboxRepository.GetDueAsync(Arg.Any(), Arg.Any(), Arg.Any())
+ .Returns>(_ => throw new InvalidOperationException("db down"));
+ var actor = CreateActor(new Dictionary());
+
+ // First tick: the pass faults internally but must still clear the in-flight guard.
+ actor.Tell(InternalMessages.DispatchTick.Instance);
+ AwaitAssert(() => _outboxRepository.Received(1).GetDueAsync(
+ Arg.Any(), Arg.Any(), Arg.Any()));
+
+ // Second tick after the first completes: if the guard had wedged, this would be
+ // dropped and GetDueAsync would still show only one call.
+ actor.Tell(InternalMessages.DispatchTick.Instance);
+ AwaitAssert(() => _outboxRepository.Received(2).GetDueAsync(
+ Arg.Any(), Arg.Any(), Arg.Any()));
+ }
+
[Fact]
public void OverlappingTicks_WhileDispatchInFlight_DoNotClaimConcurrently()
{