fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings
Async cancellation hygiene, fire-and-forget observability, retry/shutdown semantics, and audit-row coverage across 9 modules.

Highlights:

Cancellation & lifecycle:
- AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the captured SyncContext that risked sync-over-async deadlock.
- AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS, threaded through drain paths instead of CancellationToken.None.
- Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls.
- Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM during the bounded-retry window aborts cleanly.

Cursor / retry / counter correctness:
- AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since` when any row's idempotent insert is still being retried (per-EventId retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical abandon). No more silent abandonment of permanently-failing rows.
- ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's SPLIT loop — by class-doc construction the catch could only mask real failures and let the next iteration create permanent partition holes.
- HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot per-interval counters before sending, restore via new ISiteHealthCollector.AddIntervalCounters on transport failure so counts aren't silently lost.

Fire-and-forget / shutdown waits:
- InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks via OnlyOnFaulted continuation (Warning log; response unchanged).
- SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep with a bounded SweepShutdownWaitTimeout (10s).

Leak / refactor:
- Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its own try/catch so a throw doesn't leak the relay actor or _activeStreams entry.
- Comm-022: VERIFIED already-closed by Comm-016's dead-code purge.
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync (auth-failure exit-code contract unified).

Defensive / validation:
- CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config prints a warning and returns defaults instead of crashing the CLI.
- Host-022: ParseLevel emits stderr one-shot warning for unrecognised MinimumLevel instead of silently coercing to Information.
- ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the per-call CTS is the sole timeout source (was clipped to 100s by .NET).
- Security-020: New SecurityOptionsValidator (IValidateOptions) rejects empty LdapServer/LdapSearchBase with ValidateOnStart.
- DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/DeleteTimedOut audit entries (mirrors DeployFailed pattern).

Plus reconciled stale per-module Open-findings counters that had drifted from prior sessions. 20+ new regression tests across 11 test projects; build clean; affected suites all green. README regenerated: 75 open (was 93).
This commit is contained in:
@@ -513,4 +513,73 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable
|
||||
// counts only sweep retries, so a freshly buffered message has RetryCount 0.
|
||||
Assert.Equal(0, msg.RetryCount);
|
||||
}
|
||||
|
||||
// ─── StoreAndForward-024: StopAsync waits for the in-flight sweep ───
|
||||
|
||||
/// <summary>
/// StoreAndForward-024: <see cref="StoreAndForwardService.StopAsync"/> must
/// not return until any in-flight retry sweep has completed (or the bounded
/// shutdown timeout fires). Pre-fix it disposed the timer and returned
/// immediately, leaving a mid-flight sweep touching disposed dependencies.
/// </summary>
/// <remarks>
/// The test parks the delivery handler on a <see cref="TaskCompletionSource"/>,
/// calls <c>StopAsync</c> while the handler is still parked, and asserts that
/// the returned task only completes after the handler has been released.
/// A <c>finally</c> block always releases the handler and stops the service so
/// that a failing assertion cannot leave a blocked sweep or a live retry timer
/// behind for the rest of the test run.
/// </remarks>
[Fact]
public async Task StopAsync_AwaitsInFlightRetrySweep_BeforeReturning()
{
    // Build a service whose timer fires almost immediately, with a handler
    // that pauses in the middle of delivery so we can observe StopAsync's
    // wait behaviour.
    var dbName = $"StopWait_{Guid.NewGuid():N}";
    var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";

    // Hold one connection open for the whole test so the shared in-memory
    // database is not discarded between the service's own connections.
    using var keepAlive = new SqliteConnection(connStr);
    keepAlive.Open();

    var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
    await storage.InitializeAsync();

    var options = new StoreAndForwardOptions
    {
        DefaultRetryInterval = TimeSpan.Zero,
        DefaultMaxRetries = 3,
        // Fire almost immediately so the sweep is in-flight by the time we call StopAsync.
        RetryTimerInterval = TimeSpan.FromMilliseconds(20),
    };
    var service = new StoreAndForwardService(
        storage, options, NullLogger<StoreAndForwardService>.Instance);

    // Pre-seed a buffered message so the sweep has work to do, and a
    // handler that blocks until we release it.
    var handlerEntered = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var releaseHandler = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var handlerCompleted = false;
    service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, async _ =>
    {
        handlerEntered.TrySetResult();
        await releaseHandler.Task;
        handlerCompleted = true;
        return true;
    });

    var seed = await service.EnqueueAsync(
        StoreAndForwardCategory.ExternalSystem, "api", """{}""",
        attemptImmediateDelivery: false);
    Assert.True(seed.WasBuffered);

    // Tracks whether the test body already asked the service to stop, so the
    // cleanup path below does not call StopAsync a second time.
    var stopRequested = false;

    await service.StartAsync();
    try
    {
        // Wait until the timer-driven sweep has called into the handler.
        var entered = await Task.WhenAny(handlerEntered.Task, Task.Delay(TimeSpan.FromSeconds(2)));
        Assert.Same(handlerEntered.Task, entered);
        Assert.False(handlerCompleted, "Handler should still be paused inside the sweep.");

        // Kick StopAsync — it must NOT return until the sweep finishes. The
        // handler is still parked, so a correct StopAsync cannot have
        // completed by the time we inspect the returned task.
        stopRequested = true;
        var stopTask = service.StopAsync();
        Assert.False(stopTask.IsCompleted,
            "StopAsync returned before the in-flight sweep was given a chance to finish.");

        // Release the handler — StopAsync should now complete shortly.
        releaseHandler.SetResult();
        await stopTask.WaitAsync(TimeSpan.FromSeconds(5));
        Assert.True(handlerCompleted,
            "Sweep handler must have finished before StopAsync returned.");
    }
    finally
    {
        // Failure-path cleanup: never leave the sweep parked in the handler.
        // TrySetResult is a no-op when the happy path already released it.
        releaseHandler.TrySetResult();

        // If an assertion failed before StopAsync was requested, stop the
        // service here so its retry timer does not outlive the test.
        if (!stopRequested)
        {
            await service.StopAsync();
        }
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user