fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings
Async cancellation hygiene, fire-and-forget observability, retry/shutdown semantics, and audit-row coverage across 9 modules. Highlights: Cancellation & lifecycle: - AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the captured SyncContext that risked sync-over-async deadlock. - AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS, threaded through drain paths instead of CancellationToken.None. - Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls. - Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM during the bounded-retry window aborts cleanly. Cursor / retry / counter correctness: - AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since` when any row's idempotent insert is still being retried (per-EventId retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical abandon). No more silent abandonment of permanently-failing rows. - ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's SPLIT loop — by class-doc construction the catch could only mask real failures and let the next iteration create permanent partition holes. - HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot per-interval counters before sending, restore via new ISiteHealthCollector.AddIntervalCounters on transport failure so counts aren't silently lost. Fire-and-forget / shutdown waits: - InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks via OnlyOnFaulted continuation (Warning log; response unchanged). - SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep with a bounded SweepShutdownWaitTimeout (10s). Leak / refactor: - Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its own try/catch so a throw doesn't leak the relay actor or _activeStreams entry. - Comm-022: VERIFIED already-closed by Comm-016's dead-code purge. - CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync (auth-failure exit-code contract unified). Defensive / validation: - CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config prints a warning and returns defaults instead of crashing the CLI. - Host-022: ParseLevel emits stderr one-shot warning for unrecognised MinimumLevel instead of silently coercing to Information. - ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the per-call CTS is the sole timeout source (was clipped to 100s by .NET). - Security-020: New SecurityOptionsValidator (IValidateOptions) rejects empty LdapServer/LdapSearchBase with ValidateOnStart. - DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/ DeleteTimedOut audit entries (mirrors DeployFailed pattern). Plus reconciled stale per-module Open-findings counters that had drifted from prior sessions. 20+ new regression tests across 11 test projects; build clean; affected suites all green. README regenerated: 75 open (was 93).
This commit is contained in:
@@ -689,4 +689,115 @@ public class AuditWriteMiddlewareTests
|
||||
$"ResponseSummary byte count {Encoding.UTF8.GetByteCount(evt.ResponseSummary!)} exceeded cap {cap}");
|
||||
Assert.True(evt.PayloadTruncated);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// InboundAPI-018: asynchronously faulted audit-write tasks must be
|
||||
// observed (logged) rather than silently dropped — but must still NOT
|
||||
// alter the user-facing HTTP response (alog.md §13).
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
/// <summary>
|
||||
/// Test-only writer whose <see cref="WriteAsync"/> returns a Task that
|
||||
/// faults AFTER an asynchronous boundary, so the throw happens after
|
||||
/// <see cref="AuditWriteMiddleware"/>'s synchronous try/catch can see it —
|
||||
/// exactly the fire-and-forget bug InboundAPI-018 closes.
|
||||
/// </summary>
|
||||
private sealed class AsyncFaultingAuditWriter : ICentralAuditWriter
|
||||
{
|
||||
public Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
return FaultAsync();
|
||||
|
||||
static async Task FaultAsync()
|
||||
{
|
||||
// Yield off-thread so the fault surfaces ASYNCHRONOUSLY (not
|
||||
// captured by a synchronous try/catch around the WriteAsync
|
||||
// call site).
|
||||
await Task.Yield();
|
||||
throw new InvalidOperationException("async audit write failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Captures log entries written through a <see cref="ILogger{TCategoryName}"/>
|
||||
/// so the test can assert on the Warning that
|
||||
/// <see cref="AuditWriteMiddleware.ObserveAuditWriteFault"/> emits.
|
||||
/// </summary>
|
||||
private sealed class RecordingLogger : Microsoft.Extensions.Logging.ILogger<AuditWriteMiddleware>
|
||||
{
|
||||
public List<(Microsoft.Extensions.Logging.LogLevel Level, string Message, Exception? Exception)> Entries { get; } = new();
|
||||
|
||||
public IDisposable BeginScope<TState>(TState state) where TState : notnull => NullScope.Instance;
|
||||
|
||||
public bool IsEnabled(Microsoft.Extensions.Logging.LogLevel logLevel) => true;
|
||||
|
||||
public void Log<TState>(
|
||||
Microsoft.Extensions.Logging.LogLevel logLevel,
|
||||
Microsoft.Extensions.Logging.EventId eventId,
|
||||
TState state,
|
||||
Exception? exception,
|
||||
Func<TState, Exception?, string> formatter)
|
||||
{
|
||||
lock (Entries)
|
||||
{
|
||||
Entries.Add((logLevel, formatter(state, exception), exception));
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class NullScope : IDisposable
|
||||
{
|
||||
public static readonly NullScope Instance = new();
|
||||
public void Dispose() { }
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AuditWriter_AsyncFault_IsObserved_AsWarning_AndDoesNotAlterResponse()
|
||||
{
|
||||
var writer = new AsyncFaultingAuditWriter();
|
||||
var logger = new RecordingLogger();
|
||||
var ctx = BuildContext();
|
||||
|
||||
var mw = new AuditWriteMiddleware(
|
||||
next: _ =>
|
||||
{
|
||||
ctx.Response.StatusCode = 200;
|
||||
return Task.CompletedTask;
|
||||
},
|
||||
auditWriter: writer,
|
||||
logger: logger,
|
||||
options: new StaticAuditLogOptionsMonitor(new AuditLogOptions()));
|
||||
|
||||
await mw.InvokeAsync(ctx);
|
||||
|
||||
// The user-facing response is untouched — audit emission is best-effort.
|
||||
Assert.Equal(200, ctx.Response.StatusCode);
|
||||
|
||||
// Give the off-thread continuation a moment to fire and log. Spin
|
||||
// briefly rather than sleep-then-assert so the test is resilient to
|
||||
// scheduler jitter without inflating runtime on success.
|
||||
var deadline = DateTime.UtcNow.AddSeconds(2);
|
||||
while (DateTime.UtcNow < deadline)
|
||||
{
|
||||
lock (logger.Entries)
|
||||
{
|
||||
if (logger.Entries.Any(e =>
|
||||
e.Level == Microsoft.Extensions.Logging.LogLevel.Warning
|
||||
&& e.Exception is not null
|
||||
&& e.Message.Contains("async audit write faulted")))
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
await Task.Delay(20);
|
||||
}
|
||||
|
||||
// If we reach this point, the continuation did not fire — pre-fix the
|
||||
// fault would have been swallowed entirely and no log line emitted.
|
||||
var snapshot = logger.Entries.Select(e => $"{e.Level}: {e.Message}").ToList();
|
||||
Assert.Fail(
|
||||
"Expected a Warning log entry observing the async audit-write fault — none found. " +
|
||||
$"Entries: [{string.Join(", ", snapshot)}]");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user