fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings
Async cancellation hygiene, fire-and-forget observability, retry/shutdown semantics, and audit-row coverage across 9 modules.

Highlights:

Cancellation & lifecycle:
- AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the captured SyncContext that risked sync-over-async deadlock.
- AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS, threaded through drain paths instead of CancellationToken.None.
- Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls.
- Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM during the bounded-retry window aborts cleanly.

Cursor / retry / counter correctness:
- AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since` when any row's idempotent insert is still being retried (per-EventId retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical abandon). No more silent abandonment of permanently-failing rows.
- ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's SPLIT loop — by class-doc construction the catch could only mask real failures and let the next iteration create permanent partition holes.
- HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot per-interval counters before sending, restore via new ISiteHealthCollector.AddIntervalCounters on transport failure so counts aren't silently lost.

Fire-and-forget / shutdown waits:
- InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks via OnlyOnFaulted continuation (Warning log; response unchanged).
- SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep with a bounded SweepShutdownWaitTimeout (10s).

Leak / refactor:
- Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its own try/catch so a throw doesn't leak the relay actor or _activeStreams entry.
- Comm-022: VERIFIED already-closed by Comm-016's dead-code purge.
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync (auth-failure exit-code contract unified).

Defensive / validation:
- CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config prints a warning and returns defaults instead of crashing the CLI.
- Host-022: ParseLevel emits stderr one-shot warning for unrecognised MinimumLevel instead of silently coercing to Information.
- ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the per-call CTS is the sole timeout source (was clipped to 100s by .NET).
- Security-020: New SecurityOptionsValidator (IValidateOptions) rejects empty LdapServer/LdapSearchBase with ValidateOnStart.
- DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/DeleteTimedOut audit entries (mirrors DeployFailed pattern).

Plus reconciled stale per-module Open-findings counters that had drifted from prior sessions.

20+ new regression tests across 11 test projects; build clean; affected suites all green. README regenerated: 75 open (was 93).
This commit is contained in:
@@ -96,6 +96,27 @@ public class SiteAuditReconciliationActor : ReceiveActor
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, bool> _stalled = new();
|
||||
|
||||
/// <summary>
|
||||
/// AuditLog-004: per-EventId retry counter for rows whose central insert
|
||||
/// threw. While a row keeps failing AND is below
|
||||
/// <see cref="MaxPermanentInsertAttempts"/>, the cursor is held back so the
|
||||
/// next reconciliation tick re-pulls and retries the row. Crossing the
|
||||
/// threshold logs Critical and permanently abandons the row (cursor
|
||||
/// advances past it) so a truly broken row cannot block all subsequent
|
||||
/// progress for a site. The counter is in-memory only — singleton restart
|
||||
/// resets it, which is safe because the cursor also resets on restart and
|
||||
/// the next tick re-pulls everything.
|
||||
/// </summary>
|
||||
private readonly Dictionary<Guid, int> _failedInsertAttempts = new();
|
||||
|
||||
/// <summary>
|
||||
/// AuditLog-004: number of consecutive central-insert failures before a row
|
||||
/// is permanently abandoned with a Critical log entry and the cursor is
|
||||
/// allowed to advance past it. Five attempts at the 5-minute default tick
|
||||
/// is ~25 min of retry budget before a stuck row stops blocking progress.
|
||||
/// </summary>
|
||||
private const int MaxPermanentInsertAttempts = 5;
|
||||
|
||||
private ICancelable? _timer;
|
||||
|
||||
/// <summary>
|
||||
@@ -232,9 +253,11 @@ public class SiteAuditReconciliationActor : ReceiveActor
|
||||
.ConfigureAwait(false);
|
||||
|
||||
var maxOccurred = since;
|
||||
var hasUnresolvedFailure = false;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
foreach (var evt in response.Events)
|
||||
{
|
||||
var advanceForThisRow = false;
|
||||
try
|
||||
{
|
||||
// Idempotent repository write: duplicate EventIds (from a
|
||||
@@ -243,29 +266,58 @@ public class SiteAuditReconciliationActor : ReceiveActor
|
||||
// InsertIfNotExistsAsync.
|
||||
var ingested = evt with { IngestedAtUtc = nowUtc };
|
||||
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
|
||||
_failedInsertAttempts.Remove(evt.EventId);
|
||||
advanceForThisRow = true;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Per-row catch so one bad event does not abandon the rest of
|
||||
// the batch. The cursor still advances based on OccurredAtUtc
|
||||
// — the row was returned by the site, so the next tick won't
|
||||
// re-fetch it; if it permanently fails to persist, that's an
|
||||
// operational concern surfaced by the log, not a hot-loop
|
||||
// trigger.
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId}.",
|
||||
evt.EventId,
|
||||
site.SiteId);
|
||||
// AuditLog-004: per-row catch so one bad event does not abandon
|
||||
// the rest of the batch. Track the failure count per EventId —
|
||||
// below MaxPermanentInsertAttempts the cursor is HELD BACK so
|
||||
// the next tick re-pulls and retries; at the threshold the row
|
||||
// is permanently abandoned (LogCritical + cursor advances past)
|
||||
// to keep a truly broken row from blocking all subsequent
|
||||
// progress for the site.
|
||||
var attempts = _failedInsertAttempts.GetValueOrDefault(evt.EventId) + 1;
|
||||
_failedInsertAttempts[evt.EventId] = attempts;
|
||||
|
||||
if (attempts >= MaxPermanentInsertAttempts)
|
||||
{
|
||||
_logger.LogCritical(
|
||||
ex,
|
||||
"Permanently abandoning AuditEvent {EventId} from site {SiteId} after {Attempts} consecutive insert failures; cursor will advance past it.",
|
||||
evt.EventId,
|
||||
site.SiteId,
|
||||
attempts);
|
||||
_failedInsertAttempts.Remove(evt.EventId);
|
||||
advanceForThisRow = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId} (attempt {Attempts}/{Max}); cursor held back for retry.",
|
||||
evt.EventId,
|
||||
site.SiteId,
|
||||
attempts,
|
||||
MaxPermanentInsertAttempts);
|
||||
hasUnresolvedFailure = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (evt.OccurredAtUtc > maxOccurred)
|
||||
if (advanceForThisRow && evt.OccurredAtUtc > maxOccurred)
|
||||
{
|
||||
maxOccurred = evt.OccurredAtUtc;
|
||||
}
|
||||
}
|
||||
|
||||
_cursors[site.SiteId] = maxOccurred;
|
||||
// AuditLog-004: only advance the persisted cursor if no event in this
|
||||
// batch is still being retried. Leaving the cursor at `since` re-pulls
|
||||
// the whole batch next tick — successful rows are no-ops thanks to
|
||||
// InsertIfNotExistsAsync's idempotency, and the failing row gets
|
||||
// another attempt. Once it succeeds (or hits the permanent-abandon
|
||||
// threshold) the cursor unblocks naturally.
|
||||
_cursors[site.SiteId] = hasUnresolvedFailure ? since : maxOccurred;
|
||||
|
||||
var nonDraining = response.MoreAvailable && response.Events.Count > 0;
|
||||
UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream);
|
||||
|
||||
Reference in New Issue
Block a user