fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings
Async cancellation hygiene, fire-and-forget observability, retry/shutdown semantics, and audit-row coverage across 9 modules.

Highlights:

Cancellation & lifecycle:
- AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the captured SyncContext that risked sync-over-async deadlock.
- AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS, threaded through drain paths instead of CancellationToken.None.
- Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls.
- Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM during the bounded-retry window aborts cleanly.

Cursor / retry / counter correctness:
- AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since` when any row's idempotent insert is still being retried (per-EventId retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical abandon). No more silent abandonment of permanently-failing rows.
- ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's SPLIT loop — by class-doc construction the catch could only mask real failures and let the next iteration create permanent partition holes.
- HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot per-interval counters before sending, restore via new ISiteHealthCollector.AddIntervalCounters on transport failure so counts aren't silently lost.

Fire-and-forget / shutdown waits:
- InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks via OnlyOnFaulted continuation (Warning log; response unchanged).
- SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep with a bounded SweepShutdownWaitTimeout (10s).

Leak / refactor:
- Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its own try/catch so a throw doesn't leak the relay actor or _activeStreams entry.
- Comm-022: VERIFIED already-closed by Comm-016's dead-code purge.
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync (auth-failure exit-code contract unified).

Defensive / validation:
- CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config prints a warning and returns defaults instead of crashing the CLI.
- Host-022: ParseLevel emits stderr one-shot warning for unrecognised MinimumLevel instead of silently coercing to Information.
- ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the per-call CTS is the sole timeout source (was clipped to 100s by .NET).
- Security-020: New SecurityOptionsValidator (IValidateOptions) rejects empty LdapServer/LdapSearchBase with ValidateOnStart.
- DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/DeleteTimedOut audit entries (mirrors DeployFailed pattern).

Plus reconciled stale per-module Open-findings counters that had drifted from prior sessions.

20+ new regression tests across 11 test projects; build clean; affected suites all green. README regenerated: 75 open (was 93).
This commit is contained in:
@@ -1007,6 +1007,151 @@ public class DeploymentServiceTests : TestKit
|
||||
public SilentProbeActor() => ReceiveAny(_ => { });
|
||||
}
|
||||
|
||||
// ── DeploymentManager-019: lifecycle timeouts must write an audit entry ──
|
||||
|
||||
/// <summary>
/// DeploymentManager-019: a Disable command that times out at the site must
/// still be recorded in the audit log under the documented
/// <c>DisableTimedOut</c> action. Before the fix nothing was written, so the
/// audit trail was silent about the operator's attempt.
/// </summary>
[Fact]
public async Task DisableInstanceAsync_LifecycleTimeout_WritesDisableTimedOutAuditEntry()
{
    // Arrange: an enabled instance that the repository resolves by id.
    var target = new Instance("TimeoutAuditInst") { Id = 61, SiteId = 1, State = InstanceState.Enabled };
    _repo.GetInstanceByIdAsync(61, Arg.Any<CancellationToken>()).Returns(target);

    // The communication actor is a SilentProbeActor, which handles every
    // message with an empty callback and therefore never sends a reply.
    var communication = new CommunicationService(
        Options.Create(new CommunicationOptions { LifecycleTimeout = TimeSpan.FromSeconds(30) }),
        NullLogger<CommunicationService>.Instance);
    communication.SetCommunicationActor(Sys.ActorOf(Props.Create(() => new SilentProbeActor())));

    // NOTE(review): presumably the 300 ms lifecycle deadline below elapses long
    // before the 30 s communication timeout — confirm against DeploymentService.
    var managerOptions = Options.Create(new DeploymentManagerOptions
    {
        OperationLockTimeout = TimeSpan.FromSeconds(5),
        LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300),
    });

    var sut = new DeploymentService(
        _repo,
        Substitute.For<ISiteRepository>(),
        _pipeline,
        communication,
        _lockManager,
        _audit,
        new DiffService(),
        new DeploymentStatusNotifier(NullLogger<DeploymentStatusNotifier>.Instance),
        managerOptions,
        NullLogger<DeploymentService>.Instance);

    // Act
    var outcome = await sut.DisableInstanceAsync(61, "operator-jane");

    // Assert: the command fails, and exactly one DisableTimedOut audit entry
    // is written for the operator and the targeted instance.
    Assert.True(outcome.IsFailure);

    await _audit.Received(1).LogAsync(
        "operator-jane",
        "DisableTimedOut",
        "Instance",
        "61",
        target.UniqueName,
        Arg.Any<object?>(),
        Arg.Any<CancellationToken>());
}
|
||||
|
||||
/// <summary>
/// DeploymentManager-019: the Enable path carries the same audit guarantee —
/// a lifecycle timeout must produce an <c>EnableTimedOut</c> audit entry.
/// </summary>
[Fact]
public async Task EnableInstanceAsync_LifecycleTimeout_WritesEnableTimedOutAuditEntry()
{
    // Arrange: a disabled instance that the repository resolves by id.
    var target = new Instance("EnableTimeoutInst") { Id = 62, SiteId = 1, State = InstanceState.Disabled };
    _repo.GetInstanceByIdAsync(62, Arg.Any<CancellationToken>()).Returns(target);

    // The communication actor is a SilentProbeActor, so no reply is ever sent
    // back for the Enable command.
    var communication = new CommunicationService(
        Options.Create(new CommunicationOptions { LifecycleTimeout = TimeSpan.FromSeconds(30) }),
        NullLogger<CommunicationService>.Instance);
    communication.SetCommunicationActor(Sys.ActorOf(Props.Create(() => new SilentProbeActor())));

    // NOTE(review): presumably the 300 ms lifecycle deadline below elapses long
    // before the 30 s communication timeout — confirm against DeploymentService.
    var managerOptions = Options.Create(new DeploymentManagerOptions
    {
        OperationLockTimeout = TimeSpan.FromSeconds(5),
        LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300),
    });

    var sut = new DeploymentService(
        _repo,
        Substitute.For<ISiteRepository>(),
        _pipeline,
        communication,
        _lockManager,
        _audit,
        new DiffService(),
        new DeploymentStatusNotifier(NullLogger<DeploymentStatusNotifier>.Instance),
        managerOptions,
        NullLogger<DeploymentService>.Instance);

    // Act
    var outcome = await sut.EnableInstanceAsync(62, "operator-jane");

    // Assert: the command fails, and exactly one EnableTimedOut audit entry
    // is written for the operator and the targeted instance.
    Assert.True(outcome.IsFailure);

    await _audit.Received(1).LogAsync(
        "operator-jane",
        "EnableTimedOut",
        "Instance",
        "62",
        target.UniqueName,
        Arg.Any<object?>(),
        Arg.Any<CancellationToken>());
}
|
||||
|
||||
/// <summary>
/// DeploymentManager-019: the Delete path carries the same audit guarantee —
/// a lifecycle timeout must produce a <c>DeleteTimedOut</c> audit entry.
/// </summary>
[Fact]
public async Task DeleteInstanceAsync_LifecycleTimeout_WritesDeleteTimedOutAuditEntry()
{
    // Arrange: an enabled instance that the repository resolves by id.
    var target = new Instance("DeleteTimeoutInst") { Id = 63, SiteId = 1, State = InstanceState.Enabled };
    _repo.GetInstanceByIdAsync(63, Arg.Any<CancellationToken>()).Returns(target);

    // The communication actor is a SilentProbeActor, so no reply is ever sent
    // back for the Delete command.
    var communication = new CommunicationService(
        Options.Create(new CommunicationOptions { LifecycleTimeout = TimeSpan.FromSeconds(30) }),
        NullLogger<CommunicationService>.Instance);
    communication.SetCommunicationActor(Sys.ActorOf(Props.Create(() => new SilentProbeActor())));

    // NOTE(review): presumably the 300 ms lifecycle deadline below elapses long
    // before the 30 s communication timeout — confirm against DeploymentService.
    var managerOptions = Options.Create(new DeploymentManagerOptions
    {
        OperationLockTimeout = TimeSpan.FromSeconds(5),
        LifecycleCommandTimeout = TimeSpan.FromMilliseconds(300),
    });

    var sut = new DeploymentService(
        _repo,
        Substitute.For<ISiteRepository>(),
        _pipeline,
        communication,
        _lockManager,
        _audit,
        new DiffService(),
        new DeploymentStatusNotifier(NullLogger<DeploymentStatusNotifier>.Instance),
        managerOptions,
        NullLogger<DeploymentService>.Instance);

    // Act
    var outcome = await sut.DeleteInstanceAsync(63, "operator-jane");

    // Assert: the command fails, and exactly one DeleteTimedOut audit entry
    // is written for the operator and the targeted instance.
    Assert.True(outcome.IsFailure);

    await _audit.Received(1).LogAsync(
        "operator-jane",
        "DeleteTimedOut",
        "Instance",
        "63",
        target.UniqueName,
        Arg.Any<object?>(),
        Arg.Any<CancellationToken>());
}
|
||||
|
||||
// ── DeploymentManager-003: post-success persistence must commit the Success status ──
|
||||
|
||||
[Fact]
|
||||
|
||||
Reference in New Issue
Block a user