fix(error-handling): close Theme 4 — 18 cancellation / fire-and-forget findings

Async cancellation hygiene, fire-and-forget observability, retry/shutdown
semantics, and audit-row coverage across 9 modules. Highlights:

Cancellation & lifecycle:
- AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the
  captured SyncContext that risked sync-over-async deadlock.
- AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS,
  threaded through drain paths instead of CancellationToken.None.
- Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls.
- Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM
  during the bounded-retry window aborts cleanly.

Cursor / retry / counter correctness:
- AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since`
  when any row's idempotent insert is still being retried (per-EventId
  retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical
  abandon). No more silent abandonment of permanently-failing rows.
- ConfigDB-019: Dropped the catch-and-continue in EnsureLookaheadAsync's
  SPLIT loop — per the class docs, anything caught there can only be a real
  failure, so swallowing it let the next iteration create permanent
  partition holes.
- HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot
  per-interval counters before sending, restore via new
  ISiteHealthCollector.AddIntervalCounters on transport failure so counts
  aren't silently lost.

Fire-and-forget / shutdown waits:
- InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks
  via OnlyOnFaulted continuation (Warning log; response unchanged).
- SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep
  with a bounded SweepShutdownWaitTimeout (10s).

Leak / refactor:
- Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its
  own try/catch so a throw doesn't leak the relay actor or _activeStreams
  entry.
- Comm-022: VERIFIED already-closed by Comm-016's dead-code purge.
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync
  (auth-failure exit-code contract unified).

Defensive / validation:
- CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config
  prints a warning and returns defaults instead of crashing the CLI.
- Host-022: ParseLevel emits stderr one-shot warning for unrecognised
  MinimumLevel instead of silently coercing to Information.
- ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the
  per-call CTS is the sole timeout source (was clipped to 100s by .NET).
- Security-020: New SecurityOptionsValidator (IValidateOptions) rejects
  empty LdapServer/LdapSearchBase with ValidateOnStart.
- DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/
  DeleteTimedOut audit entries (mirrors DeployFailed pattern).

Plus reconciled stale per-module Open-findings counters that had drifted
from prior sessions.

20+ new regression tests across 11 test projects; build clean; affected
suites all green. README regenerated: 75 open (was 93).
This commit is contained in:
Joseph Doherty
2026-05-28 07:13:28 -04:00
parent 819f1b4665
commit 6ae0fea558
44 changed files with 1708 additions and 200 deletions
@@ -434,6 +434,10 @@ public class SecurityReviewRegressionTests
services.AddLogging();
services.AddDataProtection();
services.AddSecurity();
// Security-020: the cookie PostConfigure reads SecurityOptions.Value,
// which triggers SecurityOptionsValidator — supply the required LDAP
// fields so the cookie wiring under test can be resolved.
ConfigureValidLdapDefaults(services);
using var provider = services.BuildServiceProvider();
var cookieOptions = provider
@@ -446,6 +450,20 @@ public class SecurityReviewRegressionTests
Assert.True(cookieOptions.Cookie.HttpOnly);
}
/// <summary>
/// Security-020: registers a <see cref="SecurityOptions"/> configuration
/// that fills <see cref="SecurityOptions.LdapServer"/> and
/// <see cref="SecurityOptions.LdapSearchBase"/> with placeholder values.
/// The cookie-policy tests in this class call it so that resolving the
/// options does not fail <see cref="SecurityOptionsValidator"/>; those
/// tests assert on the cookie options only, never on the LDAP values.
/// </summary>
/// <param name="services">Service collection the LDAP placeholders are added to.</param>
private static void ConfigureValidLdapDefaults(IServiceCollection services)
{
    services.Configure<SecurityOptions>(securityOptions =>
    {
        securityOptions.LdapServer = "ldap.example.com";
        securityOptions.LdapSearchBase = "dc=example,dc=com";
    });
}
// --- CentralUI-005: cookie auth must use a sliding session window ---
// Documented policy (CLAUDE.md Security & Auth): sliding refresh with a
// 30-minute idle timeout. The cookie middleware must enable SlidingExpiration
@@ -458,6 +476,7 @@ public class SecurityReviewRegressionTests
services.AddLogging();
services.AddDataProtection();
services.AddSecurity();
ConfigureValidLdapDefaults(services);
using var provider = services.BuildServiceProvider();
var cookieOptions = provider
@@ -474,6 +493,7 @@ public class SecurityReviewRegressionTests
services.AddLogging();
services.AddDataProtection();
services.AddSecurity();
ConfigureValidLdapDefaults(services);
// The idle timeout drives the cookie's expiry window.
services.Configure<SecurityOptions>(o => o.IdleTimeoutMinutes = 30);
@@ -492,6 +512,7 @@ public class SecurityReviewRegressionTests
services.AddLogging();
services.AddDataProtection();
services.AddSecurity();
ConfigureValidLdapDefaults(services);
services.Configure<SecurityOptions>(o => o.IdleTimeoutMinutes = 45);
using var provider = services.BuildServiceProvider();
@@ -1196,3 +1217,100 @@ public class AuthorizationPolicyTests
}
#endregion
#region Code Review Regression Tests Security-020
/// <summary>
/// Security-020 regression coverage for <see cref="SecurityOptionsValidator"/>.
/// The validator has to refuse a blank <see cref="SecurityOptions.LdapServer"/>
/// or <see cref="SecurityOptions.LdapSearchBase"/> and name the offending
/// configuration key in its failure message, so a mistyped appsettings
/// section is caught at boot rather than as a generic LDAP error on the
/// first real login.
/// </summary>
public class SecurityOptionsValidatorTests
{
    private const string ValidLdapServer = "ldap.example.com";
    private const string ValidLdapSearchBase = "dc=example,dc=com";

    /// <summary>
    /// Builds a <see cref="SecurityOptions"/> whose LDAP fields default to
    /// valid placeholders; a test overrides only the field it exercises.
    /// </summary>
    private static SecurityOptions BuildOptions(
        string ldapServer = ValidLdapServer,
        string ldapSearchBase = ValidLdapSearchBase)
    {
        return new SecurityOptions
        {
            LdapServer = ldapServer,
            LdapSearchBase = ldapSearchBase,
        };
    }

    [Fact]
    public void Validate_AllRequiredFieldsSet_Succeeds()
    {
        var candidate = BuildOptions();

        var outcome = new SecurityOptionsValidator().Validate(name: null, candidate);

        Assert.True(outcome.Succeeded);
    }

    [Theory]
    [InlineData("")]
    [InlineData(" ")]
    public void Validate_EmptyOrWhitespaceLdapServer_Fails(string ldapServer)
    {
        var candidate = BuildOptions(ldapServer: ldapServer);

        var outcome = new SecurityOptionsValidator().Validate(name: null, candidate);

        Assert.True(outcome.Failed);
        // The message carries the full Section:Field key so an operator can
        // locate the setting directly.
        Assert.Contains("Security:LdapServer", outcome.FailureMessage);
    }

    [Theory]
    [InlineData("")]
    [InlineData(" ")]
    public void Validate_EmptyOrWhitespaceLdapSearchBase_Fails(string ldapSearchBase)
    {
        var candidate = BuildOptions(ldapSearchBase: ldapSearchBase);

        var outcome = new SecurityOptionsValidator().Validate(name: null, candidate);

        Assert.True(outcome.Failed);
        Assert.Contains("Security:LdapSearchBase", outcome.FailureMessage);
    }

    [Fact]
    public void Validate_BothRequiredFieldsEmpty_ReportsBoth()
    {
        var candidate = BuildOptions(string.Empty, string.Empty);

        var outcome = new SecurityOptionsValidator().Validate(name: null, candidate);

        Assert.True(outcome.Failed);
        // Each missing key must appear in the one message, sparing the
        // operator a second run to learn about the second missing field.
        Assert.Contains("Security:LdapServer", outcome.FailureMessage);
        Assert.Contains("Security:LdapSearchBase", outcome.FailureMessage);
    }

    [Fact]
    public void AddSecurity_RegistersSecurityOptionsValidator()
    {
        var services = new ServiceCollection();
        services.AddLogging();
        services.AddDataProtection();
        services.AddSecurity();

        using var provider = services.BuildServiceProvider();

        // Security-020's ValidateOnStart() only fires if the validator is
        // registered as an IValidateOptions<SecurityOptions>, so look for it
        // among the resolved validators.
        var securityValidators = provider
            .GetServices<IValidateOptions<SecurityOptions>>()
            .OfType<SecurityOptionsValidator>();

        Assert.NotEmpty(securityValidators);
    }
}
#endregion