fix(audit): robust central options binding + interval clamps + doc/contract fixes (review)

This commit is contained in:
Joseph Doherty
2026-06-15 10:11:49 -04:00
parent 36a08a4145
commit c092e89fd1
7 changed files with 153 additions and 40 deletions
@@ -17,8 +17,10 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// </para>
/// <para>
/// <see cref="IntervalOverride"/> exists for tests to drop the cadence to
/// milliseconds without polluting the production config surface; production
/// binds <see cref="IntervalHours"/> only.
/// milliseconds; production config is expected to set <see cref="IntervalHours"/>
/// only. Because this options class is <c>Bind</c>-ed wholesale, a config value
/// at <c>AuditLog:Purge:IntervalOverride</c> would bind if present (and would
/// bypass the <see cref="Interval"/> minimum clamp) — operators must not set it.
/// </para>
/// </remarks>
public sealed class AuditLogPurgeOptions
@@ -29,15 +31,44 @@ public sealed class AuditLogPurgeOptions
/// <summary>
/// Test-only override for finer control over the tick cadence than
/// whole-hour resolution allows. When non-null, takes precedence over
/// <see cref="IntervalHours"/>. Not bound from config — production
/// config exposes <see cref="IntervalHours"/> only.
/// <see cref="IntervalHours"/> AND bypasses the <see cref="Interval"/>
/// minimum clamp (so tests can use millisecond cadences). Production
/// config exposes <see cref="IntervalHours"/> only and never sets this
/// knob — but because the options class is <c>Bind</c>-ed wholesale, a
/// config value at <c>AuditLog:Purge:IntervalOverride</c> WOULD bind if
/// present; operators must not set it.
/// </summary>
public TimeSpan? IntervalOverride { get; set; }
/// <summary>
/// Resolves the effective tick interval, honouring the test override
/// when set. Falls back to <see cref="IntervalHours"/>.
/// Minimum interval the config-bound <see cref="IntervalHours"/> can
/// resolve to. Clamps a misconfigured <c>IntervalHours: 0</c> (or a
/// negative value) away from <see cref="TimeSpan.Zero"/> — a zero
/// interval would make Akka's <c>ScheduleTellRepeatedlyCancelable</c>
/// spin, looping the partition drop/rebuild dance into a sustained SQL
/// outage. The test-only <see cref="IntervalOverride"/> bypasses this
/// clamp so unit tests can still drop the cadence to milliseconds.
/// </summary>
public TimeSpan Interval =>
IntervalOverride ?? TimeSpan.FromHours(IntervalHours);
private static readonly TimeSpan MinConfiguredInterval = TimeSpan.FromMinutes(1);
/// <summary>
/// Resolves the effective tick interval handed to the scheduler.
/// </summary>
/// <remarks>
/// <para>
/// A strictly positive <see cref="IntervalOverride"/> wins outright and is
/// returned as-is, deliberately skipping the
/// <see cref="MinConfiguredInterval"/> clamp so tests can run at millisecond
/// cadences.
/// </para>
/// <para>
/// Otherwise the value is derived from <see cref="IntervalHours"/> and raised
/// to at least <see cref="MinConfiguredInterval"/>, so a zero/negative config
/// value can never yield <see cref="TimeSpan.Zero"/> (which would spin the
/// scheduler).
/// </para>
/// <para>
/// A zero or negative <see cref="IntervalOverride"/> is ignored rather than
/// returned: the override is bindable from configuration, and honouring a
/// non-positive value would reintroduce exactly the zero-interval failure
/// the clamp exists to prevent.
/// </para>
/// </remarks>
public TimeSpan Interval
{
    get
    {
        // Only a usable (strictly positive) override may bypass the clamp.
        if (IntervalOverride is { } overrideValue && overrideValue > TimeSpan.Zero)
        {
            return overrideValue;
        }

        // Config-bound path (also the fallback for a non-positive override):
        // never hand the scheduler less than the configured minimum.
        var configured = TimeSpan.FromHours(IntervalHours);
        return configured < MinConfiguredInterval ? MinConfiguredInterval : configured;
    }
}
}
@@ -9,11 +9,12 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <remarks>
/// The production implementation wraps <c>ISiteRepository.GetAllSitesAsync</c>
/// and projects each <c>Site</c> to a <see cref="SiteEntry"/> using the
/// site's configured <c>GrpcNodeAAddress</c> (falling back to
/// <c>GrpcNodeBAddress</c> when NodeA is unset). Sites with NO gRPC address
/// configured are silently skipped — the reconciliation pull cannot reach
/// them, but absence of an address is a configuration decision, not a runtime
/// error.
/// site's configured <c>GrpcNodeAAddress</c>. This is a NodeA-only first cut:
/// sites with a blank <c>GrpcNodeAAddress</c> are silently SKIPPED — the
/// reconciliation pull cannot reach them, but absence of an address is a
/// configuration decision, not a runtime error. NodeB-fallback endpoint
/// selection (dial NodeB when NodeA is unset/unreachable) is a follow-up
/// (mirrors the comment in <c>SiteEnumerator.cs</c>).
/// </remarks>
public interface ISiteEnumerator
{
@@ -182,6 +182,10 @@ public class SiteAuditReconciliationActor : ReceiveActor
IReadOnlyList<SiteEntry> sites;
try
{
// No ambient CancellationToken in a ReceiveActor message handler —
// CancellationToken.None (the EnumerateAsync default) is intentional.
// The work is bounded by the 5-min reconciliation tick plus the
// 10s graceful-stop drain on PhaseClusterLeave.
sites = await _sites.EnumerateAsync().ConfigureAwait(false);
}
catch (Exception ex)
@@ -31,18 +31,45 @@ public sealed class SiteAuditReconciliationOptions
/// <summary>
/// Test-only override for finer control over the tick cadence than
/// whole-second resolution allows. When non-null, takes precedence over
/// <see cref="ReconciliationIntervalSeconds"/>. Not bound from config —
/// production config exposes <see cref="ReconciliationIntervalSeconds"/>
/// only.
/// <see cref="ReconciliationIntervalSeconds"/> AND bypasses the
/// <see cref="ReconciliationInterval"/> minimum clamp (so tests can use
/// millisecond cadences). Production config exposes
/// <see cref="ReconciliationIntervalSeconds"/> only and never sets this
/// knob — but because the options class is <c>Bind</c>-ed wholesale, a
/// config value at <c>AuditLog:Reconciliation:ReconciliationIntervalOverride</c>
/// WOULD bind if present; operators must not set it.
/// </summary>
public TimeSpan? ReconciliationIntervalOverride { get; set; }
/// <summary>
/// Resolves the effective tick interval, honouring the test override when
/// set. Falls back to <see cref="ReconciliationIntervalSeconds"/>.
/// Minimum interval the config-bound <see cref="ReconciliationIntervalSeconds"/>
/// can resolve to. Clamps a misconfigured <c>ReconciliationIntervalSeconds: 0</c>
/// (or a negative value) away from <see cref="TimeSpan.Zero"/>, which would make
/// Akka's <c>ScheduleTellRepeatedlyCancelable</c> spin. The test-only
/// <see cref="ReconciliationIntervalOverride"/> bypasses this clamp so unit tests
/// can still drop the cadence to milliseconds.
/// </summary>
public TimeSpan ReconciliationInterval =>
ReconciliationIntervalOverride ?? TimeSpan.FromSeconds(ReconciliationIntervalSeconds);
private static readonly TimeSpan MinConfiguredInterval = TimeSpan.FromSeconds(1);
/// <summary>
/// Resolves the effective reconciliation tick interval handed to the scheduler.
/// </summary>
/// <remarks>
/// <para>
/// A strictly positive <see cref="ReconciliationIntervalOverride"/> wins
/// outright and is returned as-is, deliberately skipping the
/// <see cref="MinConfiguredInterval"/> clamp so tests can run at millisecond
/// cadences.
/// </para>
/// <para>
/// Otherwise the value is derived from
/// <see cref="ReconciliationIntervalSeconds"/> and raised to at least
/// <see cref="MinConfiguredInterval"/>, so a zero/negative config value can
/// never yield <see cref="TimeSpan.Zero"/> (which would spin the scheduler).
/// </para>
/// <para>
/// A zero or negative <see cref="ReconciliationIntervalOverride"/> is ignored
/// rather than returned: the override is bindable from configuration, and
/// honouring a non-positive value would reintroduce exactly the zero-interval
/// failure the clamp exists to prevent.
/// </para>
/// </remarks>
public TimeSpan ReconciliationInterval
{
    get
    {
        // Only a usable (strictly positive) override may bypass the clamp.
        if (ReconciliationIntervalOverride is { } overrideValue && overrideValue > TimeSpan.Zero)
        {
            return overrideValue;
        }

        // Config-bound path (also the fallback for a non-positive override):
        // never hand the scheduler less than the configured minimum.
        var configured = TimeSpan.FromSeconds(ReconciliationIntervalSeconds);
        return configured < MinConfiguredInterval ? MinConfiguredInterval : configured;
    }
}
/// <summary>
/// Maximum number of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
@@ -67,7 +67,9 @@ public sealed class SiteEnumerator : ISiteEnumerator
continue;
}
entries.Add(new SiteEntry(site.SiteIdentifier, site.GrpcNodeAAddress));
// The IsNullOrWhiteSpace guard above proves GrpcNodeAAddress is
// non-null here; explicit null-forgiving for clarity.
entries.Add(new SiteEntry(site.SiteIdentifier, site.GrpcNodeAAddress!));
}
return entries;
@@ -333,6 +333,24 @@ public static class ServiceCollectionExtensions
.Bind(config.GetSection(PartitionMaintenanceSectionName));
services.AddHostedService<AuditLogPartitionMaintenanceService>();
// I1 (review): bind the two central-singleton options HERE rather than in
// AddAuditLogCentralReconciliationClient. AkkaHostedService.RegisterCentralActors
// resolves IOptions<AuditLogPurgeOptions> and IOptions<SiteAuditReconciliationOptions>
// via GetRequiredService when it wires the AuditLogPurgeActor +
// SiteAuditReconciliationActor singletons; AddAuditLogCentralMaintenance is
// ALWAYS called on the central path (the reconciliation-client helper is the
// one that could in principle be dropped), so binding the options here means
// the singletons get a valid IOptions even if the gRPC-client helper is not
// wired — instead of a cryptic InvalidOperationException at GetRequiredService.
// Defaults are fine when the section is absent (24 h purge cadence /
// 5 min reconciliation tick); production exposes IntervalHours /
// ReconciliationIntervalSeconds only — the test-only *Override knobs are
// not intended to be set from config (see the options classes' remarks).
services.AddOptions<AuditLogPurgeOptions>()
.Bind(config.GetSection(PurgeSectionName));
services.AddOptions<SiteAuditReconciliationOptions>()
.Bind(config.GetSection(ReconciliationSectionName));
// M6 Bundle E (T8 + T9): central health snapshot — a single object
// that owns the CentralAuditWriteFailures + AuditRedactionFailure
// Interlocked counters AND surfaces them on
@@ -397,19 +415,21 @@ public static class ServiceCollectionExtensions
/// </para>
/// <para>
/// The production <see cref="ISiteEnumerator"/> (<see cref="SiteEnumerator"/>,
/// wrapping the scoped <c>ISiteRepository</c>) IS registered here, alongside
/// the <see cref="AuditLogPurgeOptions"/> + <see cref="SiteAuditReconciliationOptions"/>
/// bindings — so the two central singletons wired in the Host
/// (<see cref="AuditLogPurgeActor"/> + <see cref="SiteAuditReconciliationActor"/>)
/// can resolve their collaborators + options from the same central-only
/// helper. Keeping the enumerator + options on this central path preserves
/// the "every <c>Add*</c> call is safe from any composition root" invariant:
/// a site host never calls this helper, so it never registers a
/// site-dialing enumerator.
/// wrapping the scoped <c>ISiteRepository</c>) IS registered here — so the
/// <see cref="SiteAuditReconciliationActor"/> singleton wired in the Host can
/// resolve its enumerator + gRPC client from this central-only helper. Keeping
/// the enumerator on this central path preserves the "every <c>Add*</c> call is
/// safe from any composition root" invariant: a site host never calls this
/// helper, so it never registers a site-dialing enumerator. The
/// <see cref="AuditLogPurgeOptions"/> + <see cref="SiteAuditReconciliationOptions"/>
/// bindings live in <see cref="AddAuditLogCentralMaintenance"/> instead (I1
/// review fix) — that helper is unconditionally called on the central path, so
/// the two maintenance singletons get a valid <c>IOptions</c> even if this
/// gRPC-client helper is ever dropped.
/// </para>
/// </remarks>
/// <param name="services">The service collection to register into.</param>
/// <param name="config">Application configuration used to bind the purge + reconciliation options sections.</param>
/// <param name="config">Application configuration used to bind the gRPC client's communication options (purge + reconciliation options are bound by <see cref="AddAuditLogCentralMaintenance"/>).</param>
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
public static IServiceCollection AddAuditLogCentralReconciliationClient(
this IServiceCollection services,
@@ -425,15 +445,12 @@ public static class ServiceCollectionExtensions
// in SiteAuditReconciliationActor / AuditLogPurgeActor).
services.TryAddSingleton<ISiteEnumerator>(sp => new SiteEnumerator(sp));
// Bind the two central-singleton options to their config sections.
// Defaults are fine when the section is absent (24 h purge cadence /
// 5 min reconciliation tick); production exposes IntervalHours /
// ReconciliationIntervalSeconds only — the test-only *Override knobs
// are intentionally not bound.
services.AddOptions<AuditLogPurgeOptions>()
.Bind(config.GetSection(PurgeSectionName));
services.AddOptions<SiteAuditReconciliationOptions>()
.Bind(config.GetSection(ReconciliationSectionName));
// I1 (review): the AuditLogPurgeOptions / SiteAuditReconciliationOptions
// bindings moved to AddAuditLogCentralMaintenance — that helper is always
// called on the central path, so the two maintenance singletons resolve a
// valid IOptions even if this gRPC-client helper is ever dropped. Keep the
// ISiteEnumerator + gRPC client registrations here (they dial sites and are
// central-only by design).
// The invoker owns the per-endpoint GrpcChannel cache, so it must be a
// singleton — a fresh invoker per resolution would leak channels.