Files
ScadaBridge/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs
T
Joseph Doherty 23c0fd417e feat(health): AuditRedactionFailure counter + bridge (#23 M5)
Bundle C task M5-T7 — surface DefaultAuditPayloadFilter redactor
over-redactions as a Site Health metric so a misconfigured /
catastrophic regex shows up on /monitoring/health rather than
disappearing into a NoOp sink.

  - SiteHealthReport: new 'AuditRedactionFailure' int field
    (defaulted to 0 for back-compat with existing producers/tests).
  - ISiteHealthCollector / SiteHealthCollector:
    new IncrementAuditRedactionFailure() — per-interval atomic
    counter with Interlocked, reset on CollectReport, mirroring
    the M2 Bundle G SiteAuditWriteFailures pattern.
  - HealthMetricsAuditRedactionFailureCounter: new bridge in
    ScadaLink.AuditLog.Site that forwards IAuditRedactionFailureCounter
    increments to ISiteHealthCollector — mirrors
    HealthMetricsAuditWriteFailureCounter one-for-one.
  - AddAuditLogHealthMetricsBridge: now ALSO Replaces the
    NoOpAuditRedactionFailureCounter binding with the health-metrics
    bridge, so a single AddAuditLogHealthMetricsBridge() call wires
    both the M2 Bundle G write-failure counter and the M5 Bundle C
    redaction-failure counter into the health report.

Site-side only for M5 — the filter also runs on CentralAuditWriter
and AuditLogIngestActor (where it just keeps the NoOp default), but
a central-side health-metric surface for AuditRedactionFailure is
deferred to M6 alongside the rest of the central health collector
work.

Tests:
  - AuditRedactionFailureMetricTests (HealthMonitoring) covers the
    SiteHealthCollector increment/report/reset shape (3 tests).
  - HealthMetricsAuditRedactionFailureCounterTests (AuditLog) covers
    the AuditLog → HealthMonitoring bridge (3 tests).
  - Existing CountCapturingHealthCollector stub in
    DeploymentManagerRedeployTests extended with the new no-op
    interface method.

Verified: dotnet build clean, all 24 test projects green
(the only failure on the first ScadaLink.SiteRuntime.Tests run was the
known-flaky InstanceActorChildAttributeRaceTests; it passes on re-run
both in isolation and in the full suite, and is unrelated to these changes).
2026-05-20 17:28:33 -04:00

220 lines
12 KiB
C#

using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.AuditLog.Central;
using ScadaLink.AuditLog.Configuration;
using ScadaLink.AuditLog.Payload;
using ScadaLink.AuditLog.Site;
using ScadaLink.AuditLog.Site.Telemetry;
using ScadaLink.Commons.Interfaces.Services;
namespace ScadaLink.AuditLog;
/// <summary>
/// Composition root for the Audit Log (#23) component.
/// </summary>
/// <remarks>
/// <para>
/// M1 registered <see cref="AuditLogOptions"/> + the validator. M2 Bundle E
/// extends the surface with the site-side writer chain
/// (<see cref="SqliteAuditWriter"/> + <see cref="RingBufferFallback"/> +
/// <see cref="FallbackAuditWriter"/>) and the telemetry collaborators
/// (<see cref="ISiteAuditQueue"/>, <see cref="ISiteStreamAuditClient"/>,
/// <see cref="IAuditWriteFailureCounter"/>, <see cref="SiteAuditTelemetryOptions"/>,
/// <see cref="SqliteAuditWriterOptions"/>).
/// </para>
/// <para>
/// Audit Log (#23) sits alongside Notification Outbox (#21) and Site Call
/// Audit (#22). <c>IAuditLogRepository</c> is registered by
/// <c>ScadaLink.ConfigurationDatabase.ServiceCollectionExtensions.AddConfigurationDatabase</c>,
/// so the caller (the Host on the central node) must also call that.
/// </para>
/// </remarks>
public static class ServiceCollectionExtensions
{
    /// <summary>Configuration section bound to <see cref="AuditLogOptions"/>.</summary>
    public const string ConfigSectionName = "AuditLog";

    /// <summary>Configuration section bound to <see cref="SqliteAuditWriterOptions"/>.</summary>
    public const string SiteWriterSectionName = "AuditLog:SiteWriter";

    /// <summary>Configuration section bound to <see cref="SiteAuditTelemetryOptions"/>.</summary>
    public const string SiteTelemetrySectionName = "AuditLog:SiteTelemetry";

    /// <summary>
    /// Registers the Audit Log (#23) component services: options, the site
    /// SQLite writer chain (primary + ring fallback + failure-counter sink),
    /// and the site→central telemetry collaborators. Idempotent re-registration
    /// is not supported; call this exactly once per <see cref="IServiceCollection"/>.
    /// </summary>
    /// <param name="services">The service collection to register into.</param>
    /// <param name="config">
    /// Configuration whose <see cref="ConfigSectionName"/>,
    /// <see cref="SiteWriterSectionName"/> and <see cref="SiteTelemetrySectionName"/>
    /// sections are bound to the corresponding options types.
    /// </param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="config"/> is <c>null</c>.
    /// </exception>
    public static IServiceCollection AddAuditLog(this IServiceCollection services, IConfiguration config)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(config);

        // M1: top-level AuditLogOptions + validator (redaction policy, payload caps, etc.).
        services.AddOptions<AuditLogOptions>()
            .Bind(config.GetSection(ConfigSectionName))
            .ValidateOnStart();
        services.AddSingleton<IValidateOptions<AuditLogOptions>, AuditLogOptionsValidator>();

        // M5 Bundle A: payload filter — truncates oversized RequestSummary /
        // ResponseSummary / ErrorDetail / Extra fields between event
        // construction and persistence. Bundle B layers header / body /
        // SQL-parameter redaction onto the same singleton; Bundle C wires it
        // into the FallbackAuditWriter / CentralAuditWriter / IngestActor
        // paths. Singleton — the filter is stateless and the IOptionsMonitor
        // dependency picks up M5-T8 hot reloads on its own.
        services.AddSingleton<IAuditPayloadFilter, DefaultAuditPayloadFilter>();

        // M5 Bundle B: per-stage redactor-failure counter. NoOp default;
        // Bundle C replaces this binding with the Site Health Monitoring
        // bridge that surfaces failures as AuditRedactionFailure on the site
        // health report. TryAdd so a counter registered earlier is kept.
        services.TryAddSingleton<IAuditRedactionFailureCounter, NoOpAuditRedactionFailureCounter>();

        // M2 Bundle E: site writer + telemetry options bindings.
        // BindConfiguration is not used because the configuration root supplied
        // by the caller may not be the application root — we go through the
        // section explicitly so a partial IConfiguration (e.g. a test stub
        // anchored on the AuditLog section's parent) still works.
        services.AddOptions<SqliteAuditWriterOptions>()
            .Bind(config.GetSection(SiteWriterSectionName));
        services.AddOptions<SiteAuditTelemetryOptions>()
            .Bind(config.GetSection(SiteTelemetrySectionName));

        // SqliteAuditWriter is a singleton with a single owned SqliteConnection
        // and a background writer Task; multiple instances would race on the
        // same file. Registered concretely so the ISiteAuditQueue + IAuditWriter
        // forwards below resolve to the same instance — the actor must observe
        // the writes made via the hot-path interface.
        services.AddSingleton<SqliteAuditWriter>();
        services.AddSingleton<ISiteAuditQueue>(sp => sp.GetRequiredService<SqliteAuditWriter>());

        // RingBufferFallback: drop-oldest in-memory ring used by
        // FallbackAuditWriter when the primary SQLite writer throws. Default
        // capacity is fine for M2 (1024).
        services.AddSingleton<RingBufferFallback>();

        // IAuditWriteFailureCounter: NoOp default. Bundle G overrides this
        // binding with the real Site Health Monitoring counter. Registered
        // with TryAdd — exactly like the redaction-failure counter above — so
        // a counter that is already registered (e.g. the health-metrics bridge
        // wired before this call) is not shadowed by a later NoOp descriptor,
        // which would silently drop the SiteAuditWriteFailures metric.
        services.TryAddSingleton<IAuditWriteFailureCounter, NoOpAuditWriteFailureCounter>();

        // The script-thread surface is FallbackAuditWriter (primary + ring +
        // counter), not the raw SqliteAuditWriter — primary failures must NEVER
        // abort the user-facing action.
        // Bundle C (M5-T6): the IAuditPayloadFilter singleton above is wired
        // through the factory so every event written through this surface is
        // truncated + redacted before it hits SQLite (and the ring on
        // failure).
        services.AddSingleton<IAuditWriter>(sp => new FallbackAuditWriter(
            primary: sp.GetRequiredService<SqliteAuditWriter>(),
            ring: sp.GetRequiredService<RingBufferFallback>(),
            failureCounter: sp.GetRequiredService<IAuditWriteFailureCounter>(),
            logger: sp.GetRequiredService<ILogger<FallbackAuditWriter>>(),
            filter: sp.GetRequiredService<IAuditPayloadFilter>()));

        // ISiteStreamAuditClient: NoOp default. M6's reconciliation work brings
        // the real gRPC-backed implementation (no site→central gRPC channel
        // exists today — sites talk to central via Akka ClusterClient only).
        // Bundle H's integration test substitutes a stub directly into the
        // SiteAuditTelemetryActor's Props.Create call.
        services.AddSingleton<ISiteStreamAuditClient, NoOpSiteStreamAuditClient>();

        // M3 Bundle F: site-side dual emitter for cached-call lifecycle
        // telemetry. ScriptRuntimeContext.ExternalSystem.CachedCall /
        // Database.CachedWrite resolves this through DI and pushes one combined
        // packet per lifecycle event; the forwarder writes the audit half
        // through IAuditWriter and the operational half through the
        // IOperationTrackingStore. The audit writer is always wired (the M2
        // chain above); the operational tracking store is SITE-ONLY (registered
        // by ScadaLink.SiteRuntime). On a Central composition root the tracking
        // store has no registration, so the factory resolves it with GetService
        // (returning null) — the forwarder degrades to "audit-only" emission,
        // mirroring the lazy IAuditWriter chain established in M2.
        services.AddSingleton<ICachedCallTelemetryForwarder>(sp =>
            new CachedCallTelemetryForwarder(
                sp.GetRequiredService<IAuditWriter>(),
                sp.GetService<ScadaLink.Commons.Interfaces.IOperationTrackingStore>(),
                sp.GetRequiredService<ILogger<CachedCallTelemetryForwarder>>()));

        // M3 Bundle F: bridge the store-and-forward retry-loop observer hook
        // to the cached-call forwarder so per-attempt + terminal telemetry
        // emitted from the S&F retry sweep lands on the same SQLite hot-path
        // as the script-thread CachedSubmit row. Registered as a singleton
        // and also bound to ICachedCallLifecycleObserver so AddStoreAndForward
        // can resolve it through DI (Bundle F StoreAndForward wiring change).
        services.AddSingleton<CachedCallLifecycleBridge>();
        services.AddSingleton<ICachedCallLifecycleObserver>(
            sp => sp.GetRequiredService<CachedCallLifecycleBridge>());

        // M4 Bundle B: central direct-write audit writer used by
        // NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to
        // emit AuditLog rows that originate ON central, not via site telemetry.
        // Singleton — the writer is stateless; its per-call scope opens a fresh
        // IAuditLogRepository (a SCOPED EF Core service registered by
        // ScadaLink.ConfigurationDatabase). The interface (ICentralAuditWriter)
        // is intentionally distinct from IAuditWriter so site composition roots
        // do not accidentally bind it; central composition roots that include
        // AddConfigurationDatabase get a working implementation transparently.
        // Bundle C (M5-T6): wire the IAuditPayloadFilter into the factory so
        // NotificationOutboxActor + Inbound API rows are truncated + redacted
        // before they hit MS SQL.
        services.AddSingleton<ICentralAuditWriter>(sp => new CentralAuditWriter(
            sp,
            sp.GetRequiredService<ILogger<CentralAuditWriter>>(),
            sp.GetRequiredService<IAuditPayloadFilter>()));

        return services;
    }

    /// <summary>
    /// Audit Log (#23) M2 Bundle G + M5 Bundle C — swap the default
    /// <see cref="NoOpAuditWriteFailureCounter"/> and
    /// <see cref="NoOpAuditRedactionFailureCounter"/> registrations for the
    /// real <see cref="HealthMetricsAuditWriteFailureCounter"/> /
    /// <see cref="HealthMetricsAuditRedactionFailureCounter"/> bridges so the
    /// FallbackAuditWriter primary-failure counter AND the
    /// DefaultAuditPayloadFilter redactor-failure counter both surface in the
    /// site health report payload as
    /// <c>SiteHealthReport.SiteAuditWriteFailures</c> +
    /// <c>SiteHealthReport.AuditRedactionFailure</c>.
    /// </summary>
    /// <param name="services">The service collection whose counter bindings are replaced.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
    /// <remarks>
    /// <para>
    /// Intended to be called AFTER both <see cref="AddAuditLog"/> (registers the
    /// NoOp defaults this method replaces) and
    /// <c>ScadaLink.HealthMonitoring.ServiceCollectionExtensions.AddHealthMonitoring</c>
    /// or <c>AddSiteHealthMonitoring</c> (registers the
    /// <see cref="ISiteHealthCollector"/> the bridges depend on). Because
    /// <see cref="AddAuditLog"/> registers both NoOp defaults with
    /// <c>TryAddSingleton</c>, bridge registrations made before it are kept
    /// rather than shadowed. Resolving
    /// <see cref="IAuditWriteFailureCounter"/> or
    /// <see cref="IAuditRedactionFailureCounter"/> without an
    /// <see cref="ISiteHealthCollector"/> registration throws
    /// <see cref="InvalidOperationException"/> at <c>GetRequiredService</c>
    /// time — by design, since a silent NoOp would mask a misconfiguration.
    /// </para>
    /// <para>
    /// Idempotent — calling twice replaces each descriptor without piling up
    /// registrations.
    /// </para>
    /// <para>
    /// Site-side only for M5: the central composition root keeps the NoOp
    /// defaults; the central health-metric surface that would expose
    /// <c>AuditRedactionFailure</c> next to the existing central counters
    /// ships in M6.
    /// </para>
    /// </remarks>
    public static IServiceCollection AddAuditLogHealthMetricsBridge(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Replace removes the first existing descriptor for the service type
        // (if any) and then adds the new one, so repeated calls do not stack.
        services.Replace(
            ServiceDescriptor.Singleton<IAuditWriteFailureCounter, HealthMetricsAuditWriteFailureCounter>());
        services.Replace(
            ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, HealthMetricsAuditRedactionFailureCounter>());

        return services;
    }
}