feat(auditlog): SiteAuditReconciliationActor central singleton (#23 M6)

2026-05-20 18:10:42 -04:00
parent 640fd07454
commit c763bd9a04
5 changed files with 901 additions and 0 deletions
@@ -0,0 +1,324 @@
+using Akka.Actor;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ScadaLink.Commons.Entities.Audit;
+using ScadaLink.Commons.Interfaces.Repositories;
+
+namespace ScadaLink.AuditLog.Central;
+
+/// <summary>
+/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation
+/// pull loop. On a configurable timer (default 5 minutes) the actor walks every
+/// known site, asks the site for any <see cref="AuditEvent"/> rows with
+/// <see cref="AuditEvent.OccurredAtUtc"/> &gt;= the site's last reconciled
+/// cursor, ingests them idempotently into the central
+/// <see cref="IAuditLogRepository"/>, and advances the cursor.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Self-healing telemetry, not a dispatcher.</b> The push path
+/// (<see cref="ScadaLink.AuditLog.Site.Telemetry.SiteAuditTelemetryActor"/> +
+/// <c>IngestAuditEvents</c>) is the primary mechanism. This actor exists so a
+/// missed push (gRPC blip, central restart, site offline) is eventually
+/// repaired by central re-pulling whatever the site still has in
+/// <c>Pending</c>/<c>Forwarded</c> state. Idempotency on
+/// <see cref="AuditEvent.EventId"/> (M2 Bundle A's race-fix) makes duplicate
+/// arrivals from both paths a silent no-op.
+/// </para>
+/// <para>
+/// <b>Cursor lifetime.</b> The per-site <c>LastReconciledAt</c> watermark is
+/// kept in-memory for the actor's lifetime. The cluster singleton normally
+/// lives for as long as its host process; on a deliberate failover OR a
+/// singleton restart the cursors reset to <see cref="DateTime.MinValue"/>.
+/// That is conservative but correct — the next tick simply asks for
+/// everything the site still has, and idempotent ingest swallows the dupes.
+/// Persisting cursors to MS SQL was considered and rejected for M6: the cost
+/// of a write per tick outweighs the rare benefit of avoiding one over-broad
+/// pull after a restart.
+/// </para>
+/// <para>
+/// <b>Stalled detection.</b> The brief calls a site "stalled" when two
+/// consecutive pull cycles BOTH return non-empty AND <c>MoreAvailable=true</c>
+/// — i.e. the backlog isn't draining. The actor publishes
+/// <see cref="SiteAuditTelemetryStalledChanged"/> on the actor system's
+/// EventStream so a future <c>ICentralHealthCollector</c> bridge (M6 Bundle E)
+/// can flip the health metric without coupling this actor to the health
+/// collection surface today.
+/// </para>
+/// <para>
+/// <b>Failure isolation.</b> A single site that throws (DNS, transport,
+/// repository write) must NOT prevent other sites from being polled on the
+/// same tick. The per-site work runs inside its own try/catch, and site
+/// enumeration and repository resolution are guarded the same way, so a
+/// failed tick is logged and abandoned rather than thrown out of the receive
+/// handler. Anything that still escapes the handler is decided by whoever
+/// supervises this actor (its parent) — NOT by the
+/// <see cref="SupervisorStrategy"/> override below, which only governs
+/// children of this actor. If the parent restarts the actor the in-memory
+/// cursors reset, but as noted above that's a safe (over-pull, idempotent)
+/// recovery.
+/// </para>
+/// <para>
+/// <b>Async and the actor context.</b> The tick handler awaits WITHOUT
+/// <c>ConfigureAwait(false)</c>. Akka.NET resumes async receive handlers on
+/// the actor's own task scheduler only when the await captures it; opting out
+/// would run continuations without an active actor context, where
+/// <c>Context</c>/<c>Self</c> are unavailable. As a second line of defence
+/// the EventStream is captured once in the constructor rather than read
+/// through <c>Context</c> after an await.
+/// </para>
+/// <para>
+/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
+/// service registered by <c>AddConfigurationDatabase</c>. The singleton actor
+/// opens one DI scope per tick and reuses the same repository across all
+/// sites in that tick — one DbContext per tick mirrors the
+/// <c>AuditLogIngestActor</c> + <c>NotificationOutboxActor</c> pattern.
+/// </para>
+/// </remarks>
+public class SiteAuditReconciliationActor : ReceiveActor
+{
+    private readonly ISiteEnumerator _sites;
+    private readonly IPullAuditEventsClient _client;
+    private readonly IServiceProvider _services;
+    private readonly SiteAuditReconciliationOptions _options;
+    private readonly ILogger<SiteAuditReconciliationActor> _logger;
+
+    /// <summary>
+    /// The actor system's EventStream, captured while the actor context is
+    /// guaranteed to be active (construction) so that stalled-state
+    /// transitions can be published from inside the async tick handler
+    /// without touching <c>Context</c>.
+    /// </summary>
+    private readonly Akka.Event.EventStream _eventStream;
+
+    /// <summary>
+    /// Per-site reconciliation watermark — the highest
+    /// <see cref="AuditEvent.OccurredAtUtc"/> seen for that site on a previous
+    /// tick. Asking for <c>OccurredAtUtc &gt;= cursor</c> rather than &gt;
+    /// is the site contract (<see cref="ScadaLink.Commons.Interfaces.Services.ISiteAuditQueue.ReadPendingSinceAsync"/>);
+    /// duplicate-with-same-timestamp rows are filtered out by the idempotent
+    /// repository write.
+    /// </summary>
+    private readonly Dictionary<string, DateTime> _cursors = new();
+
+    /// <summary>
+    /// Per-site count of consecutive non-draining cycles. Resets to zero on the
+    /// first draining (or empty) cycle.
+    /// </summary>
+    private readonly Dictionary<string, int> _nonDrainingCycles = new();
+
+    /// <summary>
+    /// Per-site latched stalled state — used so the actor only publishes a
+    /// <see cref="SiteAuditTelemetryStalledChanged"/> transition when the
+    /// stalled flag actually changes, not on every tick while stalled.
+    /// </summary>
+    private readonly Dictionary<string, bool> _stalled = new();
+
+    /// <summary>Handle for the repeating self-tick; cancelled in <see cref="PostStop"/>.</summary>
+    private ICancelable? _timer;
+
+    /// <summary>
+    /// Wires the actor's collaborators and registers the async handler for
+    /// <see cref="ReconciliationTick"/>.
+    /// </summary>
+    /// <param name="sites">Source of the sites to poll on each tick.</param>
+    /// <param name="client">Client used to issue the per-site pull.</param>
+    /// <param name="services">Root provider from which a DI scope is opened per tick.</param>
+    /// <param name="options">Interval, batch size and stalled threshold.</param>
+    /// <param name="logger">Logger for per-tick and per-site failures.</param>
+    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
+    public SiteAuditReconciliationActor(
+        ISiteEnumerator sites,
+        IPullAuditEventsClient client,
+        IServiceProvider services,
+        IOptions<SiteAuditReconciliationOptions> options,
+        ILogger<SiteAuditReconciliationActor> logger)
+    {
+        ArgumentNullException.ThrowIfNull(sites);
+        ArgumentNullException.ThrowIfNull(client);
+        ArgumentNullException.ThrowIfNull(services);
+        ArgumentNullException.ThrowIfNull(options);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _sites = sites;
+        _client = client;
+        _services = services;
+        _options = options.Value;
+        _logger = logger;
+
+        // Context is valid here; it may not be once an await has completed
+        // on a foreign thread, so grab the stream up front.
+        _eventStream = Context.System.EventStream;
+
+        ReceiveAsync<ReconciliationTick>(_ => OnTickAsync());
+    }
+
+    /// <summary>
+    /// Starts the repeating self-tick. The first tick fires one full
+    /// interval after start, not immediately.
+    /// </summary>
+    protected override void PreStart()
+    {
+        base.PreStart();
+        var interval = _options.ReconciliationInterval;
+        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
+            initialDelay: interval,
+            interval: interval,
+            receiver: Self,
+            message: ReconciliationTick.Instance,
+            sender: Self);
+    }
+
+    /// <summary>Cancels the repeating self-tick so no further ticks are scheduled.</summary>
+    protected override void PostStop()
+    {
+        _timer?.Cancel();
+        base.PostStop();
+    }
+
+    /// <summary>
+    /// Runs one reconciliation pass: enumerates the sites, opens a single DI
+    /// scope for the tick, and pulls each site in turn. Enumeration failure,
+    /// repository-resolution failure and per-site failure are each logged and
+    /// contained; none of them is rethrown.
+    /// </summary>
+    private async Task OnTickAsync()
+    {
+        IReadOnlyList<SiteEntry> sites;
+        try
+        {
+            // Deliberately no ConfigureAwait(false): the continuation must
+            // resume on the actor's scheduler so the rest of the tick runs
+            // with the actor context intact.
+            sites = await _sites.EnumerateAsync();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Site enumeration failed; skipping reconciliation tick.");
+            return;
+        }
+
+        if (sites.Count == 0)
+        {
+            return;
+        }
+
+        IServiceScope? scope = null;
+        IAuditLogRepository repository;
+        try
+        {
+            scope = _services.CreateScope();
+            repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Failed to resolve IAuditLogRepository for reconciliation tick.");
+            // The scope may have been created before resolution failed.
+            scope?.Dispose();
+            return;
+        }
+
+        try
+        {
+            foreach (var site in sites)
+            {
+                try
+                {
+                    await PullSiteAsync(site, repository);
+                }
+                catch (Exception ex)
+                {
+                    // Catch-all per the failure-isolation invariant: one site's
+                    // fault must not sink the rest of the tick. If the pull
+                    // itself failed, the cursor for the failing site is left at
+                    // its previous value so the next tick retries the same
+                    // window.
+                    _logger.LogWarning(
+                        ex,
+                        "Reconciliation pull failed for site {SiteId}; other sites continue.",
+                        site.SiteId);
+                }
+            }
+        }
+        finally
+        {
+            scope.Dispose();
+        }
+    }
+
+    /// <summary>
+    /// Issues one <c>PullAuditEvents</c> RPC against the site, ingests the
+    /// returned rows idempotently into the central repository, and advances
+    /// the cursor based on the maximum <see cref="AuditEvent.OccurredAtUtc"/>
+    /// observed. The brief's "saturate until backlog clears" intent is met by
+    /// the natural cadence — each tick issues one pull, and a backed-up site
+    /// drains across consecutive ticks. The stalled signal (two non-draining
+    /// ticks in a row) surfaces when that drain isn't keeping up.
+    /// </summary>
+    /// <param name="site">The site to pull from.</param>
+    /// <param name="repository">Tick-scoped repository the rows are written to.</param>
+    private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository)
+    {
+        var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
+
+        // Plain awaits (no ConfigureAwait(false)) keep the continuation on
+        // the actor's scheduler; see the class remarks.
+        var response = await _client.PullAsync(
+            site.SiteId, since, _options.BatchSize, CancellationToken.None);
+
+        var maxOccurred = since;
+        // One ingest timestamp for the whole batch.
+        var nowUtc = DateTime.UtcNow;
+        foreach (var evt in response.Events)
+        {
+            try
+            {
+                // Idempotent repository write: duplicate EventIds (from a
+                // concurrent push, or a retry of this very pull) collapse to
+                // a no-op courtesy of M2 Bundle A's race-fix on
+                // InsertIfNotExistsAsync.
+                var ingested = evt with { IngestedAtUtc = nowUtc };
+                await repository.InsertIfNotExistsAsync(ingested);
+            }
+            catch (Exception ex)
+            {
+                // Per-row catch so one bad event does not abandon the rest of
+                // the batch. The cursor still advances based on OccurredAtUtc
+                // even for a row that failed to persist: once a later row has
+                // moved the watermark beyond it, subsequent pulls no longer
+                // ask for it. That is deliberate — a row that permanently
+                // fails to persist is an operational concern surfaced by the
+                // log, not a hot-loop trigger.
+                _logger.LogError(
+                    ex,
+                    "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId}.",
+                    evt.EventId,
+                    site.SiteId);
+            }
+
+            if (evt.OccurredAtUtc > maxOccurred)
+            {
+                maxOccurred = evt.OccurredAtUtc;
+            }
+        }
+
+        _cursors[site.SiteId] = maxOccurred;
+
+        var nonDraining = response.MoreAvailable && response.Events.Count > 0;
+        UpdateStalledState(site.SiteId, draining: !nonDraining);
+    }
+
+    /// <summary>
+    /// Flips the per-site stalled flag based on whether this tick drained the
+    /// queue. A "draining" cycle is one where the server reported no more rows
+    /// available OR returned zero events. A "non-draining" cycle is the
+    /// inverse (events returned AND <c>MoreAvailable=true</c>).
+    /// </summary>
+    /// <remarks>
+    /// The state machine: counter increments on each consecutive non-draining
+    /// tick. On reaching <see cref="SiteAuditReconciliationOptions.StalledAfterNonDrainingCycles"/>
+    /// the actor latches <c>Stalled=true</c> and publishes the transition; on
+    /// any subsequent draining tick the counter resets to zero AND, if the
+    /// latch is currently true, the actor publishes <c>Stalled=false</c>. Only
+    /// transitions are published — repeated ticks in the same state are
+    /// silent so a downstream subscriber doesn't see a flood of redundant
+    /// notifications. Publishing goes through the EventStream captured at
+    /// construction, so this method does not depend on an active actor
+    /// context.
+    /// </remarks>
+    /// <param name="siteId">Site whose state is being updated.</param>
+    /// <param name="draining">Whether the cycle just completed counts as draining.</param>
+    private void UpdateStalledState(string siteId, bool draining)
+    {
+        var wasStalled = _stalled.TryGetValue(siteId, out var prior) && prior;
+
+        if (draining)
+        {
+            _nonDrainingCycles[siteId] = 0;
+            if (wasStalled)
+            {
+                _stalled[siteId] = false;
+                _eventStream.Publish(
+                    new SiteAuditTelemetryStalledChanged(siteId, Stalled: false));
+            }
+            return;
+        }
+
+        var consecutive = _nonDrainingCycles.GetValueOrDefault(siteId) + 1;
+        _nonDrainingCycles[siteId] = consecutive;
+
+        if (consecutive >= _options.StalledAfterNonDrainingCycles && !wasStalled)
+        {
+            _stalled[siteId] = true;
+            _eventStream.Publish(
+                new SiteAuditTelemetryStalledChanged(siteId, Stalled: true));
+        }
+    }
+
+    /// <summary>
+    /// Strategy this actor applies to its own CHILDREN. It does not affect
+    /// how failures of this actor itself are handled — that is decided by
+    /// the actor's parent — and no child actors are created in this class,
+    /// so the override currently has no observable effect.
+    /// </summary>
+    /// <remarks>
+    /// The decider is <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/>,
+    /// not a Resume decider. NOTE(review): with <c>maxNrOfRetries: 0</c> a
+    /// Restart decision has no retries available, so a failing child would
+    /// presumably be stopped rather than restarted — confirm against the
+    /// Akka.NET <c>OneForOneStrategy</c> documentation before adding children
+    /// that rely on this strategy.
+    /// </remarks>
+    protected override SupervisorStrategy SupervisorStrategy()
+    {
+        return new OneForOneStrategy(
+            maxNrOfRetries: 0,
+            withinTimeRange: TimeSpan.Zero,
+            decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
+    }
+
+    /// <summary>Self-tick triggering a reconciliation pass across all sites.</summary>
+    internal sealed class ReconciliationTick
+    {
+        /// <summary>The single shared tick message instance.</summary>
+        public static readonly ReconciliationTick Instance = new();
+        private ReconciliationTick() { }
+    }
+}
+
+/// <summary>
+/// Published on the actor system EventStream when a site's reconciliation
+/// puller transitions into or out of the "stalled" state (backlog not
+/// draining across multiple cycles). Only transitions are published; a tick
+/// that leaves the state unchanged emits nothing. The M6 Bundle E central
+/// health collector will subscribe to this and surface
+/// <c>SiteAuditTelemetryStalled</c> on the health-report payload.
+/// </summary>
+/// <param name="SiteId">Identifier of the site whose stalled state changed.</param>
+/// <param name="Stalled">
+/// <c>true</c> when the site has just been latched as stalled;
+/// <c>false</c> when a draining cycle has just cleared that latch.
+/// </param>
+public sealed record SiteAuditTelemetryStalledChanged(string SiteId, bool Stalled);