refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,81 @@
+using System.Collections.Concurrent;
+using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Audit Log (#23) M6 Bundle E (T8, T9) — central singleton implementation of
+/// <see cref="IAuditCentralHealthSnapshot"/>. Owns thread-safe
+/// <see cref="System.Threading.Interlocked"/> counters for
+/// <c>CentralAuditWriteFailures</c> + <c>AuditRedactionFailure</c> and a
+/// per-site latched stalled-state map fed by the
+/// <see cref="SiteAuditTelemetryStalledTracker"/>. Also implements the
+/// writer surfaces (<see cref="ICentralAuditWriteFailureCounter"/> +
+/// <see cref="IAuditRedactionFailureCounter"/>) so a single concrete object
+/// is the source of truth — DI binds those two interfaces to this same
+/// singleton instance on the central composition root.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Why one type for read + write.</b> The writer interfaces are tiny
+/// (<c>Increment()</c>) and the read surface needs visibility of those
+/// counters anyway — having a single class own both means the
+/// <c>Interlocked</c> field IS the snapshot value, no extra plumbing needed.
+/// Mirrors the
+/// <see cref="ZB.MOM.WW.ScadaBridge.HealthMonitoring.SiteHealthCollector"/> pattern where
+/// the collector both receives and exposes the metric.
+/// </para>
+/// <para>
+/// <b>Stalled-state plumbing.</b> The per-site stalled latch lives directly
+/// on this snapshot. <see cref="SiteAuditTelemetryStalledTracker"/> is the
+/// EventStream subscriber that pushes
+/// <see cref="SiteAuditTelemetryStalledChanged"/> publications in via
+/// <see cref="ApplyStalled"/>. Keeping the dictionary on this type (rather
+/// than reading the tracker on every access) lets the snapshot be constructed
+/// without an <see cref="Akka.Actor.ActorSystem"/> dependency — the tracker
+/// is wired up later from the Akka bootstrap, once the system is built.
+/// </para>
+/// </remarks>
+public sealed class AuditCentralHealthSnapshot :
+    IAuditCentralHealthSnapshot,
+    ICentralAuditWriteFailureCounter,
+    IAuditRedactionFailureCounter
+{
+    // Latest stalled flag per site id; written by ApplyStalled, copied out on read.
+    private readonly ConcurrentDictionary<string, bool> _stalled = new();
+
+    // Failure tallies. Every read and write goes through Interlocked; the only
+    // mutation in this class is Increment.
+    private int _centralAuditWriteFailures;
+    private int _auditRedactionFailure;
+
+    /// <inheritdoc/>
+    public int CentralAuditWriteFailures
+    {
+        // CompareExchange(0, 0) swaps 0 for 0, i.e. it never changes the
+        // value and simply returns the current one atomically.
+        get { return Interlocked.CompareExchange(ref _centralAuditWriteFailures, 0, 0); }
+    }
+
+    /// <inheritdoc/>
+    public int AuditRedactionFailure
+    {
+        get { return Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0); }
+    }
+
+    /// <inheritdoc/>
+    public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled
+    {
+        get
+        {
+            // Hand back a detached copy so callers never observe later
+            // mutations of the live map.
+            var copy = new Dictionary<string, bool>(_stalled);
+            return copy;
+        }
+    }
+
+    /// <summary>
+    /// Records the stalled flag carried by a
+    /// <see cref="SiteAuditTelemetryStalledChanged"/> publication, overwriting
+    /// any previous value stored for the same site. Intended to be called by
+    /// <see cref="SiteAuditTelemetryStalledTracker"/>; consumers read the
+    /// result through <see cref="SiteAuditTelemetryStalled"/>.
+    /// </summary>
+    /// <param name="evt">
+    /// The publication holding the site id and its new stalled state. A
+    /// <c>null</c> value is ignored.
+    /// </param>
+    public void ApplyStalled(SiteAuditTelemetryStalledChanged evt)
+    {
+        if (evt is not null)
+        {
+            _stalled[evt.SiteId] = evt.Stalled;
+        }
+    }
+
+    /// <inheritdoc/>
+    void ICentralAuditWriteFailureCounter.Increment()
+    {
+        Interlocked.Increment(ref _centralAuditWriteFailures);
+    }
+
+    /// <inheritdoc/>
+    void IAuditRedactionFailureCounter.Increment()
+    {
+        Interlocked.Increment(ref _auditRedactionFailure);
+    }
+}
@@ -0,0 +1,306 @@
+using Akka.Actor;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
+using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Central-side singleton (per Bundle E wiring) that ingests batches of
+/// <see cref="AuditEvent"/> rows pushed from sites via the
+/// <c>IngestAuditEvents</c> gRPC RPC. Each row is stamped with the central-side
+/// <see cref="AuditEvent.IngestedAtUtc"/> and inserted idempotently via
+/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
+/// silently swallowed (first-write-wins per Bundle A's hardening).
+/// </summary>
+/// <remarks>
+/// <para>
+/// Idempotency is the contract: a row that already exists at central counts
+/// as "accepted" for the purposes of the reply, because the storage state is
+/// consistent and the site is free to flip its local row to <c>Forwarded</c>.
+/// </para>
+/// <para>
+/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing
+/// action. The actor wraps each repository call in its own try/catch so a
+/// single bad row cannot cause the rest of the batch to be lost — that
+/// per-row catch is what keeps this actor alive across handler throws, not
+/// the supervisor strategy. The <see cref="SupervisorStrategy"/> override
+/// returns the Akka default decider (Restart for most exceptions) and
+/// governs children only; this actor has no children today, so the override
+/// is a forward-compat placeholder.
+/// </para>
+/// <para>
+/// Two constructors exist for a deliberate reason: Bundle D's tests inject a
+/// concrete <see cref="IAuditLogRepository"/> against a per-test MSSQL fixture
+/// (the only way to verify the IngestedAtUtc stamp + duplicate-key idempotency
+/// end to end), while Bundle E's host wiring registers the actor as a cluster
+/// singleton and must therefore resolve the repository — which is a scoped EF
+/// Core service — from a fresh DI scope per message. Mirroring the Notification
+/// Outbox actor's pattern.
+/// </para>
+/// </remarks>
+public class AuditLogIngestActor : ReceiveActor
+{
+    // Each constructor sets exactly one of the first two fields; the other
+    // stays null and selects the code path in the handlers.
+    private readonly IServiceProvider? _serviceProvider;
+    private readonly IAuditLogRepository? _injectedRepository;
+    private readonly ILogger<AuditLogIngestActor> _logger;
+
+    /// <summary>
+    /// Constructor for test fixtures: the supplied repository is kept for the
+    /// lifetime of the actor and reused for every message, so no DI scope is
+    /// ever opened.
+    /// </summary>
+    /// <param name="repository">Repository instance used for all ingested batches.</param>
+    /// <param name="logger">Logger for ingest diagnostics.</param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="repository"/> or <paramref name="logger"/> is <c>null</c>.
+    /// </exception>
+    public AuditLogIngestActor(
+        IAuditLogRepository repository,
+        ILogger<AuditLogIngestActor> logger)
+    {
+        _injectedRepository = repository ?? throw new ArgumentNullException(nameof(repository));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+
+        ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
+
+        // With only a single repository there is neither a SiteCalls
+        // repository nor a DbContext, so the dual-write cannot run here. A
+        // handler is still installed so the command gets an (empty) reply
+        // instead of going unhandled.
+        ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryWithoutDualWriteAsync);
+    }
+
+    /// <summary>
+    /// Constructor for host wiring: keeps the service provider so that each
+    /// incoming message can open its own DI scope and resolve the (scoped)
+    /// repositories from it. The actor outlives any single message, so it
+    /// never stores a scope in a field.
+    /// </summary>
+    /// <param name="serviceProvider">Provider from which a new scope is created per message.</param>
+    /// <param name="logger">Logger for ingest diagnostics.</param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="serviceProvider"/> or <paramref name="logger"/> is <c>null</c>.
+    /// </exception>
+    public AuditLogIngestActor(
+        IServiceProvider serviceProvider,
+        ILogger<AuditLogIngestActor> logger)
+    {
+        _serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+
+        // Both commands get their full handlers on this path.
+        ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
+        ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryAsync);
+    }
+
+    /// <inheritdoc />
+    protected override SupervisorStrategy SupervisorStrategy() =>
+        // One-for-one strategy that delegates the directive choice to Akka's
+        // stock decider. It applies to children of this actor only.
+        // NOTE(review): maxNrOfRetries is 0 here — in Akka that normally means
+        // a child is not restarted even when the decider answers Restart.
+        // Confirm this is intended before this actor gains children.
+        new OneForOneStrategy(
+            maxNrOfRetries: 0,
+            withinTimeRange: TimeSpan.Zero,
+            decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
+
+    /// <summary>
+    /// Handles one <see cref="IngestAuditEventsCommand"/>: persists every
+    /// event in the batch through <see cref="IngestWithRepositoryAsync"/> and
+    /// replies to the original sender with the ids that were accepted.
+    /// </summary>
+    /// <remarks>
+    /// A reply is always sent. If opening or disposing the DI scope, or
+    /// resolving the repository, throws, the failure is logged and the reply
+    /// carries whatever was accepted up to that point (usually nothing). This
+    /// matches <see cref="OnCachedTelemetryAsync"/>, so a wiring fault neither
+    /// faults the handler nor leaves the caller waiting for a reply.
+    /// </remarks>
+    /// <param name="cmd">The batch of audit events to ingest.</param>
+    private async Task OnIngestAsync(IngestAuditEventsCommand cmd)
+    {
+        // Sender is captured before the first await — Akka resets Sender
+        // between message dispatches, so a post-await Tell would go to
+        // DeadLetters.
+        var replyTo = Sender;
+        var nowUtc = DateTime.UtcNow;
+        var accepted = new List<Guid>(cmd.Events.Count);
+
+        try
+        {
+            if (_injectedRepository is not null)
+            {
+                // Injected-repository mode (test ctor): no scope, no filter
+                // and no health counter to resolve. A null filter is replaced
+                // by the safe default inside IngestWithRepositoryAsync.
+                await IngestWithRepositoryAsync(_injectedRepository, filter: null, failureCounter: null, cmd, nowUtc, accepted)
+                    .ConfigureAwait(false);
+            }
+            else
+            {
+                // One scope (and therefore one DbContext) for the whole batch.
+                // AuditLog-003: CreateAsyncScope + await using so scoped
+                // IAsyncDisposable services are disposed asynchronously.
+                await using var scope = _serviceProvider!.CreateAsyncScope();
+                var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
+                // Bundle C (M5-T6): optional payload filter; may be
+                // unregistered on test composition roots.
+                var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
+                // M6 Bundle E (T8): central health counter is best-effort —
+                // when unregistered the per-row catch simply logs.
+                var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
+                await IngestWithRepositoryAsync(repository, filter, failureCounter, cmd, nowUtc, accepted)
+                    .ConfigureAwait(false);
+            }
+        }
+        catch (Exception ex)
+        {
+            // Per-row failures are already handled inside
+            // IngestWithRepositoryAsync, so reaching this point means the
+            // scope itself failed (creation, resolution or disposal). Log and
+            // fall through to the reply so the caller is not left waiting.
+            _logger.LogError(
+                ex,
+                "Audit event batch ingest failed outside per-row processing; replying with {AcceptedCount} accepted event(s).",
+                accepted.Count);
+        }
+
+        replyTo.Tell(new IngestAuditEventsReply(accepted));
+    }
+
+    /// <summary>
+    /// Persists each event of <paramref name="cmd"/> through
+    /// <paramref name="repository"/>, one at a time, and appends the id of
+    /// every event whose insert call completed to <paramref name="accepted"/>.
+    /// </summary>
+    /// <param name="repository">Repository that receives the insert calls.</param>
+    /// <param name="filter">Payload filter; when <c>null</c> the safe default filter is used.</param>
+    /// <param name="failureCounter">Optional counter bumped once per failed event.</param>
+    /// <param name="cmd">The batch being ingested.</param>
+    /// <param name="nowUtc">Timestamp written to <c>IngestedAtUtc</c> on every event of the batch.</param>
+    /// <param name="accepted">Output list of accepted event ids; mutated in place.</param>
+    private async Task IngestWithRepositoryAsync(
+        IAuditLogRepository repository,
+        IAuditPayloadFilter? filter,
+        ICentralAuditWriteFailureCounter? failureCounter,
+        IngestAuditEventsCommand cmd,
+        DateTime nowUtc,
+        List<Guid> accepted)
+    {
+        foreach (var auditEvent in cmd.Events)
+        {
+            try
+            {
+                // AuditLog-008: never run unfiltered — a missing filter is
+                // replaced by the safe default so redaction still happens.
+                var effectiveFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
+
+                // Filter first, then stamp, so the stored (filtered) copy is
+                // the one carrying the central-side ingest timestamp.
+                var stamped = effectiveFilter.Apply(auditEvent) with { IngestedAtUtc = nowUtc };
+
+                // The call is named insert-if-not-exists, so an id that
+                // arrives twice is still reported back as accepted.
+                await repository.InsertIfNotExistsAsync(stamped).ConfigureAwait(false);
+                accepted.Add(auditEvent.EventId);
+            }
+            catch (Exception failure)
+            {
+                // Isolation per event: a failure here skips only this event;
+                // its id is left out of `accepted` and the loop carries on.
+                // M6 Bundle E (T8): bump the health counter, but never let
+                // the counter itself break the loop.
+                try { failureCounter?.Increment(); }
+                catch { /* counter must never throw — defence in depth */ }
+
+                _logger.LogError(failure,
+                    "Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.",
+                    auditEvent.EventId);
+            }
+        }
+    }
+
+    /// <summary>
+    /// M3 dual-write handler. For every <see cref="CachedTelemetryEntry"/> the
+    /// actor opens a fresh database transaction, inserts the AuditLog row via
+    /// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> and upserts the
+    /// SiteCalls row, then commits. If either write throws, the transaction is
+    /// disposed without a commit and the entry is not reported as accepted.
+    /// Both rows receive the same <c>IngestedAtUtc</c> value.
+    /// </summary>
+    /// <remarks>
+    /// Per-entry isolation — one entry's failed transaction does NOT abort
+    /// other entries in the batch (each gets its own
+    /// <see cref="Microsoft.EntityFrameworkCore.RelationalDatabaseFacadeExtensions.BeginTransactionAsync"/>
+    /// scope and a try/catch around it). After a failed entry the shared
+    /// DbContext's change tracker is cleared so state left behind by that
+    /// entry cannot leak into the next one. A reply is always sent, even when
+    /// the DI scope could not be set up.
+    /// </remarks>
+    /// <param name="cmd">The batch of combined audit + site-call entries.</param>
+    private async Task OnCachedTelemetryAsync(IngestCachedTelemetryCommand cmd)
+    {
+        // Capture Sender before the first await; the reply is sent after
+        // asynchronous work has completed.
+        var replyTo = Sender;
+        var accepted = new List<Guid>(cmd.Entries.Count);
+
+        try
+        {
+            await using var scope = _serviceProvider!.CreateAsyncScope();
+            var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
+            var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
+            var dbContext = scope.ServiceProvider.GetRequiredService<ScadaBridgeDbContext>();
+            // Bundle C (M5-T6): resolve the filter once for the whole batch.
+            // It may be unregistered (test composition roots); in that case
+            // the per-entry code below substitutes the safe default filter
+            // (AuditLog-008) rather than passing the payload through.
+            var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
+            // M6 Bundle E (T8): same best-effort central health counter as
+            // the OnIngestAsync path — null when not registered.
+            var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
+
+            foreach (var entry in cmd.Entries)
+            {
+                try
+                {
+                    await using var tx = await dbContext.Database
+                        .BeginTransactionAsync()
+                        .ConfigureAwait(false);
+
+                    // One instant for both rows so a join on the two tables
+                    // sees matching timestamps.
+                    var ingestedAt = DateTime.UtcNow;
+                    // Only the audit half is filtered; the SiteCall row is
+                    // stamped and written as received.
+                    var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
+                    var filteredAudit = safeFilter.Apply(entry.Audit);
+                    var auditStamped = filteredAudit with { IngestedAtUtc = ingestedAt };
+                    var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };
+
+                    await auditRepo.InsertIfNotExistsAsync(auditStamped)
+                        .ConfigureAwait(false);
+                    await siteCallRepo.UpsertAsync(siteCallStamped)
+                        .ConfigureAwait(false);
+
+                    await tx.CommitAsync().ConfigureAwait(false);
+                    accepted.Add(entry.Audit.EventId);
+                }
+                catch (Exception ex)
+                {
+                    // The transaction was declared inside the try block, so it
+                    // has already been disposed without a commit by the time
+                    // this runs. The EventId is NOT added to `accepted`; other
+                    // entries continue with their own transactions.
+                    // M6 Bundle E (T8): bump the central health counter so a
+                    // sustained dual-write failure surfaces on the dashboard.
+                    try { failureCounter?.Increment(); }
+                    catch { /* counter must never throw — defence in depth */ }
+
+                    // The DbContext is shared by every entry of the batch. If
+                    // a repository tracks entities (not visible from here —
+                    // TODO confirm), a failed save would leave them tracked
+                    // and the next entry's save would retry them. Clearing
+                    // the tracker keeps the entries independent and is
+                    // harmless when nothing is tracked.
+                    try { dbContext.ChangeTracker.Clear(); }
+                    catch { /* best-effort cleanup — must not mask the original failure */ }
+
+                    _logger.LogError(
+                        ex,
+                        "Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.",
+                        entry.Audit.EventId,
+                        entry.SiteCall.TrackedOperationId);
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            // Anything outside the per-entry try lands here: creating or
+            // disposing the scope, or resolving a required service. Log and
+            // reply with whatever was accepted so far (likely empty) so the
+            // handler does not fault.
+            _logger.LogError(
+                ex,
+                "Combined telemetry batch ingest failed before per-entry processing.");
+        }
+
+        replyTo.Tell(new IngestCachedTelemetryReply(accepted));
+    }
+
+    /// <summary>
+    /// Handler registered by the single-repository constructor, which has
+    /// neither a DbContext nor an <see cref="ISiteCallAuditRepository"/> and
+    /// therefore cannot perform the dual-write. It writes a warning with the
+    /// number of entries received and answers the sender with an empty
+    /// acknowledgement; nothing is persisted.
+    /// </summary>
+    /// <param name="cmd">The command that cannot be serviced on this constructor path.</param>
+    /// <returns>An already-completed task; the handler does no asynchronous work.</returns>
+    private Task OnCachedTelemetryWithoutDualWriteAsync(IngestCachedTelemetryCommand cmd)
+    {
+        var entryCount = cmd.Entries.Count;
+        _logger.LogWarning(
+            "AuditLogIngestActor received IngestCachedTelemetryCommand on the single-repository ctor; dual-write requires the IServiceProvider ctor. Replying with empty ack ({Count} entries).",
+            entryCount);
+
+        // No ids are acknowledged, so the caller sees every entry as not accepted.
+        var emptyAck = new IngestCachedTelemetryReply(Array.Empty<Guid>());
+        Sender.Tell(emptyAck);
+
+        return Task.CompletedTask;
+    }
+}
@@ -0,0 +1,37 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Settings for the central
+/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service (M6-T5).
+/// Out of the box the service ticks every 24 hours and keeps at least one
+/// future monthly boundary ahead of <see cref="DateTime.UtcNow"/>.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Each tick of the hosted service rolls <c>pf_AuditLog_Month</c> forward:
+/// it reads the current max boundary and SPLITs new monthly boundaries
+/// until at least <see cref="LookaheadMonths"/> future months are covered.
+/// The default of one month is deliberately conservative — a smaller value
+/// risks an end-of-month race in which inserts land in the unbounded tail
+/// partition, while a larger value costs nothing but commits to
+/// boundaries earlier than necessary.
+/// </para>
+/// <para>
+/// A 24-hour cadence is the cheapest interval that still guarantees at
+/// most one missed boundary in steady state; even after a hard failover
+/// the hosted service recovers on its very next tick. Going below an hour
+/// would generate more metadata churn than it saves.
+/// </para>
+/// </remarks>
+public sealed class AuditLogPartitionMaintenanceOptions
+{
+    // Named defaults: 24 hours expressed in seconds, and one month of lookahead.
+    private const int DefaultIntervalSeconds = 86_400;
+    private const int DefaultLookaheadMonths = 1;
+
+    /// <summary>
+    /// Time between two maintenance ticks, in seconds. Defaults to 86 400,
+    /// i.e. 24 hours.
+    /// </summary>
+    public int IntervalSeconds { get; set; } = DefaultIntervalSeconds;
+
+    /// <summary>
+    /// Smallest number of future months that <c>pf_AuditLog_Month</c> has to
+    /// cover once a tick has run. Defaults to 1 — for example, in mid-May
+    /// the partition for the next full month (June) must already exist.
+    /// </summary>
+    public int LookaheadMonths { get; set; } = DefaultLookaheadMonths;
+}
@@ -0,0 +1,151 @@
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
+/// <c>pf_AuditLog_Month</c> forward once a day. Each tick opens a fresh DI
+/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
+/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> to SPLIT any
+/// missing future boundaries — the partition function must always cover at
+/// least <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>
+/// future months, otherwise inserts past the highest boundary accumulate in
+/// a single unbounded tail partition that <c>SwitchOutPartitionAsync</c>
+/// cannot purge cleanly.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Why a hosted service, not an actor.</b> Bundle C's
+/// <see cref="AuditLogPurgeActor"/> sits inside the central singleton
+/// because it needs supervised lifecycle alongside the rest of the
+/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day
+/// chore with no cross-actor coordination, so we use the much simpler
+/// hosted-service pattern: <c>Task.Run</c> on start, <c>Task.Delay</c>
+/// between ticks, cancellation on stop. Reusing
+/// <see cref="IPartitionMaintenance"/> from the central node-only DI graph
+/// keeps the contract testable without any actor framework involvement.
+/// </para>
+/// <para>
+/// <b>Failure containment.</b> The tick body wraps the maintenance call in
+/// a try/catch so a transient SQL Server error never tears down the hosted
+/// service — the next tick simply retries. The exception is logged with
+/// the original stack trace at <c>Error</c> level; ops surfaces (M6 Bundle
+/// E's central health collector) can subscribe to the logger to alert on
+/// repeated failures.
+/// </para>
+/// <para>
+/// <b>Startup ordering.</b> A first tick fires immediately at
+/// <see cref="StartAsync"/> so a fresh deployment doesn't need to wait
+/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> for
+/// the partition function to come up to spec. This is also what the brief
+/// asks for ("Run once on startup").
+/// </para>
+/// <para>
+/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is scoped
+/// (alongside the rest of the EF repositories) because the implementation
+/// reuses the per-scope <c>ScadaBridgeDbContext</c>. A hosted service is a
+/// singleton, so it must open and dispose a scope around each tick — the
+/// same pattern <see cref="AuditLogPurgeActor"/> uses.
+/// </para>
+/// </remarks>
+public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
+{
+    // Dependencies supplied through the constructor.
+    private readonly IServiceScopeFactory _scopeFactory;
+    private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
+    private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;
+
+    // Background loop state; both stay null until StartAsync has run.
+    private CancellationTokenSource? _cts;
+    private Task? _loop;
+
+    /// <summary>
+    /// Creates the service. No work is started here; the loop begins in
+    /// <see cref="StartAsync"/>.
+    /// </summary>
+    /// <param name="scopeFactory">Factory used to open one DI scope per maintenance tick.</param>
+    /// <param name="options">Supplies the tick interval and the number of lookahead months.</param>
+    /// <param name="logger">Logger for tick results and failures.</param>
+    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
+    public AuditLogPartitionMaintenanceService(
+        IServiceScopeFactory scopeFactory,
+        IOptions<AuditLogPartitionMaintenanceOptions> options,
+        ILogger<AuditLogPartitionMaintenanceService> logger)
+    {
+        ArgumentNullException.ThrowIfNull(scopeFactory);
+        ArgumentNullException.ThrowIfNull(options);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _scopeFactory = scopeFactory;
+        _options = options;
+        _logger = logger;
+    }
+
+    /// <inheritdoc />
+    public Task StartAsync(CancellationToken ct)
+    {
+        // The loop's token is linked to the token handed to StartAsync, so
+        // the loop ends when either that token fires or StopAsync cancels
+        // the source; both abort a pending Task.Delay.
+        _cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+
+        // Read the token here, on the calling thread. Reading _cts.Token
+        // inside the lambda would touch the field later, on a pool thread,
+        // and could hit a source that Dispose has already disposed.
+        var loopToken = _cts.Token;
+        _loop = Task.Run(() => RunLoopAsync(loopToken));
+
+        // Startup is never blocked on the first maintenance tick.
+        return Task.CompletedTask;
+    }
+
+    /// <summary>
+    /// Background loop: runs one maintenance tick immediately, then one tick
+    /// after every configured interval until <paramref name="ct"/> is
+    /// cancelled.
+    /// </summary>
+    /// <param name="ct">Token that ends the loop and aborts a pending delay.</param>
+    private async Task RunLoopAsync(CancellationToken ct)
+    {
+        // Run once on startup so a fresh deployment isn't gated on the
+        // IntervalSeconds initial wait — the brief calls this out explicitly.
+        await SafeMaintainAsync(ct).ConfigureAwait(false);
+
+        while (!ct.IsCancellationRequested)
+        {
+            try
+            {
+                await Task.Delay(ResolveTickInterval(), ct).ConfigureAwait(false);
+            }
+            catch (OperationCanceledException)
+            {
+                break;
+            }
+
+            await SafeMaintainAsync(ct).ConfigureAwait(false);
+        }
+    }
+
+    // Used when the configured interval is zero or negative (24 hours).
+    private const int FallbackIntervalSeconds = 86_400;
+
+    // Largest interval passed to Task.Delay: int.MaxValue milliseconds,
+    // expressed in whole seconds (about 24.8 days).
+    private const int MaxIntervalSeconds = int.MaxValue / 1000;
+
+    /// <summary>
+    /// Turns the configured <c>IntervalSeconds</c> into a delay that
+    /// <see cref="Task.Delay(TimeSpan, CancellationToken)"/> accepts. A zero
+    /// interval would make the loop spin without pause, and a negative or
+    /// oversized one would make <c>Task.Delay</c> throw and end the loop, so
+    /// out-of-range values are replaced and a warning is logged.
+    /// </summary>
+    /// <returns>The delay to wait before the next tick.</returns>
+    private TimeSpan ResolveTickInterval()
+    {
+        var configured = _options.Value.IntervalSeconds;
+        var effective = configured;
+
+        if (configured <= 0)
+        {
+            effective = FallbackIntervalSeconds;
+        }
+        else if (configured > MaxIntervalSeconds)
+        {
+            effective = MaxIntervalSeconds;
+        }
+
+        if (effective != configured)
+        {
+            _logger.LogWarning(
+                "AuditLogPartitionMaintenance IntervalSeconds {Configured} is out of range; using {Effective} seconds instead.",
+                configured,
+                effective);
+        }
+
+        return TimeSpan.FromSeconds(effective);
+    }
+
+    /// <summary>
+    /// Runs a single maintenance tick inside its own DI scope: resolves
+    /// <see cref="IPartitionMaintenance"/>, asks it to ensure the configured
+    /// number of lookahead months, and logs any boundaries it reports as
+    /// added. Never throws — failures are logged so the loop can try again
+    /// on the next tick.
+    /// </summary>
+    /// <param name="ct">Token forwarded to the maintenance call.</param>
+    private async Task SafeMaintainAsync(CancellationToken ct)
+    {
+        try
+        {
+            await using var scope = _scopeFactory.CreateAsyncScope();
+            var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
+            var added = await maintenance
+                .EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
+                .ConfigureAwait(false);
+            if (added.Count > 0)
+            {
+                // Invariant culture keeps the logged dates in the same
+                // yyyy-MM-dd form regardless of the host's culture settings.
+                _logger.LogInformation(
+                    "AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
+                    added.Count,
+                    string.Join(", ", added.Select(b => b.ToString(
+                        "yyyy-MM-dd",
+                        System.Globalization.CultureInfo.InvariantCulture))));
+            }
+        }
+        catch (OperationCanceledException) when (ct.IsCancellationRequested)
+        {
+            // The tick was interrupted because the service is stopping.
+            // That is an expected outcome, not a failure, so it must not be
+            // reported at Error level.
+            _logger.LogDebug("AuditLogPartitionMaintenance tick cancelled during shutdown");
+        }
+        catch (Exception ex)
+        {
+            // Catch-all is deliberate: the hosted service must survive every
+            // class of tick failure (transient SQL, DI resolution, etc.) so
+            // the next tick gets a chance. The brief's contract is
+            // "exception logged, not propagated".
+            _logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
+        }
+    }
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// Signals the loop to stop and waits for it to finish, but only for as
+    /// long as <paramref name="ct"/> allows: once the stop token fires the
+    /// method returns even if the loop is still running, so a maintenance
+    /// call that does not react to cancellation cannot hold up shutdown.
+    /// </remarks>
+    public async Task StopAsync(CancellationToken ct)
+    {
+        // StartAsync was never called: nothing to stop.
+        if (_loop is null) return;
+
+        try
+        {
+            _cts?.Cancel();
+        }
+        finally
+        {
+            // Completes when the caller gives up waiting. The registration is
+            // disposed as soon as the wait ends so nothing stays attached to
+            // the caller's token.
+            var abandoned = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
+            using var registration = ct.Register(
+                static state => ((TaskCompletionSource)state!).TrySetCanceled(),
+                abandoned);
+
+            await Task.WhenAny(_loop, abandoned.Task).ConfigureAwait(false);
+        }
+    }
+
+    /// <inheritdoc />
+    // Releases the loop's cancellation source, if StartAsync ever created one.
+    public void Dispose() => _cts?.Dispose();
+}
@@ -0,0 +1,213 @@
+using System.Diagnostics;
+using Akka.Actor;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Central singleton (M6 Bundle C) that drives the daily AuditLog partition
+/// purge. On a configurable timer (default 24 hours) the actor:
+/// <list type="number">
+/// <item>Queries <see cref="IAuditLogRepository.GetPartitionBoundariesOlderThanAsync"/>
+///       for monthly boundaries whose latest <c>OccurredAtUtc</c> is older
+///       than <c>DateTime.UtcNow - RetentionDays</c>.</item>
+/// <item>For each eligible boundary, calls
+///       <see cref="IAuditLogRepository.SwitchOutPartitionAsync"/> which runs
+///       the drop-and-rebuild dance around <c>UX_AuditLog_EventId</c>.</item>
+/// <item>Publishes <see cref="AuditLogPurgedEvent"/> on the actor-system
+///       EventStream so the Bundle E central health collector + ops surfaces
+///       can subscribe without coupling to this actor.</item>
+/// </list>
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Daily cadence.</b> Partition switch is metadata-only but the
+/// drop-and-rebuild dance briefly removes <c>UX_AuditLog_EventId</c>; running
+/// more often than necessary trades unique-index rebuild outages for
+/// negligible freshness wins. The default 24-hour interval matches
+/// alog.md §10's retention policy.
+/// </para>
+/// <para>
+/// <b>Continue-on-error.</b> A single boundary that throws (transient SQL
+/// failure, contention with backup, missing object) must NOT prevent the
+/// other eligible boundaries from being purged on the same tick. Per-boundary
+/// work runs inside its own try/catch, which is what keeps the singleton
+/// alive across handler throws. The <see cref="SupervisorStrategy"/>
+/// override pairs Akka's default decider with <c>maxNrOfRetries: 0</c> and
+/// governs children only; this actor creates no children, so it is a
+/// placeholder. NOTE(review): confirm a zero retry budget still restarts.
+/// </para>
+/// <para>
+/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
+/// service registered by <c>AddConfigurationDatabase</c>. The singleton
+/// opens one DI scope per tick and reuses the same repository across every
+/// boundary in that tick — mirrors the
+/// <see cref="SiteAuditReconciliationActor"/> pattern.
+/// </para>
+/// <para>
+/// <b>EventStream.</b> Publishing <see cref="AuditLogPurgedEvent"/> through
+/// the EventStream rather than direct messaging avoids coupling this actor
+/// to its consumers; M6 Bundle E will subscribe a central health-counter
+/// bridge that surfaces purge progress on the central health report.
+/// </para>
+/// </remarks>
+public class AuditLogPurgeActor : ReceiveActor
+{
+    private readonly IServiceProvider _services;
+    private readonly AuditLogPurgeOptions _purgeOptions;
+    private readonly AuditLogOptions _auditOptions;
+    private readonly ILogger<AuditLogPurgeActor> _logger;
+    private ICancelable? _timer;
+
+    /// <summary>Initializes a new instance of <see cref="AuditLogPurgeActor"/> and registers the tick handler.</summary>
+    /// <param name="services">DI service provider used to create scoped repository instances per tick.</param>
+    /// <param name="purgeOptions">Options controlling the purge interval; <c>Value</c> is read once here.</param>
+    /// <param name="auditOptions">Options controlling retention policy (RetentionDays); <c>Value</c> is read once here.</param>
+    /// <param name="logger">Logger instance.</param>
+    public AuditLogPurgeActor(
+        IServiceProvider services,
+        IOptions<AuditLogPurgeOptions> purgeOptions,
+        IOptions<AuditLogOptions> auditOptions,
+        ILogger<AuditLogPurgeActor> logger)
+    {
+        ArgumentNullException.ThrowIfNull(services);
+        ArgumentNullException.ThrowIfNull(purgeOptions);
+        ArgumentNullException.ThrowIfNull(auditOptions);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _services = services;
+        _purgeOptions = purgeOptions.Value;
+        _auditOptions = auditOptions.Value;
+        _logger = logger;
+
+        ReceiveAsync<PurgeTick>(_ => OnTickAsync());
+    }
+
+    /// <summary>Schedules the repeating <see cref="PurgeTick"/> self-message; the first tick fires after one full interval.</summary>
+    protected override void PreStart()
+    {
+        base.PreStart();
+        var interval = _purgeOptions.Interval;
+        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
+            initialDelay: interval,
+            interval: interval,
+            receiver: Self,
+            message: PurgeTick.Instance,
+            sender: Self);
+    }
+
+    /// <summary>Cancels the repeating tick timer (if one was scheduled) before the base stop logic runs.</summary>
+    protected override void PostStop()
+    {
+        _timer?.Cancel();
+        base.PostStop();
+    }
+
+    /// <summary>Builds a one-for-one strategy (Akka default decider, <c>maxNrOfRetries: 0</c>) that applies to child actors only.</summary>
+    protected override SupervisorStrategy SupervisorStrategy()
+    {
+        return new OneForOneStrategy(
+            maxNrOfRetries: 0,
+            withinTimeRange: TimeSpan.Zero,
+            decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
+    }
+
+    private async Task OnTickAsync()
+    {
+        // Capture EventStream BEFORE the first await. Accessing Context (and
+        // therefore Context.System) after an await is unsafe because Akka's
+        // ActorBase.Context throws "no active ActorContext" once the
+        // continuation runs on a thread that isn't currently dispatching this
+        // actor — mirrors the same Sender-capture pattern in
+        // AuditLogIngestActor.OnIngestAsync.
+        var eventStream = Context.System.EventStream;
+
+        // Compute the retention threshold from AuditLogOptions.RetentionDays
+        // each tick. NOTE(review): _auditOptions is the IOptions<T>.Value
+        // instance captured once in the constructor, and IOptions<T> does
+        // not reload, so a changed RetentionDays presumably only takes
+        // effect after an actor restart. True hot reload would need
+        // IOptionsMonitor<AuditLogOptions>.CurrentValue read here instead.
+        var threshold = DateTime.UtcNow - TimeSpan.FromDays(_auditOptions.RetentionDays);
+
+        // AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
+        // services (IAsyncDisposable DbContexts) dispose asynchronously
+        // without blocking on sync Dispose() of pending connection cleanup.
+        await using var scope = _services.CreateAsyncScope();
+        IAuditLogRepository repository;
+        try
+        {
+            repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick.");
+            return;
+        }
+
+        IReadOnlyList<DateTime> boundaries;
+        try
+        {
+            boundaries = await repository
+                .GetPartitionBoundariesOlderThanAsync(threshold)
+                .ConfigureAwait(false);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(
+                ex,
+                "Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.",
+                threshold);
+            return;
+        }
+
+        if (boundaries.Count == 0)
+        {
+            return;
+        }
+
+        foreach (var boundary in boundaries)
+        {
+            // Per-boundary try/catch: one bad partition (transient SQL
+            // failure, missing object, contention with backup) does NOT
+            // abandon the rest of the tick.
+            var sw = Stopwatch.StartNew();
+            try
+            {
+                var rowsDeleted = await repository
+                    .SwitchOutPartitionAsync(boundary)
+                    .ConfigureAwait(false);
+                sw.Stop();
+
+                eventStream.Publish(
+                    new AuditLogPurgedEvent(boundary, rowsDeleted, sw.ElapsedMilliseconds));
+
+                _logger.LogInformation(
+                    "Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.",
+                    boundary,
+                    rowsDeleted,
+                    sw.ElapsedMilliseconds);
+            }
+            catch (Exception ex)
+            {
+                sw.Stop();
+                _logger.LogError(
+                    ex,
+                    "Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.",
+                    boundary,
+                    sw.ElapsedMilliseconds);
+            }
+        }
+    }
+
+    /// <summary>Self-tick triggering a purge pass across all eligible partitions; a singleton with no payload.</summary>
+    internal sealed class PurgeTick
+    {
+        public static readonly PurgeTick Instance = new();
+        private PurgeTick() { }
+    }
+}
@@ -0,0 +1,43 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Scheduling options for the central <see cref="AuditLogPurgeActor"/>
+/// singleton. Only the tick cadence lives here (24 hours by default, per the
+/// M6 plan); how far back data is retained comes from
+/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.RetentionDays"/>
+/// (default 365), so retention is tuned from a single section.
+/// </summary>
+/// <remarks>
+/// <para>
+/// The purge runs on a daily cadence rather than in a hot loop. The
+/// partition switch itself is metadata-only, but the drop-and-rebuild dance
+/// temporarily removes the <c>UX_AuditLog_EventId</c> unique index, so every
+/// extra run costs an index-rebuild window for little freshness benefit.
+/// Shorten the interval only when sub-daily purge granularity is provably
+/// needed.
+/// </para>
+/// <para>
+/// <see cref="IntervalOverride"/> lets tests shrink the cadence to
+/// milliseconds without widening the production config surface; production
+/// configuration is expected to set <see cref="IntervalHours"/> only.
+/// </para>
+/// </remarks>
+public sealed class AuditLogPurgeOptions
+{
+    /// <summary>Number of hours between purge ticks; 24 unless configured otherwise.</summary>
+    public int IntervalHours { get; set; } = 24;
+
+    /// <summary>
+    /// Optional explicit tick interval, intended for tests that need finer
+    /// than whole-hour resolution. When it has a value it wins over
+    /// <see cref="IntervalHours"/>; it is <c>null</c> by default.
+    /// </summary>
+    public TimeSpan? IntervalOverride { get; set; }
+
+    /// <summary>
+    /// The interval the actor actually schedules with:
+    /// <see cref="IntervalOverride"/> when one is set, otherwise
+    /// <see cref="IntervalHours"/> converted to a <see cref="TimeSpan"/>.
+    /// </summary>
+    public TimeSpan Interval
+    {
+        get
+        {
+            if (IntervalOverride is { } forced)
+            {
+                return forced;
+            }
+
+            return TimeSpan.FromHours(IntervalHours);
+        }
+    }
+}
@@ -0,0 +1,29 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Published on the actor-system EventStream by <see cref="AuditLogPurgeActor"/>
+/// once per partition boundary, after that boundary's switch-out succeeds
+/// (a boundary whose switch-out throws publishes nothing). Downstream
+/// consumers (Bundle E central health collector, ops dashboards, audit
+/// trails) subscribe without the actor knowing about any specific subscriber.
+/// </summary>
+/// <param name="MonthBoundary">
+/// The pf_AuditLog_Month lower-bound boundary that was switched out — i.e.
+/// the first instant of the purged month in UTC.
+/// </param>
+/// <param name="RowsDeleted">
+/// Approximate row count purged from the partition, sampled BEFORE the
+/// switch. Exact accounting would require a post-switch scan of the staging
+/// table, which the dance drops immediately, so this is the closest
+/// observable proxy. Zero is a valid value when the actor's enumerator
+/// included a partition the operator subsequently emptied by hand.
+/// </param>
+/// <param name="DurationMs">
+/// Stopwatch-measured wall-clock time spent inside
+/// <c>SwitchOutPartitionAsync</c> for this boundary, in milliseconds. Useful
+/// for spotting the rare slow purge without spinning up dedicated telemetry.
+/// </param>
+public sealed record AuditLogPurgedEvent(
+    DateTime MonthBoundary,
+    long RowsDeleted,
+    long DurationMs);
@@ -0,0 +1,61 @@
+using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Audit Log (#23) M6 Bundle E (T9) — central-side adapter that forwards
+/// <see cref="IAuditRedactionFailureCounter"/> increments into
+/// <see cref="AuditCentralHealthSnapshot"/>. The increments originate in
+/// <see cref="DefaultAuditPayloadFilter"/>, which bumps the counter whenever
+/// a header / body / SQL parameter redactor stage throws and the offending
+/// field has to be over-redacted; routing them here makes the failure
+/// visible as <c>AuditCentralHealthSnapshot.AuditRedactionFailure</c>.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Site vs central.</b> The SITE-side counterpart
+/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>)
+/// was wired in M5 Bundle C and feeds the site health report payload's
+/// <c>AuditRedactionFailure</c> field, covering redactor failures on the
+/// site SQLite hot-path (FallbackAuditWriter). This class is the MIRROR of
+/// that bridge: when the same payload filter runs on the central
+/// <see cref="CentralAuditWriter"/> / <see cref="AuditLogIngestActor"/>
+/// paths, its failures reach the central dashboard instead of vanishing
+/// into a NoOp.
+/// </para>
+/// <para>
+/// <b>Registration shape.</b> Site composition roots call
+/// <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>
+/// and get the site bridge; central composition roots call
+/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>
+/// and get this one. Site and central are distinct host roles, so a node
+/// never carries both bindings and the two bridges never compete at
+/// runtime.
+/// </para>
+/// <para>
+/// <b>Why a separate class?</b> The snapshot already implements
+/// <see cref="IAuditRedactionFailureCounter"/> and <i>could</i> be bound
+/// directly. Keeping a dedicated type spells out the central-vs-site split
+/// at the DI boundary: anyone reading
+/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>
+/// sees "site → site bridge, central → central bridge", shaped exactly like
+/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>.
+/// </para>
+/// </remarks>
+public sealed class CentralAuditRedactionFailureCounter : IAuditRedactionFailureCounter
+{
+    private readonly AuditCentralHealthSnapshot _snapshot;
+
+    /// <summary>
+    /// Creates the bridge over the snapshot that will hold the redaction failure count.
+    /// </summary>
+    /// <param name="snapshot">Central health snapshot receiving every increment; must not be <c>null</c>.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="snapshot"/> is <c>null</c>.</exception>
+    public CentralAuditRedactionFailureCounter(AuditCentralHealthSnapshot snapshot)
+    {
+        ArgumentNullException.ThrowIfNull(snapshot);
+        _snapshot = snapshot;
+    }
+
+    /// <inheritdoc/>
+    public void Increment()
+    {
+        // Go through the interface view of the snapshot so the call lands on
+        // its IAuditRedactionFailureCounter implementation specifically.
+        IAuditRedactionFailureCounter sink = _snapshot;
+        sink.Increment();
+    }
+}
@@ -0,0 +1,159 @@
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// <see cref="ICentralAuditWriter"/> implementation that writes audit events
+/// straight into the central store through
+/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/>. It serves
+/// components that raise audit events ON the central node (Notification
+/// Outbox dispatch, Inbound API). Site telemetry does NOT come through here —
+/// that travels the batched SiteAudit → AuditLogIngestActor route.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Best-effort.</b> Per alog.md §13 a failed audit write must never abort
+/// the user-facing action. Anything thrown while filtering, resolving the
+/// repository or inserting is caught, counted, logged at warning level and
+/// then dropped, so <see cref="WriteAsync"/> completes normally. Callers are
+/// free to add their own try/catch as a defensive extra.
+/// </para>
+/// <para>
+/// <b>One scope per call.</b> The writer is registered as a singleton while
+/// <see cref="IAuditLogRepository"/> is a SCOPED EF Core service (registered
+/// by <c>ZB.MOM.WW.ScadaBridge.ConfigurationDatabase</c>), so a scope cannot
+/// be held across calls. Each <see cref="WriteAsync"/> opens and disposes
+/// its own async scope — the same per-message shape used by
+/// <c>AuditLogIngestActor</c> and <c>NotificationOutboxActor</c>.
+/// </para>
+/// <para>
+/// <b>Idempotent.</b> Because persistence goes through
+/// <c>InsertIfNotExistsAsync</c>, emitting the same
+/// <see cref="AuditEvent.EventId"/> twice is a silent no-op, which makes the
+/// writer safe to call from any number of dispatch paths.
+/// </para>
+/// </remarks>
+public sealed class CentralAuditWriter : ICentralAuditWriter
+{
+    private readonly IServiceProvider _services;
+    private readonly ILogger<CentralAuditWriter> _logger;
+    private readonly IAuditPayloadFilter _filter;
+    private readonly ICentralAuditWriteFailureCounter _failureCounter;
+    private readonly INodeIdentityProvider? _nodeIdentity;
+
+    /// <summary>
+    /// Creates the writer. Only <paramref name="services"/> and
+    /// <paramref name="logger"/> are mandatory; the remaining collaborators
+    /// are optional so the M4 test composition roots that do not supply them
+    /// keep working.
+    /// <list type="bullet">
+    /// <item>Bundle C (M5-T6): the payload filter truncates + redacts before
+    /// the row reaches MS SQL. Production DI registers the real filter via
+    /// <see cref="ServiceCollectionExtensions.AddAuditLog"/>.</item>
+    /// <item>M6 Bundle E (T8): the failure counter makes a swallowed write
+    /// failure visible as <c>CentralAuditWriteFailures</c> on the central
+    /// health surface.</item>
+    /// <item>SourceNode-stamping (Task 12): the node identity provider lets
+    /// central-origin rows carry the writing node's identifier when the
+    /// caller has not set one. Without a provider SourceNode stays at
+    /// whatever the caller supplied (often null, the legacy behaviour).</item>
+    /// </list>
+    /// </summary>
+    /// <param name="services">Service provider used to open a per-call scope for the scoped repository.</param>
+    /// <param name="logger">Logger for swallowed write-failure diagnostics.</param>
+    /// <param name="filter">Optional payload filter; when omitted the writer falls back to <c>SafeDefaultAuditPayloadFilter.Instance</c> rather than skipping filtering.</param>
+    /// <param name="failureCounter">Optional counter incremented on swallowed failures; when omitted a <see cref="NoOpCentralAuditWriteFailureCounter"/> is used.</param>
+    /// <param name="nodeIdentity">Optional node identity provider for stamping <c>SourceNode</c> on central-origin rows.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="services"/> or <paramref name="logger"/> is <c>null</c>.</exception>
+    public CentralAuditWriter(
+        IServiceProvider services,
+        ILogger<CentralAuditWriter> logger,
+        IAuditPayloadFilter? filter = null,
+        ICentralAuditWriteFailureCounter? failureCounter = null,
+        INodeIdentityProvider? nodeIdentity = null)
+    {
+        ArgumentNullException.ThrowIfNull(services);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _services = services;
+        _logger = logger;
+
+        // AuditLog-008: a missing filter must not mean "no filtering". The
+        // safe default applies HTTP header redaction with hard-coded
+        // sensitive defaults, so Authorization / X-Api-Key / Cookie /
+        // Set-Cookie are still scrubbed before persistence.
+        if (filter is null)
+        {
+            _filter = Payload.SafeDefaultAuditPayloadFilter.Instance;
+        }
+        else
+        {
+            _filter = filter;
+        }
+
+        if (failureCounter is null)
+        {
+            _failureCounter = new NoOpCentralAuditWriteFailureCounter();
+        }
+        else
+        {
+            _failureCounter = failureCounter;
+        }
+
+        _nodeIdentity = nodeIdentity;
+    }
+
+    /// <inheritdoc />
+    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
+    {
+        if (evt is null)
+        {
+            // A null event is a caller bug and cannot yield a meaningful
+            // audit row, so note it and bail out without throwing.
+            _logger.LogWarning("CentralAuditWriter.WriteAsync received null event; ignoring.");
+            return;
+        }
+
+        try
+        {
+            // Filtering happens first, before any stamping or repository
+            // work. The filter contract is "never throws", and per
+            // AuditLog-008 a filter is always present, so header redaction
+            // runs even when the composition root omitted the real one.
+            var candidate = _filter.Apply(evt);
+
+            // SourceNode-stamping (Task 12): a value the caller already set
+            // wins. Only when it is missing do we consult the local
+            // INodeIdentityProvider — and only if one was wired; otherwise
+            // SourceNode is left untouched for back-compat.
+            if (candidate.SourceNode is null)
+            {
+                if (_nodeIdentity?.NodeName is { } localNode)
+                {
+                    candidate = candidate with { SourceNode = localNode };
+                }
+            }
+
+            await using (var scope = _services.CreateAsyncScope())
+            {
+                var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
+
+                // IngestedAtUtc is stamped last, right before the insert.
+                await repository
+                    .InsertIfNotExistsAsync(candidate with { IngestedAtUtc = DateTime.UtcNow }, ct)
+                    .ConfigureAwait(false);
+            }
+        }
+        catch (Exception ex)
+        {
+            // An audit failure NEVER aborts the user-facing action: count it
+            // (M6 Bundle E, T8 — so a sustained outage shows on the health
+            // dashboard, not just in the log file), log it, and move on.
+            BumpFailureCounter();
+
+            // EventId, Kind and Status are logged from the input event. The
+            // `with` clones above only touch SourceNode and IngestedAtUtc,
+            // so for these three fields `evt` is equivalent to the stamped
+            // record. If a field that the stamping DOES change (e.g.
+            // SourceNode) is ever added to this message, take it from the
+            // latest post-stamp record rather than from `evt`.
+            _logger.LogWarning(
+                ex,
+                "CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})",
+                evt.EventId, evt.Kind, evt.Status);
+        }
+    }
+
+    /// <summary>
+    /// Increments the failure counter, discarding anything it throws so the
+    /// best-effort contract of <see cref="WriteAsync"/> cannot be broken by a
+    /// misbehaving custom counter.
+    /// </summary>
+    private void BumpFailureCounter()
+    {
+        try
+        {
+            _failureCounter.Increment();
+        }
+        catch
+        {
+            // Defence in depth: the counter is supposed to never throw.
+        }
+    }
+}
@@ -0,0 +1,62 @@
+using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Audit Log (#23) M6 Bundle E read-side surface exposing the central-side
+/// audit-health counters: <see cref="CentralAuditWriteFailures"/> (every
+/// repository insert throw from <see cref="CentralAuditWriter"/> /
+/// <see cref="AuditLogIngestActor"/>), <see cref="AuditRedactionFailure"/>
+/// (every payload-filter redactor throw on the central path), and
+/// <see cref="SiteAuditTelemetryStalled"/> (per-site latched state from the
+/// <see cref="SiteAuditTelemetryStalledTracker"/>).
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Read-only contract.</b> Every member is a getter: implementations
+/// expose a point-in-time snapshot, while increments and tracker updates
+/// flow through the dedicated counter / tracker interfaces. Consumers (M7+
+/// central health pages) read these properties; they never mutate.
+/// </para>
+/// <para>
+/// <b>Why a parallel surface from <see cref="ICentralHealthAggregator"/>.</b>
+/// <see cref="ICentralHealthAggregator"/> aggregates per-site
+/// <c>SiteHealthState</c> reports the SITE emits. The central audit-write
+/// failure / redaction-failure counters originate ON central (no site report
+/// carries them), so they live on a dedicated snapshot rather than being
+/// retro-fitted into a per-site state. The two surfaces will be composed at
+/// the M7 dashboard layer.
+/// </para>
+/// </remarks>
+public interface IAuditCentralHealthSnapshot
+{
+    /// <summary>
+    /// Count of central-side audit-write failures since process start.
+    /// Incremented by every <see cref="CentralAuditWriter"/> /
+    /// <see cref="AuditLogIngestActor"/> repository insert that throws.
+    /// </summary>
+    int CentralAuditWriteFailures { get; }
+
+    /// <summary>
+    /// Count of central-side payload-filter redactor over-redactions since
+    /// process start. Incremented by every header / body / SQL-parameter
+    /// redactor stage that throws (the filter falls back to the
+    /// <c>&lt;redacted: redactor error&gt;</c> marker and never aborts the
+    /// user-facing action). Sites have their own counter
+    /// (<see cref="IAuditRedactionFailureCounter"/>-backed
+    /// <c>SiteHealthReport.AuditRedactionFailure</c>) and the central
+    /// composition root's binding routes ALL central redactor throws
+    /// (CentralAuditWriter + AuditLogIngestActor paths) into this counter.
+    /// </summary>
+    int AuditRedactionFailure { get; }
+
+    /// <summary>
+    /// Per-site latched stalled state: <c>true</c> when the
+    /// <see cref="SiteAuditReconciliationActor"/> has observed two
+    /// consecutive non-draining cycles for that site, <c>false</c> after the
+    /// first draining cycle. Sites absent from the map are interpreted as
+    /// healthy (<c>Stalled=false</c> default). The returned map is meant to
+    /// be a defensive copy, so treat it as an immutable point-in-time view.
+    /// </summary>
+    IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled { get; }
+}
@@ -0,0 +1,23 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Audit Log (#23) M6 Bundle E (T8) counter sink invoked by central-side audit
+/// writers (<see cref="CentralAuditWriter"/>, <see cref="AuditLogIngestActor"/>)
+/// every time a repository <c>InsertIfNotExistsAsync</c> throws. Mirrors the
+/// site-side <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.IAuditWriteFailureCounter"/>
+/// shape one-for-one — same one-method contract, same NoOp default, same
+/// must-never-abort-the-user-facing-action invariant.
+/// </summary>
+/// <remarks>
+/// Audit-write failures NEVER abort the user-facing action (alog.md §13) —
+/// the writer swallows the exception and surfaces the failure via this counter
+/// instead. <see cref="NoOpCentralAuditWriteFailureCounter"/> is the safe
+/// fallback when no health surface is wired; <see cref="AuditCentralHealthSnapshot"/>
+/// is the production binding that routes increments into the aggregated
+/// central health snapshot consumed by future M7+ pages.
+/// </remarks>
+public interface ICentralAuditWriteFailureCounter
+{
+    /// <summary>Increment the central audit-write failure counter by one; implementations are expected not to throw.</summary>
+    void Increment();
+}
@@ -0,0 +1,49 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Mockable abstraction over the central-side <c>PullAuditEvents</c> gRPC
+/// client surface that <see cref="SiteAuditReconciliationActor"/> uses to
+/// fetch the next reconciliation batch from a specific site. Extracted so the
+/// actor can be unit-tested against an in-memory stub without standing up a
+/// real <c>GrpcChannel</c> per site.
+/// </summary>
+/// <remarks>
+/// <para>
+/// The production implementation (host wiring task) wraps the auto-generated
+/// <c>SiteStreamService.SiteStreamServiceClient</c>, multiplexing one
+/// <c>GrpcChannel</c> per site keyed on
+/// <see cref="SiteEntry.GrpcEndpoint"/>. Until that wiring lands the DI
+/// composition root binds a NoOp default that returns an empty response — the
+/// reconciliation tick is still scheduled and the cursor logic still runs, so
+/// regressions in the actor itself are caught even before the real client
+/// arrives.
+/// </para>
+/// <para>
+/// Implementations MUST NOT throw on transport faults that the actor can
+/// tolerate (connection refused, deadline exceeded). The actor's contract is
+/// "one site's failure doesn't sink the rest of the tick"; an exception still
+/// won't crash the actor (the per-site try/catch catches it), but returning
+/// an empty response on a known-recoverable error keeps the logs cleaner.
+/// </para>
+/// </remarks>
+public interface IPullAuditEventsClient
+{
+    /// <summary>
+    /// Issues a <c>PullAuditEvents</c> RPC against the site whose endpoint
+    /// is registered against <paramref name="siteId"/> and asks for the next
+    /// batch of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
+    /// rows, ordered oldest-first.
+    /// </summary>
+    /// <returns>The batch plus a <c>MoreAvailable</c> flag the actor uses to decide whether to fire another pull immediately.</returns>
+    /// <param name="siteId">The identifier of the site to pull audit events from.</param>
+    /// <param name="sinceUtc">Only events with an <c>OccurredAtUtc</c> at or after this cursor time are returned.</param>
+    /// <param name="batchSize">Maximum number of events to return per call.</param>
+    /// <param name="ct">Cancellation token.</param>
+    Task<PullAuditEventsResponse> PullAsync(
+        string siteId,
+        DateTime sinceUtc,
+        int batchSize,
+        CancellationToken ct);
+}
@@ -0,0 +1,35 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Enumeration surface consumed by <see cref="SiteAuditReconciliationActor"/> to
+/// discover which sites to poll on each reconciliation tick. Extracted so the
+/// actor can be unit-tested against a static list without depending on the
+/// production <c>ISiteRepository</c> + EF Core DbContext.
+/// </summary>
+/// <remarks>
+/// The production implementation wraps <c>ISiteRepository.GetAllSitesAsync</c>
+/// and projects each <c>Site</c> to a <see cref="SiteEntry"/> using the
+/// site's configured <c>GrpcNodeAAddress</c> (falling back to
+/// <c>GrpcNodeBAddress</c> when NodeA is unset). Sites with NO gRPC address
+/// configured are silently skipped — the reconciliation pull cannot reach
+/// them, but absence of an address is a configuration decision, not a runtime
+/// error.
+/// </remarks>
+public interface ISiteEnumerator
+{
+    /// <summary>
+    /// Returns the sites the reconciliation puller should visit on the next tick. The actor
+    /// calls this once per tick, so implementations should reflect adds/removes promptly.
+    /// </summary>
+    /// <param name="ct">Cancellation token for the async enumeration.</param>
+    /// <returns>The <see cref="SiteEntry"/> targets to poll on this tick.</returns>
+    Task<IReadOnlyList<SiteEntry>> EnumerateAsync(CancellationToken ct = default);
+}
+
+/// <summary>
+/// One reconciliation target handed out by <see cref="ISiteEnumerator"/>.
+/// Transport selection (TLS, keepalive, etc.) is the client's concern.
+/// </summary>
+/// <param name="SiteId">The site identifier the actor uses as the cursor key.</param>
+/// <param name="GrpcEndpoint">The gRPC endpoint <see cref="IPullAuditEventsClient"/> dials to issue the pull, given as scheme plus authority (e.g. <c>http://siteA:8083</c>).</param>
+public sealed record SiteEntry(string SiteId, string GrpcEndpoint);
@@ -0,0 +1,17 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Default <see cref="ICentralAuditWriteFailureCounter"/> binding used when
+/// the central health surface (<see cref="AuditCentralHealthSnapshot"/>) has
+/// not been wired (test composition roots, site-only hosts that incidentally
+/// resolve a <see cref="CentralAuditWriter"/>). Drops every increment on the
+/// floor. Mirrors <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.NoOpAuditWriteFailureCounter"/>.
+/// </summary>
+public sealed class NoOpCentralAuditWriteFailureCounter : ICentralAuditWriteFailureCounter
+{
+    /// <summary>
+    /// Discards the increment. This binding keeps no count, so the call has
+    /// no observable effect.
+    /// </summary>
+    public void Increment() { }
+}
@@ -0,0 +1,387 @@
+using Akka.Actor;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation
+/// pull loop. On a configurable timer (default 5 minutes) the actor walks every
+/// known site, asks the site for any <see cref="AuditEvent"/> rows with
+/// <see cref="AuditEvent.OccurredAtUtc"/> &gt;= the site's last reconciled
+/// cursor, ingests them idempotently into the central
+/// <see cref="IAuditLogRepository"/>, and advances the cursor.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Self-healing telemetry, not a dispatcher.</b> The push path
+/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor"/> +
+/// <c>IngestAuditEvents</c>) is the primary mechanism. This actor exists so a
+/// missed push (gRPC blip, central restart, site offline) is eventually
+/// repaired by central re-pulling whatever the site still has in
+/// <c>Pending</c>/<c>Forwarded</c> state. Idempotency on
+/// <see cref="AuditEvent.EventId"/> (M2 Bundle A's race-fix) makes duplicate
+/// arrivals from both paths a silent no-op.
+/// </para>
+/// <para>
+/// <b>Cursor lifetime.</b> The per-site <c>LastReconciledAt</c> watermark is
+/// kept in-memory for the actor's lifetime. The cluster singleton normally
+/// survives the host process; on a deliberate failover OR a singleton restart
+/// the cursors reset to <see cref="DateTime.MinValue"/>. That is conservative
+/// but correct — the next tick simply asks for everything the site still has,
+/// and idempotent ingest swallows the dupes. Persisting cursors to MS SQL was
+/// considered and rejected for M6: the cost of a write per tick outweighs the
+/// rare benefit of avoiding one over-broad pull after a restart.
+/// </para>
+/// <para>
+/// <b>Stalled detection.</b> The brief calls a site "stalled" when two
+/// consecutive pull cycles BOTH return non-empty AND <c>MoreAvailable=true</c>
+/// — i.e. the backlog isn't draining. The actor publishes
+/// <see cref="SiteAuditTelemetryStalledChanged"/> on the actor system's
+/// EventStream so a future <c>ICentralHealthCollector</c> bridge (M6 Bundle E)
+/// can flip the health metric without coupling this actor to the health
+/// collection surface today.
+/// </para>
+/// <para>
+/// <b>Failure isolation.</b> A single site that throws (DNS, transport,
+/// repository write) must NOT prevent other sites from being polled on the
+/// same tick. The per-site work runs inside its own try/catch — that
+/// per-site catch is what keeps the actor running across handler throws.
+/// The <see cref="SupervisorStrategy"/> override returns
+/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart
+/// semantics) and governs children only; this actor has no children today,
+/// so the override is a forward-compat placeholder. If it ever did fire,
+/// restart would reset the in-memory cursors — but as noted above that's
+/// a safe (over-pull, idempotent) recovery.
+/// </para>
+/// <para>
+/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
+/// service registered by <c>AddConfigurationDatabase</c>. The singleton actor
+/// opens one DI scope per tick and reuses the same repository across all
+/// sites in that tick — one DbContext per tick mirrors the
+/// <c>AuditLogIngestActor</c> + <c>NotificationOutboxActor</c> pattern.
+/// </para>
+/// </remarks>
+public class SiteAuditReconciliationActor : ReceiveActor
+{
+    // Source of the per-tick site list (queried once at the start of OnTickAsync).
+    private readonly ISiteEnumerator _sites;
+    // Issues the per-site pull call used by PullSiteAsync.
+    private readonly IPullAuditEventsClient _client;
+    // Root provider; OnTickAsync creates one async scope from it per tick to
+    // resolve the scoped IAuditLogRepository.
+    private readonly IServiceProvider _services;
+    // options.Value as read once in the constructor.
+    private readonly SiteAuditReconciliationOptions _options;
+    private readonly ILogger<SiteAuditReconciliationActor> _logger;
+
+    /// <summary>
+    /// Per-site reconciliation watermark — the highest
+    /// <see cref="AuditEvent.OccurredAtUtc"/> seen for that site on a previous
+    /// tick. Asking for <c>OccurredAtUtc &gt;= cursor</c> rather than &gt;
+    /// is the site contract (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ISiteAuditQueue.ReadPendingSinceAsync"/>);
+    /// duplicate-with-same-timestamp rows are filtered out by the idempotent
+    /// repository write.
+    /// </summary>
+    private readonly Dictionary<string, DateTime> _cursors = new();
+
+    /// <summary>
+    /// Per-site count of consecutive non-draining cycles. Resets to zero on the
+    /// first draining (or empty) cycle.
+    /// </summary>
+    private readonly Dictionary<string, int> _nonDrainingCycles = new();
+
+    /// <summary>
+    /// Per-site latched stalled state — used so the actor only publishes a
+    /// <see cref="SiteAuditTelemetryStalledChanged"/> transition when the
+    /// stalled flag actually changes, not on every tick while stalled.
+    /// </summary>
+    private readonly Dictionary<string, bool> _stalled = new();
+
+    /// <summary>
+    /// AuditLog-004: per-EventId retry counter for rows whose central insert
+    /// threw. While a row keeps failing AND is below
+    /// <see cref="MaxPermanentInsertAttempts"/>, the cursor is held back so the
+    /// next reconciliation tick re-pulls and retries the row. Crossing the
+    /// threshold logs Critical and permanently abandons the row (cursor
+    /// advances past it) so a truly broken row cannot block all subsequent
+    /// progress for a site. The counter is in-memory only — singleton restart
+    /// resets it, which is safe because the cursor also resets on restart and
+    /// the next tick re-pulls everything.
+    /// </summary>
+    private readonly Dictionary<Guid, int> _failedInsertAttempts = new();
+
+    /// <summary>
+    /// AuditLog-004: number of consecutive central-insert failures before a row
+    /// is permanently abandoned with a Critical log entry and the cursor is
+    /// allowed to advance past it. Five attempts at the 5-minute default tick
+    /// is ~25 min of retry budget before a stuck row stops blocking progress.
+    /// </summary>
+    private const int MaxPermanentInsertAttempts = 5;
+
+    // Handle for the repeating self-tick scheduled in PreStart and cancelled
+    // in PostStop; stays null until PreStart has run.
+    private ICancelable? _timer;
+
+    /// <summary>
+    /// Initializes the reconciliation actor with its dependencies and registers the tick handler.
+    /// </summary>
+    /// <param name="sites">Enumerates the known sites to reconcile.</param>
+    /// <param name="client">Client used to pull audit events from individual sites.</param>
+    /// <param name="services">Root service provider for opening a per-tick DI scope.</param>
+    /// <param name="options">Reconciliation configuration (interval, page size).</param>
+    /// <param name="logger">Logger for reconciliation diagnostics.</param>
+    public SiteAuditReconciliationActor(
+        ISiteEnumerator sites,
+        IPullAuditEventsClient client,
+        IServiceProvider services,
+        IOptions<SiteAuditReconciliationOptions> options,
+        ILogger<SiteAuditReconciliationActor> logger)
+    {
+        // Null guards run in parameter order, and options.Value is only read
+        // once every argument has been validated.
+        _sites = sites ?? throw new ArgumentNullException(nameof(sites));
+        _client = client ?? throw new ArgumentNullException(nameof(client));
+        _services = services ?? throw new ArgumentNullException(nameof(services));
+        var optionsAccessor = options ?? throw new ArgumentNullException(nameof(options));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+        _options = optionsAccessor.Value;
+
+        // The only message this actor handles is its own timer tick.
+        ReceiveAsync<ReconciliationTick>(_ => OnTickAsync());
+    }
+
+    /// <inheritdoc />
+    protected override void PreStart()
+    {
+        base.PreStart();
+
+        // The same period is used for the initial delay and the repeat
+        // interval, so the first tick arrives one full period after start.
+        var period = _options.ReconciliationInterval;
+        var scheduler = Context.System.Scheduler;
+        _timer = scheduler.ScheduleTellRepeatedlyCancelable(
+            initialDelay: period,
+            interval: period,
+            receiver: Self,
+            message: ReconciliationTick.Instance,
+            sender: Self);
+    }
+
+    /// <inheritdoc />
+    protected override void PostStop()
+    {
+        // The timer is null if PreStart never ran; otherwise stop the self-ticks.
+        if (_timer is not null)
+        {
+            _timer.Cancel();
+        }
+
+        base.PostStop();
+    }
+
+    /// <summary>
+    /// Handles one <see cref="ReconciliationTick"/>: enumerates the sites,
+    /// releases stalled bookkeeping for sites that are no longer listed,
+    /// resolves one scoped <see cref="IAuditLogRepository"/>, and pulls each
+    /// site in turn. Enumeration or repository-resolution failures are logged
+    /// and end the tick; a failure while pulling one site is logged and the
+    /// loop moves on to the next site.
+    /// </summary>
+    private async Task OnTickAsync()
+    {
+        // Capture EventStream BEFORE the first await. Accessing Context (and
+        // therefore Context.System) after an await is unsafe because Akka's
+        // ActorBase.Context throws "no active ActorContext" once the
+        // continuation runs on a thread that isn't currently dispatching this
+        // actor — mirrors the AuditLogPurgeActor.OnTickAsync fix and the
+        // AuditLogIngestActor.OnIngestAsync Sender-capture pattern.
+        var eventStream = Context.System.EventStream;
+
+        IReadOnlyList<SiteEntry> sites;
+        try
+        {
+            sites = await _sites.EnumerateAsync().ConfigureAwait(false);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Site enumeration failed; skipping reconciliation tick.");
+            return;
+        }
+
+        // Runs before the empty-list early return so that removing the last
+        // remaining site still clears its stalled latch.
+        ReleaseStalledStateForRemovedSites(sites, eventStream);
+
+        if (sites.Count == 0)
+        {
+            return;
+        }
+
+        // AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
+        // services (IAsyncDisposable DbContexts) dispose asynchronously
+        // without blocking on sync Dispose() of pending connection cleanup.
+        await using var scope = _services.CreateAsyncScope();
+        IAuditLogRepository repository;
+        try
+        {
+            repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Failed to resolve IAuditLogRepository for reconciliation tick.");
+            return;
+        }
+
+        foreach (var site in sites)
+        {
+            try
+            {
+                await PullSiteAsync(site, repository, eventStream).ConfigureAwait(false);
+            }
+            catch (Exception ex)
+            {
+                // Catch-all per the failure-isolation invariant: one site's
+                // fault must not sink the rest of the tick. The cursor for
+                // the failing site is left at its previous value so the
+                // next tick retries the same window.
+                _logger.LogWarning(
+                    ex,
+                    "Reconciliation pull failed for site {SiteId}; other sites continue.",
+                    site.SiteId);
+            }
+        }
+    }
+
+    /// <summary>
+    /// Drops the non-draining counter and stalled latch of every site that is
+    /// tracked but absent from <paramref name="sites"/>. If such a site was
+    /// latched stalled, a <c>Stalled: false</c> transition is published so
+    /// subscribers do not keep reporting a site that is no longer polled.
+    /// Cursors are intentionally left untouched, so a site that reappears
+    /// resumes from its previous watermark.
+    /// </summary>
+    /// <param name="sites">Sites returned by the enumerator for this tick.</param>
+    /// <param name="eventStream">Stream captured at the start of the tick.</param>
+    private void ReleaseStalledStateForRemovedSites(
+        IReadOnlyList<SiteEntry> sites,
+        Akka.Event.EventStream eventStream)
+    {
+        // UpdateStalledState writes _nonDrainingCycles for a site before it
+        // ever writes _stalled, so the counter map's keys cover both maps.
+        if (_nonDrainingCycles.Count == 0)
+        {
+            return;
+        }
+
+        var live = new HashSet<string>();
+        foreach (var site in sites)
+        {
+            live.Add(site.SiteId);
+        }
+
+        // Collect first: the dictionaries cannot be mutated while enumerated.
+        List<string>? departed = null;
+        foreach (var known in _nonDrainingCycles.Keys)
+        {
+            if (!live.Contains(known))
+            {
+                (departed ??= new List<string>()).Add(known);
+            }
+        }
+
+        if (departed is null)
+        {
+            return;
+        }
+
+        foreach (var siteId in departed)
+        {
+            _nonDrainingCycles.Remove(siteId);
+            if (_stalled.Remove(siteId, out var wasStalled) && wasStalled)
+            {
+                eventStream.Publish(
+                    new SiteAuditTelemetryStalledChanged(siteId, Stalled: false));
+            }
+        }
+    }
+
+    /// <summary>
+    /// Issues one <c>PullAuditEvents</c> RPC against the site, ingests the
+    /// returned rows idempotently into the central repository, and advances
+    /// the cursor based on the maximum <see cref="AuditEvent.OccurredAtUtc"/>
+    /// observed. The brief's "saturate until backlog clears" intent is met by
+    /// the natural cadence — each tick issues one pull, and a backed-up site
+    /// drains across consecutive ticks. The stalled signal (two non-draining
+    /// ticks in a row) surfaces when that drain isn't keeping up.
+    /// </summary>
+    /// <param name="site">Site to pull; its <c>SiteId</c> keys the cursor and stalled state.</param>
+    /// <param name="repository">Repository resolved from the per-tick DI scope.</param>
+    /// <param name="eventStream">Stream captured by the caller before its first await; used for stalled-state publications.</param>
+    private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository, Akka.Event.EventStream eventStream)
+    {
+        // A site with no stored cursor is pulled from DateTime.MinValue.
+        var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
+        var response = await _client.PullAsync(
+            site.SiteId, since, _options.BatchSize, CancellationToken.None)
+            .ConfigureAwait(false);
+
+        var maxOccurred = since;
+        var hasUnresolvedFailure = false;
+        // One ingest timestamp is stamped on every row of this batch.
+        var nowUtc = DateTime.UtcNow;
+        foreach (var evt in response.Events)
+        {
+            var advanceForThisRow = false;
+            try
+            {
+                // Idempotent repository write: duplicate EventIds (from a
+                // concurrent push, or a retry of this very pull) collapse to
+                // a no-op courtesy of M2 Bundle A's race-fix on
+                // InsertIfNotExistsAsync.
+                var ingested = evt with { IngestedAtUtc = nowUtc };
+                await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
+                _failedInsertAttempts.Remove(evt.EventId);
+                advanceForThisRow = true;
+            }
+            catch (Exception ex)
+            {
+                // AuditLog-004: per-row catch so one bad event does not abandon
+                // the rest of the batch. Track the failure count per EventId —
+                // below MaxPermanentInsertAttempts the cursor is HELD BACK so
+                // the next tick re-pulls and retries; at the threshold the row
+                // is permanently abandoned (LogCritical + cursor advances past)
+                // to keep a truly broken row from blocking all subsequent
+                // progress for the site.
+                var attempts = _failedInsertAttempts.GetValueOrDefault(evt.EventId) + 1;
+                _failedInsertAttempts[evt.EventId] = attempts;
+
+                if (attempts >= MaxPermanentInsertAttempts)
+                {
+                    _logger.LogCritical(
+                        ex,
+                        "Permanently abandoning AuditEvent {EventId} from site {SiteId} after {Attempts} consecutive insert failures; cursor will advance past it.",
+                        evt.EventId,
+                        site.SiteId,
+                        attempts);
+                    // NOTE(review): the counter is removed here, so if another
+                    // row in this batch keeps the cursor held back (see the
+                    // final cursor assignment below), this row is re-pulled on
+                    // the next tick and its retry count starts again from 1 —
+                    // confirm that repeated abandon cycles are acceptable.
+                    _failedInsertAttempts.Remove(evt.EventId);
+                    advanceForThisRow = true;
+                }
+                else
+                {
+                    _logger.LogError(
+                        ex,
+                        "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId} (attempt {Attempts}/{Max}); cursor held back for retry.",
+                        evt.EventId,
+                        site.SiteId,
+                        attempts,
+                        MaxPermanentInsertAttempts);
+                    hasUnresolvedFailure = true;
+                }
+            }
+
+            // Only rows that succeeded or were abandoned move the candidate watermark.
+            if (advanceForThisRow && evt.OccurredAtUtc > maxOccurred)
+            {
+                maxOccurred = evt.OccurredAtUtc;
+            }
+        }
+
+        // AuditLog-004: only advance the persisted cursor if no event in this
+        // batch is still being retried. Leaving the cursor at `since` re-pulls
+        // the whole batch next tick — successful rows are no-ops thanks to
+        // InsertIfNotExistsAsync's idempotency, and the failing row gets
+        // another attempt. Once it succeeds (or hits the permanent-abandon
+        // threshold) the cursor unblocks naturally.
+        _cursors[site.SiteId] = hasUnresolvedFailure ? since : maxOccurred;
+
+        // Non-draining = the site returned rows AND reported more still available.
+        var nonDraining = response.MoreAvailable && response.Events.Count > 0;
+        UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream);
+    }
+
+    /// <summary>
+    /// Flips the per-site stalled flag based on whether this tick drained the
+    /// queue. A "draining" cycle is one where the server reported no more rows
+    /// available OR returned zero events. A "non-draining" cycle is the
+    /// inverse (events returned AND <c>MoreAvailable=true</c>).
+    /// </summary>
+    /// <remarks>
+    /// The state machine: counter increments on each consecutive non-draining
+    /// tick. On reaching <see cref="SiteAuditReconciliationOptions.StalledAfterNonDrainingCycles"/>
+    /// the actor latches <c>Stalled=true</c> and publishes the transition; on
+    /// any subsequent draining tick the counter resets to zero AND, if the
+    /// latch is currently true, the actor publishes <c>Stalled=false</c>. Only
+    /// transitions are published — repeated ticks in the same state are
+    /// silent so a downstream subscriber doesn't see a flood of redundant
+    /// notifications.
+    /// </remarks>
+    private void UpdateStalledState(string siteId, bool draining, Akka.Event.EventStream eventStream)
+    {
+        // A site with no entry leaves `latched` at default(false): not stalled.
+        _stalled.TryGetValue(siteId, out var latched);
+
+        if (!draining)
+        {
+            var streak = _nonDrainingCycles.GetValueOrDefault(siteId) + 1;
+            _nonDrainingCycles[siteId] = streak;
+
+            // Publish only on the false -> true edge, and only once the
+            // streak has reached the configured threshold.
+            if (latched || streak < _options.StalledAfterNonDrainingCycles)
+            {
+                return;
+            }
+
+            _stalled[siteId] = true;
+            eventStream.Publish(
+                new SiteAuditTelemetryStalledChanged(siteId, Stalled: true));
+            return;
+        }
+
+        // Draining tick: the streak always resets; the latch clears (and is
+        // published) only if it was set.
+        _nonDrainingCycles[siteId] = 0;
+        if (!latched)
+        {
+            return;
+        }
+
+        _stalled[siteId] = false;
+        eventStream.Publish(
+            new SiteAuditTelemetryStalledChanged(siteId, Stalled: false));
+    }
+
+    /// <inheritdoc />
+    // NOTE(review): with maxNrOfRetries set to 0, a Restart directive from the
+    // default decider is presumably turned into a stop of the child (no
+    // restarts permitted) — confirm against the Akka.NET supervision docs if
+    // this actor ever gains children.
+    protected override SupervisorStrategy SupervisorStrategy() =>
+        new OneForOneStrategy(
+            maxNrOfRetries: 0,
+            withinTimeRange: TimeSpan.Zero,
+            decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
+
+    /// <summary>Self-tick triggering a reconciliation pass across all sites.</summary>
+    internal sealed class ReconciliationTick
+    {
+        // Single shared instance; this is the message PreStart schedules to Self.
+        public static readonly ReconciliationTick Instance = new();
+        // Private so no other instance can be created outside this type.
+        private ReconciliationTick() { }
+    }
+}
+
+/// <summary>
+/// Published on the actor system EventStream when a site's reconciliation
+/// puller transitions into or out of the "stalled" state (backlog not
+/// draining across multiple cycles). The M6 Bundle E central health collector
+/// will subscribe to this and surface
+/// <c>SiteAuditTelemetryStalled</c> on the health-report payload.
+/// </summary>
+/// <param name="SiteId">Identifier of the site whose stalled state changed.</param>
+/// <param name="Stalled"><c>true</c> when the site entered the stalled state; <c>false</c> when it left it.</param>
+public sealed record SiteAuditTelemetryStalledChanged(string SiteId, bool Stalled);
@@ -0,0 +1,60 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Tuning knobs for the central <see cref="SiteAuditReconciliationActor"/> singleton.
+/// Defaults mirror the M6 Bundle B brief: pull every 5 minutes per site, 256 rows per
+/// batch, declare a site "stalled" after two consecutive pull cycles return non-empty
+/// AND <c>MoreAvailable=true</c> (the backlog is not draining).
+/// </summary>
+/// <remarks>
+/// <para>
+/// Per the M6 plan the reconciliation actor is the fallback when push telemetry is
+/// lost; it is intentionally low-frequency. Lowering
+/// <see cref="ReconciliationIntervalSeconds"/> in production trades MS SQL load for
+/// fresher self-healing — keep the default unless a deployment can prove the extra
+/// load is acceptable.
+/// </para>
+/// <para>
+/// <see cref="StalledAfterNonDrainingCycles"/> = 2 because a single non-draining
+/// cycle can happen on a surge (e.g. a backed-up site replays its hot queue); the
+/// stalled signal should only fire when the backlog persists across cycles, which is
+/// the symptom the central health surface is asking us to detect.
+/// </para>
+/// </remarks>
+public sealed class SiteAuditReconciliationOptions
+{
+    /// <summary>
+    /// Tick period in whole seconds (default 300). Each tick visits every
+    /// known site once.
+    /// </summary>
+    public int ReconciliationIntervalSeconds { get; set; } = 300;
+
+    /// <summary>
+    /// Optional explicit tick period, intended for tests that need a cadence
+    /// finer than whole seconds. When set, it wins over
+    /// <see cref="ReconciliationIntervalSeconds"/>; when <c>null</c> (the
+    /// default) it is ignored.
+    /// </summary>
+    public TimeSpan? ReconciliationIntervalOverride { get; set; }
+
+    /// <summary>
+    /// Effective tick period: <see cref="ReconciliationIntervalOverride"/>
+    /// when it has a value, otherwise
+    /// <see cref="ReconciliationIntervalSeconds"/> converted to a
+    /// <see cref="TimeSpan"/>.
+    /// </summary>
+    public TimeSpan ReconciliationInterval
+    {
+        get
+        {
+            if (ReconciliationIntervalOverride is { } pinned)
+            {
+                return pinned;
+            }
+
+            return TimeSpan.FromSeconds(ReconciliationIntervalSeconds);
+        }
+    }
+
+    /// <summary>
+    /// Upper bound (default 256) on the number of
+    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
+    /// rows requested by one <c>PullAuditEvents</c> RPC call.
+    /// </summary>
+    public int BatchSize { get; set; } = 256;
+
+    /// <summary>
+    /// How many consecutive non-draining cycles (events returned AND
+    /// <c>MoreAvailable=true</c>) a site must accumulate before the actor
+    /// publishes <c>SiteAuditTelemetryStalledChanged(Stalled: true)</c> on the
+    /// EventStream. Defaults to 2.
+    /// </summary>
+    public int StalledAfterNonDrainingCycles { get; set; } = 2;
+}
@@ -0,0 +1,203 @@
+using System.Collections.Concurrent;
+using Akka.Actor;
+using Akka.Event;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Audit Log (#23) M6 Bundle E (T7) — central singleton that subscribes to the
+/// actor system's EventStream for <see cref="SiteAuditTelemetryStalledChanged"/>
+/// publications and maintains a per-site latched stalled-state map readable
+/// via <see cref="Snapshot"/>. Consumed by the M6 Bundle E
+/// <see cref="AuditCentralHealthSnapshot"/> aggregator so the central health
+/// surface can surface per-site "reconciliation isn't draining" without
+/// coupling the publisher (<see cref="SiteAuditReconciliationActor"/>) to the
+/// health collection plumbing.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Why an internal actor.</b> Akka.NET's <see cref="EventStream"/> only
+/// supports <see cref="IActorRef"/> subscribers — there is no callback or
+/// channel-based overload. The tracker therefore spawns a small subscriber
+/// actor that forwards each event into the shared
+/// <see cref="ConcurrentDictionary{TKey,TValue}"/> on the actor's thread, and
+/// readers (<see cref="Snapshot"/>) take a copy off that dictionary on any
+/// thread. Mirrors the <c>DeadLetterMonitorActor</c> shape — subscribe in
+/// <see cref="ActorBase.PreStart"/>, unsubscribe in
+/// <see cref="ActorBase.PostStop"/>, which the tracker triggers via a Stop
+/// at <see cref="Dispose"/>.
+/// </para>
+/// <para>
+/// <b>Per-site latching.</b> The publisher (<see cref="SiteAuditReconciliationActor"/>)
+/// only publishes on stalled-state transitions, so the dictionary is the
+/// authoritative latched state. Sites that have never published are absent
+/// from the snapshot — the consumer surface treats absence as
+/// <c>Stalled=false</c> (default healthy), the same default the reconciliation
+/// actor's own internal latch uses.
+/// </para>
+/// <para>
+/// <b>Singleton lifecycle.</b> Registered as a singleton via
+/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>;
+/// <see cref="Dispose"/> tears the internal subscriber down at host shutdown.
+/// </para>
+/// </remarks>
+public sealed class SiteAuditTelemetryStalledTracker : IDisposable
+{
+    // Stream the subscriber actor is registered on by the ActorSystem ctor;
+    // the bare-stream ctors only store it.
+    private readonly EventStream _eventStream;
+    // Latched per-site stalled flag: written by Apply, copied out by Snapshot().
+    private readonly ConcurrentDictionary<string, bool> _state = new();
+    // Internal subscriber actor; null when constructed from a bare EventStream.
+    private readonly IActorRef? _subscriber;
+    // Optional sink that receives every change Apply records.
+    private readonly AuditCentralHealthSnapshot? _snapshot;
+    // Set on the first Dispose call so later calls return immediately.
+    private bool _disposed;
+
+    /// <summary>
+    /// Construct around a bare <see cref="EventStream"/>. Intended for unit
+    /// tests where the caller wants to publish events without standing up an
+    /// actor system — the tracker registers a transient subscriber actor only
+    /// if the supplied stream is backed by an actor system. In the bare-stream
+    /// mode (no actor system) the tracker still exposes the
+    /// <see cref="Snapshot"/> surface but cannot self-subscribe; production
+    /// callers always go through <see cref="SiteAuditTelemetryStalledTracker(ActorSystem)"/>.
+    /// </summary>
+    /// <remarks>
+    /// Subscribing to <see cref="EventStream"/> requires an <see cref="IActorRef"/>,
+    /// which can only be created from an <see cref="ActorSystem"/>. The bare-
+    /// stream ctor therefore can NOT itself wire the subscriber — tests that
+    /// want event-driven updates must use the ActorSystem ctor (or push state
+    /// directly via <see cref="Apply"/>). The tests in
+    /// <c>SiteAuditTelemetryStalledTrackerTests</c> use the ActorSystem ctor
+    /// via Akka.TestKit so they exercise the production subscribe path.
+    /// </remarks>
+    /// <param name="eventStream">The actor system event stream to observe.</param>
+    public SiteAuditTelemetryStalledTracker(EventStream eventStream)
+        : this(eventStream, null)
+    {
+        // Delegates to the two-argument bare-stream ctor with no snapshot sink.
+    }
+
+    /// <summary>
+    /// Bare-stream ctor with an optional snapshot sink — the central
+    /// composition root passes the singleton
+    /// <see cref="AuditCentralHealthSnapshot"/> so every dictionary update
+    /// also lands on the central health surface. The bare ctor still cannot
+    /// subscribe (no actor system), but tests that drive the tracker via
+    /// <see cref="Apply"/> get the snapshot push for free.
+    /// </summary>
+    /// <param name="eventStream">The actor system event stream to observe.</param>
+    /// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
+    public SiteAuditTelemetryStalledTracker(EventStream eventStream, AuditCentralHealthSnapshot? snapshot)
+    {
+        _eventStream = eventStream ?? throw new ArgumentNullException(nameof(eventStream));
+        // No subscriber actor — creating one requires an ActorSystem, which
+        // this overload does not receive. State can still be pushed through
+        // Apply directly.
+        _subscriber = null;
+        _snapshot = snapshot;
+    }
+
+    /// <summary>
+    /// Production ctor: subscribes a small internal actor to the supplied
+    /// system's EventStream so every published
+    /// <see cref="SiteAuditTelemetryStalledChanged"/> updates the latched
+    /// per-site map. <see cref="Dispose"/> tears the subscriber down.
+    /// </summary>
+    /// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
+    public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem)
+        : this(actorSystem, null)
+    {
+        // Delegates to the two-argument ActorSystem ctor with no snapshot sink.
+    }
+
+    /// <summary>
+    /// Production ctor with a snapshot sink — every observed
+    /// <see cref="SiteAuditTelemetryStalledChanged"/> is mirrored onto the
+    /// shared <see cref="AuditCentralHealthSnapshot"/> so the central health
+    /// surface sees per-site stalled state without re-reading the tracker.
+    /// </summary>
+    /// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
+    /// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
+    public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem, AuditCentralHealthSnapshot? snapshot)
+    {
+        if (actorSystem is null)
+        {
+            throw new ArgumentNullException(nameof(actorSystem));
+        }
+
+        _eventStream = actorSystem.EventStream;
+        _snapshot = snapshot;
+
+        // The subscriber is a system-level actor with a unique generated name;
+        // it calls back into THIS tracker's Apply for every event it receives.
+        var subscriberName = $"site-audit-stalled-tracker-{Guid.NewGuid():N}";
+        var subscriberProps = Props.Create(() => new StalledChangedSubscriber(this));
+        _subscriber = actorSystem.ActorOf(subscriberProps, name: subscriberName);
+
+        // Register the subscription here, synchronously, rather than in the
+        // actor's PreStart: the subscription is then in place before the
+        // constructor returns, so an early publish is not lost.
+        _eventStream.Subscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged));
+    }
+
+    /// <summary>
+    /// Returns a defensive copy of the per-site latched stalled state.
+    /// Absent sites are interpreted as <c>Stalled=false</c> by consumers.
+    /// </summary>
+    public IReadOnlyDictionary<string, bool> Snapshot()
+    {
+        // Copy the live map so later Apply calls do not change what the
+        // caller is holding.
+        var copy = new Dictionary<string, bool>(_state);
+        return copy;
+    }
+
+    /// <summary>
+    /// Applied by the internal subscriber actor on every
+    /// <see cref="SiteAuditTelemetryStalledChanged"/> publication. Exposed
+    /// internally so tests against the bare-stream ctor can still drive the
+    /// tracker, but the production path always goes through the actor.
+    /// </summary>
+    /// <param name="evt">The stalled-state change event to apply.</param>
+    internal void Apply(SiteAuditTelemetryStalledChanged evt)
+    {
+        if (evt is not null)
+        {
+            // Last write wins: the map holds the most recent flag per site.
+            _state[evt.SiteId] = evt.Stalled;
+
+            // The snapshot sink is optional, hence the null-conditional call;
+            // when present it receives the same change.
+            _snapshot?.ApplyStalled(evt);
+        }
+    }
+
+    /// <summary>
+    /// Disposes the tracker and tears down the internal subscriber actor.
+    /// </summary>
+    public void Dispose()
+    {
+        if (_disposed)
+        {
+            return;
+        }
+
+        _disposed = true;
+
+        // Fire-and-forget: the PoisonPill asks the subscriber actor to stop,
+        // and the actor's PostStop performs the EventStream unsubscribe.
+        // In bare-stream mode there is no subscriber and nothing to tear down.
+        _subscriber?.Tell(PoisonPill.Instance);
+    }
+
+    /// <summary>
+    /// Internal subscriber actor — receives every
+    /// <see cref="SiteAuditTelemetryStalledChanged"/> off the EventStream and
+    /// forwards it into the parent <see cref="SiteAuditTelemetryStalledTracker"/>.
+    /// Unlike <c>DeadLetterMonitorActor</c>, the subscription is registered by
+    /// the tracker constructor BEFORE this actor begins processing messages so
+    /// publishes that arrive between actor creation and PreStart cannot be
+    /// missed. Unsubscribe still runs in <see cref="PostStop"/>.
+    /// </summary>
+    private sealed class StalledChangedSubscriber : ReceiveActor
+    {
+        private readonly SiteAuditTelemetryStalledTracker _tracker;
+
+        /// <summary>
+        /// Creates the subscriber and registers its single handler, which
+        /// passes every received change to the owning tracker.
+        /// </summary>
+        /// <param name="parent">Tracker whose <see cref="Apply"/> method receives each event.</param>
+        public StalledChangedSubscriber(SiteAuditTelemetryStalledTracker parent)
+        {
+            _tracker = parent;
+            Receive<SiteAuditTelemetryStalledChanged>(change => _tracker.Apply(change));
+        }
+
+        /// <inheritdoc />
+        protected override void PostStop()
+        {
+            // Remove this actor's subscription before the base teardown runs.
+            var stream = Context.System.EventStream;
+            stream.Unsubscribe(Self, typeof(SiteAuditTelemetryStalledChanged));
+            base.PostStop();
+        }
+    }
+}