refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,165 @@
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Composes the primary <see cref="SqliteAuditWriter"/> with a drop-oldest
+/// <see cref="RingBufferFallback"/>. Audit writes are best-effort by contract
+/// (see <see cref="IAuditWriter"/>) — a primary failure must NEVER bubble out
+/// to the calling script. Failed events are stashed in the ring; on the next
+/// successful primary write the ring is drained back through the primary in
+/// FIFO order.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Each primary failure increments <see cref="IAuditWriteFailureCounter"/> so
+/// Site Health Monitoring can surface a sustained outage as
+/// <c>SiteAuditWriteFailures</c> (Bundle G).
+/// </para>
+/// <para>
+/// A failure raised by the primary while the ring is being drained is logged,
+/// counted, and the affected event is re-enqueued at the tail of the ring; the
+/// drain then stops so we don't loop the failure mode. The trigger event
+/// itself succeeded, and retrying the drain on the NEXT successful write is
+/// the recovery path.
+/// </para>
+/// </remarks>
+public sealed class FallbackAuditWriter : IAuditWriter
+{
+    private readonly IAuditWriter _primary;
+    private readonly RingBufferFallback _ring;
+    private readonly IAuditWriteFailureCounter _failureCounter;
+    private readonly ILogger<FallbackAuditWriter> _logger;
+    private readonly IAuditPayloadFilter _filter;
+
+    // Ensures at most one drain runs at a time; probed without waiting.
+    private readonly SemaphoreSlim _drainGate = new(1, 1);
+
+    /// <summary>
+    /// Bundle C (M5-T6) wires the singleton <see cref="IAuditPayloadFilter"/>
+    /// here so every event written via the site hot path is filtered before
+    /// it hits both the primary SQLite writer AND the ring fallback. The
+    /// parameter is optional so the long tail of test composition roots that
+    /// don't care about the filter need no change; when it is omitted the
+    /// writer falls back to <c>SafeDefaultAuditPayloadFilter.Instance</c>
+    /// rather than to "no filtering" (AuditLog-008). The production
+    /// <see cref="ServiceCollectionExtensions.AddAuditLog"/> registration
+    /// always passes the real filter through.
+    /// </summary>
+    /// <param name="primary">The primary audit writer (typically the SQLite writer).</param>
+    /// <param name="ring">Drop-oldest ring buffer used to stash events when the primary fails.</param>
+    /// <param name="failureCounter">Counter incremented on each primary failure for health reporting.</param>
+    /// <param name="logger">Logger for diagnostics.</param>
+    /// <param name="filter">
+    /// Optional payload filter applied before writing; <see langword="null"/>
+    /// selects <c>SafeDefaultAuditPayloadFilter.Instance</c>.
+    /// </param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="primary"/>, <paramref name="ring"/>,
+    /// <paramref name="failureCounter"/> or <paramref name="logger"/> is <see langword="null"/>.
+    /// </exception>
+    public FallbackAuditWriter(
+        IAuditWriter primary,
+        RingBufferFallback ring,
+        IAuditWriteFailureCounter failureCounter,
+        ILogger<FallbackAuditWriter> logger,
+        IAuditPayloadFilter? filter = null)
+    {
+        _primary = primary ?? throw new ArgumentNullException(nameof(primary));
+        _ring = ring ?? throw new ArgumentNullException(nameof(ring));
+        _failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+        // AuditLog-008: never default to a null filter — over-redact instead.
+        // SafeDefaultAuditPayloadFilter.Instance performs HTTP header
+        // redaction with the hard-coded sensitive defaults (Authorization,
+        // X-Api-Key, Cookie, Set-Cookie) so a test composition root that
+        // doesn't bind the real options never persists those headers
+        // verbatim. The real DefaultAuditPayloadFilter (truncation + body /
+        // SQL-param redaction) is wired by AddAuditLog and takes precedence.
+        _filter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
+    }
+
+    /// <inheritdoc />
+    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
+    {
+        ArgumentNullException.ThrowIfNull(evt);
+
+        // Filter once, up-front. The filtered event flows BOTH to the primary
+        // and (on failure) to the ring buffer — so a primary outage that
+        // drains later still hands the primary a row that has already been
+        // through the filter. The filter contract is "MUST NOT throw".
+        // AuditLog-008: _filter is never null (see the constructor).
+        var filtered = _filter.Apply(evt);
+
+        try
+        {
+            await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
+        }
+        catch (Exception ex)
+        {
+            // Primary down: record the failure, stash in the ring, return
+            // success to the caller. Audit-write failures NEVER abort the
+            // user-facing action (alog.md §7). DO NOT attempt the ring drain
+            // here — primary is throwing, draining would just scramble FIFO
+            // order across re-enqueues.
+            _failureCounter.Increment();
+            _logger.LogWarning(ex,
+                "Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
+                filtered.EventId);
+            // Ring stores the filtered copy so the eventual drain replays a
+            // payload that has already been filtered — no second filter pass
+            // is needed on recovery, and the ring never holds the raw event.
+            _ring.TryEnqueue(filtered);
+            return;
+        }
+
+        // Primary succeeded — opportunistically drain anything that piled up
+        // in the ring during the outage. Best-effort: a failure during the
+        // drain re-enqueues the popped event and is logged; the next
+        // successful write will retry. Drain order in the audit log is
+        // therefore: <triggering event>, <backlog FIFO>.
+        if (_ring.Count > 0)
+        {
+            await TryDrainRingAsync(ct).ConfigureAwait(false);
+        }
+    }
+
+    /// <summary>
+    /// Replays the events currently buffered in the ring through the primary
+    /// writer. Returns immediately when another drain is already in progress
+    /// and stops at the first primary failure or once <paramref name="ct"/>
+    /// is cancelled.
+    /// </summary>
+    /// <param name="ct">Token forwarded to the primary writer for each replayed event.</param>
+    private async Task TryDrainRingAsync(CancellationToken ct)
+    {
+        // Serialise drains so two concurrent recoveries don't double-replay.
+        // The gate is probed with a zero timeout and WITHOUT the caller's
+        // token: SemaphoreSlim.WaitAsync(0, ct) throws when ct is already
+        // cancelled, which would leak an exception out of WriteAsync after
+        // the primary write had already succeeded.
+        if (!_drainGate.Wait(0))
+        {
+            return;
+        }
+
+        try
+        {
+            // Pull only what is currently buffered; do NOT wait for new events.
+            // We iterate with a snapshot of Count so we never starve under
+            // concurrent enqueues.
+            var pending = _ring.Count;
+            for (var i = 0; i < pending; i++)
+            {
+                // Leave the backlog untouched once the caller has cancelled:
+                // dequeuing now would most likely end in a cancelled primary
+                // write, a spurious failure count and the event being moved
+                // to the tail. The next successful write retries the drain.
+                if (ct.IsCancellationRequested)
+                {
+                    break;
+                }
+
+                if (!_ring.TryDequeue(out var queued))
+                {
+                    break;
+                }
+
+                try
+                {
+                    await _primary.WriteAsync(queued, ct).ConfigureAwait(false);
+                }
+                catch (Exception ex)
+                {
+                    // Primary fell over again. The ring exposes no way to put
+                    // the event back at the head, so route it to the tail
+                    // (drop-oldest preserves the most-recent picture) and
+                    // stop draining.
+                    _failureCounter.Increment();
+                    _logger.LogWarning(ex,
+                        "Ring drain re-throw on EventId {EventId}; re-enqueuing.",
+                        queued.EventId);
+                    _ring.TryEnqueue(queued);
+                    break;
+                }
+            }
+        }
+        finally
+        {
+            _drainGate.Release();
+        }
+    }
+}
@@ -0,0 +1,50 @@
+using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
+using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Audit Log (#23) M5 Bundle C — adapter that forwards
+/// <see cref="IAuditRedactionFailureCounter"/> increments to
+/// <see cref="ISiteHealthCollector"/>. The counter is bumped by
+/// <see cref="DefaultAuditPayloadFilter"/> whenever a header / body / SQL
+/// parameter redactor stage throws and the filter has to over-redact the
+/// offending field; routing it through the collector makes the count show up
+/// in the site health report payload as
+/// <c>SiteHealthReport.AuditRedactionFailure</c>.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Wired up by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>.
+/// <c>AddHealthMonitoring()</c> has to be registered beforehand, otherwise
+/// <see cref="ISiteHealthCollector"/> cannot be resolved. Nodes without Site
+/// Health Monitoring keep the <see cref="NoOpAuditRedactionFailureCounter"/>
+/// that the default <see cref="ServiceCollectionExtensions.AddAuditLog"/>
+/// registration installs (the silent-sink contract — redaction failures must
+/// NEVER abort the user-facing action, alog.md §7).
+/// </para>
+/// <para>
+/// Deliberately shaped exactly like the M2 Bundle G
+/// <see cref="HealthMetricsAuditWriteFailureCounter"/> so the two
+/// health-metric bridges age together.
+/// </para>
+/// <para>
+/// Site-side only for M5. The redaction filter also runs on the central
+/// writers (CentralAuditWriter + AuditLogIngestActor), but the central
+/// health-metric surface that would expose <c>AuditRedactionFailure</c>
+/// next to the existing central counters ships in M6. Until then the central
+/// composition root stays on the NoOp default: redactions still happen, they
+/// are simply not counted into a health report.
+/// </para>
+/// </remarks>
+public sealed class HealthMetricsAuditRedactionFailureCounter : IAuditRedactionFailureCounter
+{
+    private readonly ISiteHealthCollector _collector;
+
+    /// <summary>Creates the bridge over the supplied site health collector.</summary>
+    /// <param name="collector">Collector whose redaction-failure count is incremented by this bridge.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="collector"/> is <see langword="null"/>.</exception>
+    public HealthMetricsAuditRedactionFailureCounter(ISiteHealthCollector collector)
+    {
+        ArgumentNullException.ThrowIfNull(collector);
+        _collector = collector;
+    }
+
+    /// <inheritdoc/>
+    public void Increment()
+    {
+        _collector.IncrementAuditRedactionFailure();
+    }
+}
@@ -0,0 +1,37 @@
+using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Audit Log (#23) M2 Bundle G — bridges <see cref="IAuditWriteFailureCounter"/>
+/// (incremented by <see cref="FallbackAuditWriter"/> every time the primary
+/// SQLite writer throws) into <see cref="ISiteHealthCollector"/> so the count
+/// surfaces in the site health report payload as
+/// <c>SiteHealthReport.SiteAuditWriteFailures</c>.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Registered by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>;
+/// callers must register <c>AddHealthMonitoring()</c> first so
+/// <see cref="ISiteHealthCollector"/> resolves. The default
+/// <see cref="ServiceCollectionExtensions.AddAuditLog"/> registration keeps
+/// <see cref="NoOpAuditWriteFailureCounter"/> for nodes where Site Health
+/// Monitoring is not wired (the silent-sink contract — audit write failures
+/// must NEVER abort the user-facing action, alog.md §7).
+/// </para>
+/// </remarks>
+public sealed class HealthMetricsAuditWriteFailureCounter : IAuditWriteFailureCounter
+{
+    private readonly ISiteHealthCollector _collector;
+
+    /// <summary>
+    /// Initializes a new <see cref="HealthMetricsAuditWriteFailureCounter"/> backed by the given health collector.
+    /// </summary>
+    /// <param name="collector">The site health collector to increment on each audit write failure.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="collector"/> is <see langword="null"/>.</exception>
+    public HealthMetricsAuditWriteFailureCounter(ISiteHealthCollector collector)
+    {
+        _collector = collector ?? throw new ArgumentNullException(nameof(collector));
+    }
+
+    /// <inheritdoc/>
+    public void Increment() => _collector.IncrementSiteAuditWriteFailures();
+}
@@ -0,0 +1,14 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Lightweight counter sink invoked by <see cref="FallbackAuditWriter"/> every
+/// time the primary audit writer throws on an audit write (both on the direct
+/// write and while replaying the ring backlog). Bundle G (M2-T11) bridges the
+/// count into the Site Health Monitoring report payload as
+/// <c>SiteAuditWriteFailures</c>.
+/// </summary>
+/// <remarks>
+/// NOTE(review): implementations are presumably expected to be thread-safe and
+/// non-throwing, since the counter is called from the audit hot path — confirm
+/// against the concrete collector behind the health-metrics bridge.
+/// </remarks>
+public interface IAuditWriteFailureCounter
+{
+    /// <summary>Increment the audit-write failure counter by one.</summary>
+    void Increment();
+}
@@ -0,0 +1,25 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Silent-sink <see cref="IAuditWriteFailureCounter"/>. This is the binding
+/// that <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.ServiceCollectionExtensions.AddAuditLog"/>
+/// registers by default on every node; Bundle G swaps it for a real counter
+/// that feeds the Site Health Monitoring report payload as
+/// <c>SiteAuditWriteFailures</c>. Having a do-nothing default means
+/// <see cref="FallbackAuditWriter"/> always has a collaborator to call instead
+/// of NRE-ing on a null one.
+/// </summary>
+/// <remarks>
+/// The counter is best-effort by contract because audit-write failures must
+/// NEVER abort the user-facing action (alog.md §7). Discarding the increment
+/// is therefore the correct safe fallback wherever the health metric is not
+/// wired in.
+/// </remarks>
+public sealed class NoOpAuditWriteFailureCounter : IAuditWriteFailureCounter
+{
+    /// <inheritdoc/>
+    public void Increment()
+    {
+        // Nothing to do on purpose: the increment is discarded until Bundle G
+        // rebinds the interface to the Site Health Monitoring bridge.
+    }
+}
@@ -0,0 +1,122 @@
+using System.Runtime.CompilerServices;
+using System.Threading.Channels;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Drop-oldest in-memory ring buffer used by <see cref="FallbackAuditWriter"/>
+/// when the primary SQLite writer is throwing. Capacity is fixed at construction
+/// (default 1024). When full, the oldest event is silently dropped to make room
+/// for the newest — preserving the most recent picture of activity in the face
+/// of an extended SQLite outage — and <see cref="RingBufferOverflowed"/> is
+/// raised so a health counter can record the loss.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Backed by a <see cref="Channel{T}"/> with
+/// <see cref="BoundedChannelFullMode.DropOldest"/>. Overflow is detected through
+/// the channel's own item-dropped callback, so
+/// <see cref="RingBufferOverflowed"/> fires exactly once per displaced event
+/// even when several writers enqueue concurrently or a drain is running — a
+/// separate read of <c>Reader.Count</c> around the write could not guarantee
+/// that.
+/// </para>
+/// <para>
+/// Per the M2 plan: the ring is the absolute-last-resort buffer for the
+/// hot-path; it is NOT a substitute for the bounded
+/// <see cref="SqliteAuditWriter"/> write queue.
+/// </para>
+/// </remarks>
+public sealed class RingBufferFallback
+{
+    private readonly Channel<AuditEvent> _channel;
+
+    /// <summary>
+    /// Raised once each time a drop-oldest overflow occurs. Handlers run
+    /// synchronously on the thread that called <see cref="TryEnqueue"/>, so
+    /// they should be cheap and must not throw.
+    /// </summary>
+    public event Action? RingBufferOverflowed;
+
+    /// <summary>Initializes the ring buffer with the specified fixed capacity.</summary>
+    /// <param name="capacity">Maximum number of events to buffer; must be greater than zero. Default is 1024.</param>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="capacity"/> is zero or negative.</exception>
+    public RingBufferFallback(int capacity = 1024)
+    {
+        if (capacity <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(capacity), "capacity must be > 0.");
+        }
+
+        var options = new BoundedChannelOptions(capacity)
+        {
+            FullMode = BoundedChannelFullMode.DropOldest,
+            SingleReader = true,
+            SingleWriter = false,
+        };
+
+        // The callback receives the displaced event; only the fact that a
+        // drop happened matters here, so the item itself is discarded.
+        _channel = Channel.CreateBounded<AuditEvent>(
+            options,
+            itemDropped: _ => RingBufferOverflowed?.Invoke());
+    }
+
+    /// <summary>Current event count in the ring (for diagnostics/tests).</summary>
+    public int Count => _channel.Reader.Count;
+
+    /// <summary>
+    /// Try to enqueue an event. Returns <see langword="true"/> on success (even
+    /// when an overflow caused an older event to be dropped); returns
+    /// <see langword="false"/> only when the ring has been
+    /// <see cref="Complete"/>-d.
+    /// </summary>
+    /// <param name="evt">The audit event to enqueue.</param>
+    /// <returns><see langword="true"/> if enqueued (or enqueued with overflow); <see langword="false"/> when the channel is completed.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="evt"/> is <see langword="null"/>.</exception>
+    public bool TryEnqueue(AuditEvent evt)
+    {
+        ArgumentNullException.ThrowIfNull(evt);
+
+        // DropOldest TryWrite always succeeds unless the channel is completed.
+        // If the ring was full, the channel evicts the oldest event and
+        // invokes the item-dropped callback registered in the constructor,
+        // which raises RingBufferOverflowed before this call returns.
+        return _channel.Writer.TryWrite(evt);
+    }
+
+    /// <summary>
+    /// Drain the ring in FIFO order. Yields available events immediately and
+    /// then completes when the channel is empty AND <see cref="Complete"/> has
+    /// been called. Callers that only want to drain what's currently buffered
+    /// must call <see cref="Complete"/> first.
+    /// </summary>
+    /// <param name="cancellationToken">Cancellation token to abort the async enumeration.</param>
+    /// <returns>An asynchronous sequence of the buffered events, oldest first.</returns>
+    public async IAsyncEnumerable<AuditEvent> DrainAsync(
+        [EnumeratorCancellation] CancellationToken cancellationToken)
+    {
+        await foreach (var evt in _channel.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
+        {
+            yield return evt;
+        }
+    }
+
+    /// <summary>
+    /// Non-blocking single-item dequeue used by the
+    /// <see cref="FallbackAuditWriter"/> recovery path. Returns
+    /// <see langword="false"/> when the ring is empty.
+    /// </summary>
+    /// <param name="evt">When this returns <see langword="true"/>, contains the dequeued event.</param>
+    /// <returns><see langword="true"/> if an event was dequeued; <see langword="false"/> if the ring is empty.</returns>
+    public bool TryDequeue(out AuditEvent evt) => _channel.Reader.TryRead(out evt!);
+
+    /// <summary>
+    /// Mark the ring as no-more-writes. <see cref="DrainAsync"/> will yield the
+    /// remaining events and then complete.
+    /// </summary>
+    public void Complete() => _channel.Writer.TryComplete();
+}
@@ -0,0 +1,138 @@
+using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that
+/// periodically pulls a backlog snapshot from <see cref="ISiteAuditQueue"/>
+/// and pushes it into <see cref="ISiteHealthCollector"/> so the next
+/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
+/// <c>SiteAuditBacklog</c> field on the site health report.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Why a hosted service, not the report sender.</b> Querying SQLite for the
+/// backlog requires the queue's write lock; doing it inline in
+/// <see cref="ISiteHealthCollector.CollectReport"/> would couple the collector
+/// to <see cref="ISiteAuditQueue"/> and turn an in-memory snapshot read into
+/// a synchronous I/O call on the report path. The hosted-service pattern keeps
+/// the report path pure and the SQL probe off the report timing budget.
+/// </para>
+/// <para>
+/// <b>Cadence.</b> 30 s by default — coarse enough to amortise the SQL probe
+/// across many reports, fine enough that the central dashboard never lags by
+/// more than one health-report interval. Tunable via
+/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.SqliteAuditWriterOptions"/> in a follow-up
+/// if ops needs a different cadence; for M6 we hard-code the value because the
+/// brief calls it out explicitly.
+/// </para>
+/// <para>
+/// <b>Failure containment.</b> The probe call is wrapped in a try/catch so a
+/// transient SQLite error never tears down the hosted service — the next tick
+/// retries. Cancellation of the loop token ends the loop quietly, so
+/// <see cref="StopAsync"/> does not surface a cancelled task to the host.
+/// Mirrors <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPartitionMaintenanceService"/>'s
+/// "exception logged, not propagated" contract.
+/// </para>
+/// </remarks>
+public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
+{
+    /// <summary>
+    /// Default poll cadence. Half a typical 60 s health-report interval keeps
+    /// the snapshot fresh without spinning the SQL probe more often than
+    /// necessary.
+    /// </summary>
+    internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
+
+    private readonly ISiteAuditQueue _queue;
+    private readonly ISiteHealthCollector _collector;
+    private readonly ILogger<SiteAuditBacklogReporter> _logger;
+    private readonly TimeSpan _refreshInterval;
+    private CancellationTokenSource? _cts;
+    private Task? _loop;
+
+    /// <summary>Initializes a new instance of <see cref="SiteAuditBacklogReporter"/>.</summary>
+    /// <param name="queue">The site audit queue used to probe the backlog count.</param>
+    /// <param name="collector">The site health collector that receives the backlog snapshot.</param>
+    /// <param name="logger">Logger instance.</param>
+    /// <param name="refreshInterval">Poll interval override; defaults to <see cref="DefaultRefreshInterval"/> (30 s).</param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="queue"/>, <paramref name="collector"/> or <paramref name="logger"/> is <see langword="null"/>.
+    /// </exception>
+    public SiteAuditBacklogReporter(
+        ISiteAuditQueue queue,
+        ISiteHealthCollector collector,
+        ILogger<SiteAuditBacklogReporter> logger,
+        TimeSpan? refreshInterval = null)
+    {
+        _queue = queue ?? throw new ArgumentNullException(nameof(queue));
+        _collector = collector ?? throw new ArgumentNullException(nameof(collector));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+        _refreshInterval = refreshInterval ?? DefaultRefreshInterval;
+    }
+
+    /// <inheritdoc />
+    public Task StartAsync(CancellationToken ct)
+    {
+        // Linked CTS: the loop stops when either the token handed to
+        // StartAsync fires or this instance cancels its own source
+        // (StopAsync / Dispose); either one aborts the pending Task.Delay.
+        _cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+
+        // Capture the token now so the loop never re-reads the mutable field.
+        var loopToken = _cts.Token;
+        _loop = Task.Run(() => RunLoopAsync(loopToken));
+        return Task.CompletedTask;
+    }
+
+    /// <summary>
+    /// Probes once immediately and then once per refresh interval until
+    /// <paramref name="ct"/> is cancelled. Cancellation ends the loop without
+    /// faulting or cancelling the returned task.
+    /// </summary>
+    /// <param name="ct">Token that terminates the loop.</param>
+    private async Task RunLoopAsync(CancellationToken ct)
+    {
+        try
+        {
+            // First tick runs immediately so the very first health report
+            // after process start carries a real backlog snapshot — without
+            // this the dashboard would show null for the first 30 s after a
+            // deploy.
+            await SafeProbeAsync(ct).ConfigureAwait(false);
+
+            while (!ct.IsCancellationRequested)
+            {
+                await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
+                await SafeProbeAsync(ct).ConfigureAwait(false);
+            }
+        }
+        catch (OperationCanceledException) when (ct.IsCancellationRequested)
+        {
+            // Shutdown requested while delaying or probing: exit cleanly so
+            // the loop task completes successfully instead of as cancelled.
+        }
+    }
+
+    /// <summary>
+    /// Runs one backlog probe and publishes the snapshot to the collector.
+    /// Every failure except cancellation of <paramref name="ct"/> is logged
+    /// and swallowed.
+    /// </summary>
+    /// <param name="ct">Loop token forwarded to the queue probe.</param>
+    private async Task SafeProbeAsync(CancellationToken ct)
+    {
+        try
+        {
+            var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false);
+            _collector.UpdateSiteAuditBacklog(snapshot);
+        }
+        catch (OperationCanceledException) when (ct.IsCancellationRequested)
+        {
+            // Shutdown — propagate so the outer loop can exit. A cancellation
+            // NOT caused by our token (e.g. an internal timeout) is treated as
+            // an ordinary probe failure below so it cannot kill the loop.
+            throw;
+        }
+        catch (Exception ex)
+        {
+            // Catch-all is deliberate: the hosted service must survive every
+            // class of probe failure (transient SQLite lock contention, disk
+            // I/O hiccup, …) so the next tick gets a chance.
+            _logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry.");
+        }
+    }
+
+    /// <inheritdoc />
+    public async Task StopAsync(CancellationToken ct)
+    {
+        _cts?.Cancel();
+
+        var loop = _loop;
+        if (loop is null)
+        {
+            // StartAsync was never called; nothing to wait for.
+            return;
+        }
+
+        try
+        {
+            // Wait for the loop to wind down, but honour the host's shutdown
+            // budget: if ct fires first we stop waiting rather than hang.
+            await loop.WaitAsync(ct).ConfigureAwait(false);
+        }
+        catch (OperationCanceledException) when (ct.IsCancellationRequested)
+        {
+            // Shutdown budget exhausted; the loop token is already cancelled
+            // and the loop will finish on its own.
+        }
+    }
+
+    /// <inheritdoc />
+    public void Dispose()
+    {
+        // Take ownership of the source exactly once so repeated Dispose calls
+        // (and a StopAsync after Dispose) are harmless.
+        var cts = Interlocked.Exchange(ref _cts, null);
+        if (cts is null)
+        {
+            return;
+        }
+
+        // Cancel before disposing so a reporter disposed without StopAsync
+        // does not leave its polling loop running in the background.
+        cts.Cancel();
+        cts.Dispose();
+    }
+}
@@ -0,0 +1,913 @@
+using System.Threading.Channels;
+using Microsoft.Data.Sqlite;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Site-side SQLite hot-path writer for Audit Log (#23) events. Mirrors the
+/// <see cref="ZB.MOM.WW.ScadaBridge.SiteEventLogging.SiteEventLogger"/> design — a single
+/// owned <see cref="SqliteConnection"/> serialised behind a write lock, fed by a
+/// bounded <see cref="Channel{T}"/> drained on a dedicated background writer
+/// task — so script-thread callers never block on disk I/O.
+/// </summary>
+/// <remarks>
+/// <para>
+/// The schema is bootstrapped in the constructor (Bundle B-T1). The
+/// Channel-based <see cref="WriteAsync"/> hot-path + Bundle D
+/// <see cref="ReadPendingAsync"/> / <see cref="MarkForwardedAsync"/> support
+/// surface are wired in Bundle B-T2.
+/// </para>
+/// <para>
+/// Site rows always carry <see cref="AuditForwardState.Pending"/> on first
+/// insert; the central row-shape's <c>IngestedAtUtc</c> column does NOT live in
+/// the site SQLite schema — central stamps it on ingest.
+/// </para>
+/// </remarks>
+public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable, IDisposable
+{
+    // Microsoft.Data.Sqlite reports a generic SQLITE_CONSTRAINT (error code 19)
+    // on a PRIMARY KEY violation; the extended subcode 1555 (SQLITE_CONSTRAINT_PRIMARYKEY)
+    // is exposed via SqliteException.SqliteExtendedErrorCode but isn't reliably
+    // surfaced across all SQLite builds. We treat any constraint error on insert
+    // as a duplicate-EventId race and swallow it (first-write-wins). The table
+    // also declares NOT NULL columns, but FlushBatch always binds those to
+    // non-null values, so the PRIMARY KEY on EventId is the only constraint an
+    // insert can violate in practice and this scope stays precise.
+    private const int SqliteErrorConstraint = 19;
+
+    private readonly SqliteConnection _connection;
+    // AuditLog-005: dedicated read-only connection used by GetBacklogStatsAsync,
+    // ReadPendingAsync, ReadPendingSinceAsync, and ReadForwardedAsync so a slow
+    // backlog scan (COUNT(*) over hundreds of thousands of Pending rows under a
+    // central outage) never parks the hot-path writer behind _writeLock.
+    // SQLite-with-WAL allows a second connection on the same file to read
+    // concurrently with the writer; the writer's WAL pragma is set in
+    // InitializeSchema before this connection is opened. The reader connection
+    // has its own _readLock because SqliteConnection itself is not thread-safe
+    // even in read-only mode — multiple read callers can otherwise interleave
+    // commands on the shared connection.
+    private readonly SqliteConnection _readConnection;
+    private readonly object _readLock = new();
+    private readonly SqliteAuditWriterOptions _options;
+    private readonly ILogger<SqliteAuditWriter> _logger;
+    private readonly INodeIdentityProvider _nodeIdentity;
+    private readonly object _writeLock = new();
+    private readonly Channel<PendingAuditEvent> _writeQueue;
+    private readonly Task _writerLoop;
+    private bool _disposed;
+
+    /// <summary>
+    /// Initializes a new instance of the SqliteAuditWriter class: opens the
+    /// writer connection, bootstraps the schema, opens the read connection,
+    /// creates the bounded write queue and starts the background writer loop.
+    /// </summary>
+    /// <param name="options">Configuration options for the audit writer.</param>
+    /// <param name="logger">Logger instance.</param>
+    /// <param name="nodeIdentity">Node identity provider.</param>
+    /// <param name="connectionStringOverride">Optional connection string override.</param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="options"/>, <paramref name="logger"/> or
+    /// <paramref name="nodeIdentity"/> is null.
+    /// </exception>
+    public SqliteAuditWriter(
+        IOptions<SqliteAuditWriterOptions> options,
+        ILogger<SqliteAuditWriter> logger,
+        INodeIdentityProvider nodeIdentity,
+        string? connectionStringOverride = null)
+    {
+        ArgumentNullException.ThrowIfNull(options);
+        ArgumentNullException.ThrowIfNull(logger);
+        ArgumentNullException.ThrowIfNull(nodeIdentity);
+
+        _options = options.Value;
+        _logger = logger;
+        _nodeIdentity = nodeIdentity;
+
+        // NOTE(review): the default string combines Cache=Shared with the WAL
+        // journal mode enabled in InitializeSchema; the Microsoft.Data.Sqlite
+        // documentation discourages mixing the two — confirm this is intended.
+        var connectionString = connectionStringOverride
+            ?? $"Data Source={_options.DatabasePath};Cache=Shared";
+
+        // Track both connections in locals so that a failure anywhere during
+        // bootstrap (Open, schema DDL, channel creation) releases whatever was
+        // already opened. No Dispose can run for a half-built instance, so
+        // without this the native handles (and the file lock) would leak.
+        SqliteConnection? writeConnection = null;
+        SqliteConnection? readConnection = null;
+        try
+        {
+            writeConnection = new SqliteConnection(connectionString);
+            writeConnection.Open();
+            _connection = writeConnection;
+
+            InitializeSchema();
+
+            // AuditLog-005: open a second connection for read-only callers
+            // (GetBacklogStatsAsync, ReadPendingAsync, ReadPendingSinceAsync,
+            // ReadForwardedAsync). InitializeSchema has already requested WAL
+            // on the writer connection, so these reads do not need _writeLock.
+            // The same connection string is reused so both connections see the
+            // same Data Source / Cache settings.
+            readConnection = new SqliteConnection(connectionString);
+            readConnection.Open();
+            _readConnection = readConnection;
+
+            _writeQueue = Channel.CreateBounded<PendingAuditEvent>(
+                new BoundedChannelOptions(_options.ChannelCapacity)
+                {
+                    // The hot-path enqueue must back-pressure if the background
+                    // writer falls behind; a higher-level fallback (Bundle B-T4)
+                    // handles truly catastrophic primary failure with a
+                    // drop-oldest ring buffer.
+                    FullMode = BoundedChannelFullMode.Wait,
+                    SingleReader = true,
+                    SingleWriter = false,
+                });
+        }
+        catch
+        {
+            readConnection?.Dispose();
+            writeConnection?.Dispose();
+            throw;
+        }
+
+        // Started last so the loop never observes a partially built writer.
+        _writerLoop = Task.Run(ProcessWriteQueueAsync);
+    }
+
+    /// <summary>
+    /// Bootstraps the site audit database on the writer connection: applies
+    /// the pragmas, creates the <c>AuditLog</c> table and its index when
+    /// absent, then upgrades older files by adding any missing columns.
+    /// </summary>
+    private void InitializeSchema()
+    {
+        // Pragmas, in order:
+        //  1. auto_vacuum has to be set before the first table exists to take
+        //     effect on a fresh database; INCREMENTAL lets a later
+        //     `PRAGMA incremental_vacuum` shrink the file after the 7-day
+        //     retention purge — see alog.md §10.
+        //  2. AuditLog-005: WAL lets the second (read) connection serve
+        //     GetBacklogStatsAsync / ReadPendingAsync / ReadPendingSinceAsync /
+        //     ReadForwardedAsync alongside the batched writer without taking
+        //     _writeLock. The mode is stored in the database file, so later
+        //     connections pick it up. The mode actually adopted ("memory" for
+        //     in-memory databases, "wal" for file-backed ones) is not checked:
+        //     the read connection stays correct without WAL and only loses the
+        //     concurrency benefit.
+        var pragmas = new[]
+        {
+            "PRAGMA auto_vacuum = INCREMENTAL",
+            "PRAGMA journal_mode = WAL",
+        };
+        foreach (var pragma in pragmas)
+        {
+            using var pragmaCommand = _connection.CreateCommand();
+            pragmaCommand.CommandText = pragma;
+            pragmaCommand.ExecuteNonQuery();
+        }
+
+        using (var createCommand = _connection.CreateCommand())
+        {
+            createCommand.CommandText = """
+                CREATE TABLE IF NOT EXISTS AuditLog (
+                    EventId            TEXT    NOT NULL,
+                    OccurredAtUtc      TEXT    NOT NULL,
+                    Channel            TEXT    NOT NULL,
+                    Kind               TEXT    NOT NULL,
+                    CorrelationId      TEXT    NULL,
+                    SourceSiteId       TEXT    NULL,
+                    SourceNode         TEXT    NULL,
+                    SourceInstanceId   TEXT    NULL,
+                    SourceScript       TEXT    NULL,
+                    Actor              TEXT    NULL,
+                    Target             TEXT    NULL,
+                    Status             TEXT    NOT NULL,
+                    HttpStatus         INTEGER NULL,
+                    DurationMs         INTEGER NULL,
+                    ErrorMessage       TEXT    NULL,
+                    ErrorDetail        TEXT    NULL,
+                    RequestSummary     TEXT    NULL,
+                    ResponseSummary    TEXT    NULL,
+                    PayloadTruncated   INTEGER NOT NULL,
+                    Extra              TEXT    NULL,
+                    ForwardState       TEXT    NOT NULL,
+                    ExecutionId        TEXT    NULL,
+                    ParentExecutionId  TEXT    NULL,
+                    PRIMARY KEY (EventId)
+                );
+                CREATE INDEX IF NOT EXISTS IX_SiteAuditLog_ForwardState_Occurred
+                    ON AuditLog (ForwardState, OccurredAtUtc);
+                """;
+            createCommand.ExecuteNonQuery();
+        }
+
+        // Additive, idempotent upgrades for an auditlog.db created by an older
+        // build. CREATE TABLE IF NOT EXISTS never adds columns to a table that
+        // already exists, and the file survives restart/failover by design
+        // (7-day retention), so without these ALTERs the insert would bind
+        // against a missing column and the best-effort write path would
+        // silently drop every site audit row. Each column is nullable with no
+        // default, so rows written before the upgrade read back null. Columns
+        // are listed in the order they were introduced: ExecutionId (Audit
+        // Log #23), ParentExecutionId (Audit Log #23), SourceNode stamping.
+        var upgradeColumns = new[] { "ExecutionId", "ParentExecutionId", "SourceNode" };
+        foreach (var column in upgradeColumns)
+        {
+            AddColumnIfMissing(column, "TEXT NULL");
+        }
+    }
+
+    /// <summary>
+    /// Audit Log #23: adds a column to <c>AuditLog</c> only when the table
+    /// does not already have it (used for <c>ExecutionId</c>,
+    /// <c>ParentExecutionId</c> and <c>SourceNode</c>). SQLite has no
+    /// <c>ADD COLUMN IF NOT EXISTS</c>, so <c>pragma_table_info</c> is queried
+    /// first. Idempotent — safe to run on every <see cref="InitializeSchema"/>.
+    /// Mirrors <c>StoreAndForwardStorage.AddColumnIfMissingAsync</c>; kept
+    /// synchronous to match the rest of this writer's bootstrap DDL.
+    /// </summary>
+    /// <param name="columnName">Name of the column to ensure.</param>
+    /// <param name="columnDefinition">SQL type/constraint text placed after the name.</param>
+    private void AddColumnIfMissing(string columnName, string columnDefinition)
+    {
+        bool alreadyPresent;
+        using (var lookup = _connection.CreateCommand())
+        {
+            lookup.CommandText = "SELECT COUNT(*) FROM pragma_table_info('AuditLog') WHERE name = $name";
+            lookup.Parameters.AddWithValue("$name", columnName);
+            alreadyPresent = Convert.ToInt32(lookup.ExecuteScalar()) > 0;
+        }
+
+        if (!alreadyPresent)
+        {
+            // DDL cannot take parameters. Name and definition come from
+            // constants in this class, never from user input, so
+            // interpolating them is safe.
+            using var ddl = _connection.CreateCommand();
+            ddl.CommandText = $"ALTER TABLE AuditLog ADD COLUMN {columnName} {columnDefinition}";
+            ddl.ExecuteNonQuery();
+        }
+    }
+
+    /// <inheritdoc />
+    public Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
+    {
+        ArgumentNullException.ThrowIfNull(evt);
+
+        // Callers may hand over a bare AuditEvent: site rows always carry a
+        // ForwardState, so a missing one is defaulted to Pending here. An
+        // explicit state supplied by the caller is kept as-is.
+        var stamped = evt;
+        if (stamped.ForwardState is null)
+        {
+            stamped = stamped with { ForwardState = AuditForwardState.Pending };
+        }
+
+        var queued = new PendingAuditEvent(stamped);
+
+        // Fast path: a synchronous TryWrite. It returns false when the channel
+        // is full or has been completed (writer disposed); in both cases the
+        // slow path takes over — it awaits room per the FullMode=Wait policy
+        // and translates a closed channel into a faulted completion.
+        return _writeQueue.Writer.TryWrite(queued)
+            ? queued.Completion.Task
+            : WriteSlowPathAsync(queued, ct);
+    }
+
+    /// <summary>
+    /// Enqueue path used when the synchronous TryWrite did not succeed: waits
+    /// for room in the channel, then awaits the event's completion.
+    /// </summary>
+    /// <param name="pending">The queued event and its completion source.</param>
+    /// <param name="ct">Cancels the wait for channel capacity.</param>
+    private async Task WriteSlowPathAsync(PendingAuditEvent pending, CancellationToken ct)
+    {
+        var channelWriter = _writeQueue.Writer;
+        try
+        {
+            await channelWriter.WriteAsync(pending, ct).ConfigureAwait(false);
+        }
+        catch (ChannelClosedException)
+        {
+            // The channel was completed, so nobody will ever process this
+            // event: fault its completion; the await below surfaces the error.
+            var disposedError = new ObjectDisposedException(
+                nameof(SqliteAuditWriter),
+                "Event could not be recorded: the audit writer has been disposed.");
+            pending.Completion.TrySetException(disposedError);
+        }
+
+        await pending.Completion.Task.ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// Background writer loop: drains the write queue in batches of up to
+    /// <c>BatchSize</c> events and flushes each batch in one transaction.
+    /// Runs until the channel is marked complete.
+    /// </summary>
+    private async Task ProcessWriteQueueAsync()
+    {
+        var batch = new List<PendingAuditEvent>(_options.BatchSize);
+
+        // ReadAllAsync completes when the channel is marked complete (Dispose).
+        await foreach (var first in _writeQueue.Reader.ReadAllAsync().ConfigureAwait(false))
+        {
+            batch.Clear();
+            batch.Add(first);
+
+            // Pull additional ready events up to BatchSize. TryRead is non-
+            // blocking and lets us amortise the transaction overhead across a
+            // burst of concurrent enqueues.
+            while (batch.Count < _options.BatchSize &&
+                   _writeQueue.Reader.TryRead(out var next))
+            {
+                batch.Add(next);
+            }
+
+            try
+            {
+                FlushBatch(batch);
+            }
+            catch (Exception ex)
+            {
+                // This loop is the channel's only reader. If an exception from
+                // the flush escaped, the loop would end and every queued or
+                // future caller would wait forever on a completion nobody
+                // sets. Fault whatever in this batch is still unresolved
+                // (TrySetException is a no-op for completed events) and keep
+                // draining.
+                _logger.LogError(ex,
+                    "SqliteAuditWriter writer loop caught an unexpected flush failure; faulting {Count} pending events",
+                    batch.Count);
+                foreach (var pending in batch)
+                {
+                    pending.Completion.TrySetException(ex);
+                }
+            }
+        }
+    }
+
+    /// <summary>
+    /// Inserts one batch of queued events inside a single transaction on the
+    /// writer connection and then resolves each event's completion.
+    /// </summary>
+    /// <remarks>
+    /// Completions are signalled only after the transaction has committed. If
+    /// any insert or the commit fails, the whole batch is rolled back and
+    /// every event in it is faulted — no caller is told a write succeeded for
+    /// a row that was rolled back. A duplicate <c>EventId</c> is not a
+    /// failure: the constraint error is swallowed (first-write-wins) and the
+    /// event completes successfully with the rest of the batch.
+    /// </remarks>
+    /// <param name="batch">Events to persist, in enqueue order.</param>
+    private void FlushBatch(IReadOnlyList<PendingAuditEvent> batch)
+    {
+        lock (_writeLock)
+        {
+            if (_disposed)
+            {
+                foreach (var pending in batch)
+                {
+                    pending.Completion.TrySetException(
+                        new ObjectDisposedException(nameof(SqliteAuditWriter),
+                            "Event could not be recorded: the audit writer was disposed before the write completed."));
+                }
+                return;
+            }
+
+            SqliteTransaction? transaction = null;
+            try
+            {
+                // Begun inside the try so a failure to open the transaction
+                // faults the batch instead of escaping to the writer loop.
+                transaction = _connection.BeginTransaction();
+
+                using var cmd = _connection.CreateCommand();
+                cmd.Transaction = transaction;
+                cmd.CommandText = """
+                    INSERT INTO AuditLog (
+                        EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
+                        SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
+                        Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
+                        RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
+                        ExecutionId, ParentExecutionId
+                    ) VALUES (
+                        $EventId, $OccurredAtUtc, $Channel, $Kind, $CorrelationId,
+                        $SourceSiteId, $SourceNode, $SourceInstanceId, $SourceScript, $Actor, $Target,
+                        $Status, $HttpStatus, $DurationMs, $ErrorMessage, $ErrorDetail,
+                        $RequestSummary, $ResponseSummary, $PayloadTruncated, $Extra, $ForwardState,
+                        $ExecutionId, $ParentExecutionId
+                    );
+                    """;
+
+                // Parameters are created once and re-bound per row so the
+                // command text and parameter set are reused across the batch.
+                var pEventId = cmd.Parameters.Add("$EventId", SqliteType.Text);
+                var pOccurredAt = cmd.Parameters.Add("$OccurredAtUtc", SqliteType.Text);
+                var pChannel = cmd.Parameters.Add("$Channel", SqliteType.Text);
+                var pKind = cmd.Parameters.Add("$Kind", SqliteType.Text);
+                var pCorrelationId = cmd.Parameters.Add("$CorrelationId", SqliteType.Text);
+                var pSourceSiteId = cmd.Parameters.Add("$SourceSiteId", SqliteType.Text);
+                var pSourceNode = cmd.Parameters.Add("$SourceNode", SqliteType.Text);
+                var pSourceInstanceId = cmd.Parameters.Add("$SourceInstanceId", SqliteType.Text);
+                var pSourceScript = cmd.Parameters.Add("$SourceScript", SqliteType.Text);
+                var pActor = cmd.Parameters.Add("$Actor", SqliteType.Text);
+                var pTarget = cmd.Parameters.Add("$Target", SqliteType.Text);
+                var pStatus = cmd.Parameters.Add("$Status", SqliteType.Text);
+                var pHttpStatus = cmd.Parameters.Add("$HttpStatus", SqliteType.Integer);
+                var pDurationMs = cmd.Parameters.Add("$DurationMs", SqliteType.Integer);
+                var pErrorMessage = cmd.Parameters.Add("$ErrorMessage", SqliteType.Text);
+                var pErrorDetail = cmd.Parameters.Add("$ErrorDetail", SqliteType.Text);
+                var pRequestSummary = cmd.Parameters.Add("$RequestSummary", SqliteType.Text);
+                var pResponseSummary = cmd.Parameters.Add("$ResponseSummary", SqliteType.Text);
+                var pPayloadTruncated = cmd.Parameters.Add("$PayloadTruncated", SqliteType.Integer);
+                var pExtra = cmd.Parameters.Add("$Extra", SqliteType.Text);
+                var pForwardState = cmd.Parameters.Add("$ForwardState", SqliteType.Text);
+                var pExecutionId = cmd.Parameters.Add("$ExecutionId", SqliteType.Text);
+                var pParentExecutionId = cmd.Parameters.Add("$ParentExecutionId", SqliteType.Text);
+
+                foreach (var pending in batch)
+                {
+                    var e = pending.Event;
+                    pEventId.Value = e.EventId.ToString();
+                    pOccurredAt.Value = e.OccurredAtUtc.ToString("o");
+                    pChannel.Value = e.Channel.ToString();
+                    pKind.Value = e.Kind.ToString();
+                    pCorrelationId.Value = (object?)e.CorrelationId?.ToString() ?? DBNull.Value;
+                    pSourceSiteId.Value = (object?)e.SourceSiteId ?? DBNull.Value;
+                    // SourceNode-stamping: caller-provided value wins (preserves
+                    // rows reconciled in from other nodes via the same writer);
+                    // otherwise stamp from the local INodeIdentityProvider. The
+                    // event record itself is NOT mutated — stamping is at write
+                    // time only. If the provider also returns null (unconfigured
+                    // node), the row's SourceNode stays NULL — operators see
+                    // "needs config" via the schema, not a magic fallback string.
+                    var sourceNode = e.SourceNode ?? _nodeIdentity.NodeName;
+                    pSourceNode.Value = (object?)sourceNode ?? DBNull.Value;
+                    pSourceInstanceId.Value = (object?)e.SourceInstanceId ?? DBNull.Value;
+                    pSourceScript.Value = (object?)e.SourceScript ?? DBNull.Value;
+                    pActor.Value = (object?)e.Actor ?? DBNull.Value;
+                    pTarget.Value = (object?)e.Target ?? DBNull.Value;
+                    pStatus.Value = e.Status.ToString();
+                    pHttpStatus.Value = (object?)e.HttpStatus ?? DBNull.Value;
+                    pDurationMs.Value = (object?)e.DurationMs ?? DBNull.Value;
+                    pErrorMessage.Value = (object?)e.ErrorMessage ?? DBNull.Value;
+                    pErrorDetail.Value = (object?)e.ErrorDetail ?? DBNull.Value;
+                    pRequestSummary.Value = (object?)e.RequestSummary ?? DBNull.Value;
+                    pResponseSummary.Value = (object?)e.ResponseSummary ?? DBNull.Value;
+                    pPayloadTruncated.Value = e.PayloadTruncated ? 1 : 0;
+                    pExtra.Value = (object?)e.Extra ?? DBNull.Value;
+                    pForwardState.Value = (e.ForwardState ?? AuditForwardState.Pending).ToString();
+                    pExecutionId.Value = (object?)e.ExecutionId?.ToString() ?? DBNull.Value;
+                    pParentExecutionId.Value = (object?)e.ParentExecutionId?.ToString() ?? DBNull.Value;
+
+                    try
+                    {
+                        cmd.ExecuteNonQuery();
+                    }
+                    catch (SqliteException ex) when (ex.SqliteErrorCode == SqliteErrorConstraint)
+                    {
+                        // Duplicate EventId — first-write-wins (alog.md §11).
+                        // Treat as success: the lifecycle event is durably
+                        // recorded under the first writer's payload. The
+                        // completion is resolved with the rest of the batch
+                        // after the commit.
+                        _logger.LogDebug(ex,
+                            "Duplicate EventId {EventId} swallowed by SqliteAuditWriter",
+                            e.EventId);
+                    }
+                }
+
+                transaction.Commit();
+            }
+            catch (Exception ex)
+            {
+                try
+                {
+                    transaction?.Rollback();
+                }
+                catch (Exception rollbackEx)
+                {
+                    // A failed rollback (e.g. on a broken connection) must not
+                    // mask the original error or leave completions unresolved.
+                    _logger.LogWarning(rollbackEx, "SqliteAuditWriter rollback failed after a batch insert error");
+                }
+
+                _logger.LogError(ex, "SqliteAuditWriter batch insert failed; faulting {Count} pending events", batch.Count);
+                foreach (var pending in batch)
+                {
+                    pending.Completion.TrySetException(ex);
+                }
+                return;
+            }
+            finally
+            {
+                transaction?.Dispose();
+            }
+
+            // The commit succeeded: only now are the rows durable, so only now
+            // may callers be told their write completed.
+            foreach (var pending in batch)
+            {
+                pending.Completion.TrySetResult();
+            }
+        }
+    }
+
+    // AuditLog-001: names of the cached-lifecycle audit kinds. Rows of these
+    // kinds ride the combined-telemetry drain (joined with the operational
+    // tracking row and pushed via IngestCachedTelemetryAsync into the central
+    // dual-write transaction). ReadPendingAsync filters them OUT so the
+    // audit-only drain does not emit them a second time;
+    // ReadPendingCachedTelemetryAsync is the dedicated read surface the
+    // SiteAuditTelemetryActor cached-drain uses. Both queries bind these
+    // entries by index, so the order and count here must stay in step with
+    // their placeholders.
+    private static readonly string[] CachedTelemetryKindNames = new[]
+    {
+        nameof(AuditKind.CachedSubmit),
+        nameof(AuditKind.ApiCallCached),
+        nameof(AuditKind.DbWriteCached),
+        nameof(AuditKind.CachedResolve),
+    };
+
+    /// <inheritdoc />
+    public Task<IReadOnlyList<AuditEvent>> ReadPendingAsync(int limit, CancellationToken ct = default)
+    {
+        if (limit <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
+        }
+
+        // AuditLog-005: the scan runs on the dedicated _readConnection so a
+        // large backlog (e.g. during a central outage) never parks the batched
+        // writer behind _writeLock. _readLock serialises callers on that
+        // connection because SqliteConnection is not thread-safe.
+        // AuditLog-001: the NOT IN ($k0..$k3) filter drops the cached-lifecycle
+        // kinds — those flow through ReadPendingCachedTelemetryAsync and the
+        // combined-telemetry drain. Kind is stored as the enum member's name
+        // (see FlushBatch), so comparing against the constant names matches
+        // the on-disk value exactly.
+        lock (_readLock)
+        {
+            ObjectDisposedException.ThrowIf(_disposed, this);
+
+            using (var query = _readConnection.CreateCommand())
+            {
+                query.CommandText = """
+                    SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
+                           SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
+                           Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
+                           RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
+                           ExecutionId, ParentExecutionId
+                    FROM   AuditLog
+                    WHERE  ForwardState = $pending
+                      AND  Kind NOT IN ($k0, $k1, $k2, $k3)
+                    ORDER  BY OccurredAtUtc ASC, EventId ASC
+                    LIMIT  $limit;
+                    """;
+
+                var parameters = query.Parameters;
+                parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
+                string[] kindPlaceholders = { "$k0", "$k1", "$k2", "$k3" };
+                for (var i = 0; i < kindPlaceholders.Length; i++)
+                {
+                    parameters.AddWithValue(kindPlaceholders[i], CachedTelemetryKindNames[i]);
+                }
+                parameters.AddWithValue("$limit", limit);
+
+                var pendingRows = new List<AuditEvent>(Math.Min(limit, 256));
+                using (var cursor = query.ExecuteReader())
+                {
+                    while (cursor.Read())
+                    {
+                        pendingRows.Add(MapRow(cursor));
+                    }
+                }
+
+                IReadOnlyList<AuditEvent> result = pendingRows;
+                return Task.FromResult(result);
+            }
+        }
+    }
+
+    /// <inheritdoc />
+    public Task<IReadOnlyList<AuditEvent>> ReadPendingCachedTelemetryAsync(
+        int limit, CancellationToken ct = default)
+    {
+        if (limit <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
+        }
+
+        // AuditLog-001: read surface for the cached-call lifecycle drain. Same
+        // shape as ReadPendingAsync, but the filter KEEPS only the four cached
+        // AuditKinds. It uses _readConnection under _readLock, so the
+        // hot-path writer is not contended.
+        lock (_readLock)
+        {
+            ObjectDisposedException.ThrowIf(_disposed, this);
+
+            using (var query = _readConnection.CreateCommand())
+            {
+                query.CommandText = """
+                    SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
+                           SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
+                           Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
+                           RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
+                           ExecutionId, ParentExecutionId
+                    FROM   AuditLog
+                    WHERE  ForwardState = $pending
+                      AND  Kind IN ($k0, $k1, $k2, $k3)
+                    ORDER  BY OccurredAtUtc ASC, EventId ASC
+                    LIMIT  $limit;
+                    """;
+
+                var parameters = query.Parameters;
+                parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
+                string[] kindPlaceholders = { "$k0", "$k1", "$k2", "$k3" };
+                for (var i = 0; i < kindPlaceholders.Length; i++)
+                {
+                    parameters.AddWithValue(kindPlaceholders[i], CachedTelemetryKindNames[i]);
+                }
+                parameters.AddWithValue("$limit", limit);
+
+                var cachedRows = new List<AuditEvent>(Math.Min(limit, 256));
+                using (var cursor = query.ExecuteReader())
+                {
+                    while (cursor.Read())
+                    {
+                        cachedRows.Add(MapRow(cursor));
+                    }
+                }
+
+                IReadOnlyList<AuditEvent> result = cachedRows;
+                return Task.FromResult(result);
+            }
+        }
+    }
+
+    /// <summary>
+    /// Reads at most <paramref name="limit"/> rows whose state is
+    /// <see cref="AuditForwardState.Forwarded"/>, ordered by
+    /// <see cref="AuditEvent.OccurredAtUtc"/> ascending with
+    /// <see cref="AuditEvent.EventId"/> as the deterministic tiebreaker. This
+    /// is the <see cref="AuditForwardState.Forwarded"/> counterpart of
+    /// <see cref="ReadPendingAsync"/>; tests use it to assert that a row
+    /// actually reached <see cref="AuditForwardState.Forwarded"/>, which
+    /// <see cref="ReadPendingSinceAsync"/> cannot show because it also returns
+    /// <see cref="AuditForwardState.Pending"/> rows.
+    /// </summary>
+    /// <param name="limit">Maximum number of rows to return.</param>
+    /// <param name="ct">Cancellation token.</param>
+    public Task<IReadOnlyList<AuditEvent>> ReadForwardedAsync(int limit, CancellationToken ct = default)
+    {
+        if (limit <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
+        }
+
+        // AuditLog-005: like ReadPendingAsync, the query runs on
+        // _readConnection under _readLock and therefore never waits on the
+        // batched writer's _writeLock.
+        lock (_readLock)
+        {
+            ObjectDisposedException.ThrowIf(_disposed, this);
+
+            using (var query = _readConnection.CreateCommand())
+            {
+                query.CommandText = """
+                    SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
+                           SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
+                           Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
+                           RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
+                           ExecutionId, ParentExecutionId
+                    FROM   AuditLog
+                    WHERE  ForwardState = $forwarded
+                    ORDER  BY OccurredAtUtc ASC, EventId ASC
+                    LIMIT  $limit;
+                    """;
+
+                var parameters = query.Parameters;
+                parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
+                parameters.AddWithValue("$limit", limit);
+
+                var forwardedRows = new List<AuditEvent>(Math.Min(limit, 256));
+                using (var cursor = query.ExecuteReader())
+                {
+                    while (cursor.Read())
+                    {
+                        forwardedRows.Add(MapRow(cursor));
+                    }
+                }
+
+                IReadOnlyList<AuditEvent> result = forwardedRows;
+                return Task.FromResult(result);
+            }
+        }
+    }
+
+    /// <inheritdoc />
+    /// <remarks>
+    /// The ids are applied in chunks inside one transaction, so the call stays
+    /// all-or-nothing while no single statement exceeds SQLite's limit on
+    /// bound parameters.
+    /// </remarks>
+    public Task MarkForwardedAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
+    {
+        ArgumentNullException.ThrowIfNull(eventIds);
+        if (eventIds.Count == 0)
+        {
+            return Task.CompletedTask;
+        }
+
+        // SQLite caps the number of host parameters per statement
+        // (SQLITE_MAX_VARIABLE_NUMBER; 999 in older builds). Staying well
+        // below that keeps arbitrarily large id lists from failing with
+        // "too many SQL variables".
+        const int MaxIdsPerStatement = 500;
+
+        lock (_writeLock)
+        {
+            ObjectDisposedException.ThrowIf(_disposed, this);
+
+            // One transaction across all chunks preserves the atomicity the
+            // single-statement form had; disposing it without Commit (on an
+            // exception) rolls every chunk back.
+            using var transaction = _connection.BeginTransaction();
+
+            for (var offset = 0; offset < eventIds.Count; offset += MaxIdsPerStatement)
+            {
+                var chunkSize = Math.Min(MaxIdsPerStatement, eventIds.Count - offset);
+
+                using var cmd = _connection.CreateCommand();
+                cmd.Transaction = transaction;
+
+                // Each id is bound as its own parameter, so no string
+                // concatenation of caller data ever enters the SQL — only the
+                // generated placeholder names do.
+                var sb = new System.Text.StringBuilder();
+                sb.Append("UPDATE AuditLog SET ForwardState = $forwarded WHERE EventId IN (");
+                for (var i = 0; i < chunkSize; i++)
+                {
+                    if (i > 0)
+                    {
+                        sb.Append(',');
+                    }
+
+                    var placeholder = $"$id{i}";
+                    sb.Append(placeholder);
+                    cmd.Parameters.AddWithValue(placeholder, eventIds[offset + i].ToString());
+                }
+                sb.Append(");");
+                cmd.CommandText = sb.ToString();
+                cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
+
+                cmd.ExecuteNonQuery();
+            }
+
+            transaction.Commit();
+            return Task.CompletedTask;
+        }
+    }
+
+    /// <inheritdoc />
+    public Task<IReadOnlyList<AuditEvent>> ReadPendingSinceAsync(
+        DateTime sinceUtc, int batchSize, CancellationToken ct = default)
+    {
+        if (batchSize < 1)
+        {
+            throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be > 0.");
+        }
+
+        // The lower bound is rendered in the UTC ISO-8601 round-trip ("o")
+        // format so it can be compared as text against the stored
+        // OccurredAtUtc values.
+        var sinceText = EnsureUtc(sinceUtc).ToString(
+            "o", System.Globalization.CultureInfo.InvariantCulture);
+
+        // AuditLog-005: this read goes through _readConnection under _readLock
+        // rather than the write connection / _writeLock.
+        lock (_readLock)
+        {
+            ObjectDisposedException.ThrowIf(_disposed, this);
+
+            using var query = _readConnection.CreateCommand();
+            query.CommandText = """
+                SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
+                       SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
+                       Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
+                       RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
+                       ExecutionId, ParentExecutionId
+                FROM   AuditLog
+                WHERE  ForwardState IN ($pending, $forwarded)
+                  AND  OccurredAtUtc >= $since
+                ORDER  BY OccurredAtUtc ASC, EventId ASC
+                LIMIT  $limit;
+                """;
+            query.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
+            query.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
+            query.Parameters.AddWithValue("$since", sinceText);
+            query.Parameters.AddWithValue("$limit", batchSize);
+
+            // Pre-size for the expected batch, capped so a huge batchSize does
+            // not allocate a huge empty list up front.
+            var matches = new List<AuditEvent>(Math.Min(batchSize, 256));
+            using (var cursor = query.ExecuteReader())
+            {
+                while (cursor.Read())
+                {
+                    matches.Add(MapRow(cursor));
+                }
+            }
+
+            IReadOnlyList<AuditEvent> result = matches;
+            return Task.FromResult(result);
+        }
+    }
+
+    /// <inheritdoc />
+    public Task MarkReconciledAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
+    {
+        ArgumentNullException.ThrowIfNull(eventIds);
+        if (eventIds.Count == 0)
+        {
+            return Task.CompletedTask;
+        }
+
+        lock (_writeLock)
+        {
+            ObjectDisposedException.ThrowIf(_disposed, this);
+
+            using var update = _connection.CreateCommand();
+
+            // One bound parameter per id ($id0, $id1, ...): the ids never enter
+            // the SQL text itself, only their placeholder names do.
+            var placeholders = new string[eventIds.Count];
+            for (var index = 0; index < placeholders.Length; index++)
+            {
+                placeholders[index] = $"$id{index}";
+                update.Parameters.AddWithValue(placeholders[index], eventIds[index].ToString());
+            }
+
+            // Only rows still Pending or Forwarded are flipped; rows in any
+            // other state are left as they are.
+            update.CommandText =
+                "UPDATE AuditLog SET ForwardState = $reconciled " +
+                "WHERE ForwardState IN ($pending, $forwarded) AND EventId IN (" +
+                string.Join(',', placeholders) +
+                ");";
+            update.Parameters.AddWithValue("$reconciled", AuditForwardState.Reconciled.ToString());
+            update.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
+            update.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
+
+            update.ExecuteNonQuery();
+        }
+
+        return Task.CompletedTask;
+    }
+
+    /// <inheritdoc />
+    public Task<SiteAuditBacklogSnapshot> GetBacklogStatsAsync(CancellationToken ct = default)
+    {
+        int backlogSize;
+        DateTime? oldestPendingUtc;
+
+        // AuditLog-005: the probe runs on the dedicated _readConnection under
+        // _readLock, so it does not contend with the batched writer on
+        // _writeLock even when the Pending backlog (and therefore the COUNT(*)
+        // scan) has grown large during a central outage.
+        lock (_readLock)
+        {
+            ObjectDisposedException.ThrowIf(_disposed, this);
+
+            // One statement yields both aggregates, so the Pending range is
+            // only visited once.
+            using var probe = _readConnection.CreateCommand();
+            probe.CommandText = """
+                SELECT COUNT(*), MIN(OccurredAtUtc)
+                FROM   AuditLog
+                WHERE  ForwardState = $pending;
+                """;
+            probe.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
+
+            using var row = probe.ExecuteReader();
+            row.Read();
+            backlogSize = row.GetInt32(0);
+            if (row.IsDBNull(1))
+            {
+                // MIN() over zero rows is NULL — there is no oldest entry.
+                oldestPendingUtc = null;
+            }
+            else
+            {
+                oldestPendingUtc = DateTime.Parse(
+                    row.GetString(1),
+                    System.Globalization.CultureInfo.InvariantCulture,
+                    System.Globalization.DateTimeStyles.RoundtripKind);
+            }
+        }
+
+        var snapshot = new SiteAuditBacklogSnapshot(
+            PendingCount: backlogSize,
+            OldestPendingUtc: oldestPendingUtc,
+            OnDiskBytes: MeasureOnDiskBytes());
+        return Task.FromResult(snapshot);
+
+        // Best-effort size of the database file, taken outside the read lock.
+        // Reports 0 for in-memory databases, for a missing file, and when the
+        // file-system probe itself fails.
+        long MeasureOnDiskBytes()
+        {
+            try
+            {
+                var path = _options.DatabasePath;
+                var notAFile =
+                    string.IsNullOrEmpty(path) ||
+                    path.StartsWith(":memory:", StringComparison.Ordinal) ||
+                    path.Contains("mode=memory", StringComparison.OrdinalIgnoreCase);
+
+                if (!notAFile && File.Exists(path))
+                {
+                    return new FileInfo(path).Length;
+                }
+            }
+            catch (Exception ex)
+            {
+                // A failed stat() must never abort the snapshot — log it and
+                // fall through to the 0 default.
+                _logger.LogDebug(ex,
+                    "SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.",
+                    _options.DatabasePath);
+            }
+
+            return 0;
+        }
+    }
+
+    /// <summary>
+    /// Normalises <paramref name="value"/> to a <see cref="DateTimeKind.Utc"/> timestamp.
+    /// </summary>
+    /// <param name="value">The timestamp to normalise; any <see cref="DateTimeKind"/> is accepted.</param>
+    /// <returns>
+    /// <paramref name="value"/> unchanged when it is already UTC; the converted
+    /// instant when it is <see cref="DateTimeKind.Local"/>; the same clock
+    /// reading stamped as UTC when it is <see cref="DateTimeKind.Unspecified"/>.
+    /// </returns>
+    /// <remarks>
+    /// <see cref="DateTime.ToUniversalTime"/> treats an
+    /// <see cref="DateTimeKind.Unspecified"/> value as local time, which would
+    /// shift a kind-less UTC timestamp by the machine's UTC offset. Values that
+    /// reach this helper are UTC by contract (e.g. the <c>sinceUtc</c>
+    /// argument), so an unspecified kind is stamped rather than converted.
+    /// </remarks>
+    private static DateTime EnsureUtc(DateTime value) => value.Kind switch
+    {
+        DateTimeKind.Utc => value,
+        // A genuine local time is converted to the equivalent UTC instant.
+        DateTimeKind.Local => value.ToUniversalTime(),
+        // Unspecified: keep the clock reading and only attach the UTC kind.
+        _ => DateTime.SpecifyKind(value, DateTimeKind.Utc),
+    };
+
+    /// <summary>
+    /// Materialises the reader's current row as an <see cref="AuditEvent"/>.
+    /// </summary>
+    /// <param name="reader">Reader positioned on a row; columns are read by ordinal 0–22.</param>
+    /// <returns>The mapped event; nullable columns that hold NULL map to <c>null</c>.</returns>
+    private static AuditEvent MapRow(SqliteDataReader reader)
+    {
+        // Small accessors for the nullable columns: NULL in the row becomes
+        // null on the entity, anything else is read with the typed getter.
+        string? TextOrNull(int ordinal) =>
+            reader.IsDBNull(ordinal) ? null : reader.GetString(ordinal);
+        int? IntOrNull(int ordinal) =>
+            reader.IsDBNull(ordinal) ? null : reader.GetInt32(ordinal);
+        Guid? GuidOrNull(int ordinal) =>
+            reader.IsDBNull(ordinal) ? null : Guid.Parse(reader.GetString(ordinal));
+
+        return new AuditEvent
+        {
+            EventId = Guid.Parse(reader.GetString(0)),
+            // RoundtripKind keeps the DateTimeKind encoded in the stored text.
+            OccurredAtUtc = DateTime.Parse(
+                reader.GetString(1),
+                System.Globalization.CultureInfo.InvariantCulture,
+                System.Globalization.DateTimeStyles.RoundtripKind),
+            Channel = Enum.Parse<AuditChannel>(reader.GetString(2)),
+            Kind = Enum.Parse<AuditKind>(reader.GetString(3)),
+            CorrelationId = GuidOrNull(4),
+            SourceSiteId = TextOrNull(5),
+            SourceNode = TextOrNull(6),
+            SourceInstanceId = TextOrNull(7),
+            SourceScript = TextOrNull(8),
+            Actor = TextOrNull(9),
+            Target = TextOrNull(10),
+            Status = Enum.Parse<AuditStatus>(reader.GetString(11)),
+            HttpStatus = IntOrNull(12),
+            DurationMs = IntOrNull(13),
+            ErrorMessage = TextOrNull(14),
+            ErrorDetail = TextOrNull(15),
+            RequestSummary = TextOrNull(16),
+            ResponseSummary = TextOrNull(17),
+            // Stored as an integer flag: any non-zero value means truncated.
+            PayloadTruncated = reader.GetInt32(18) != 0,
+            Extra = TextOrNull(19),
+            ForwardState = Enum.Parse<AuditForwardState>(reader.GetString(20)),
+            ExecutionId = GuidOrNull(21),
+            ParentExecutionId = GuidOrNull(22),
+        };
+    }
+
+    /// <summary>
+    /// Synchronously disposes the audit writer by running
+    /// <see cref="DisposeAsync"/> to completion.
+    /// </summary>
+    /// <remarks>
+    /// AuditLog-006: <see cref="DisposeAsync"/> is the preferred entry point;
+    /// this overload exists for callers that only know
+    /// <see cref="IDisposable"/> (for example a <c>using</c> statement without
+    /// <c>await</c>). The asynchronous disposal is started through
+    /// <see cref="Task.Run(Func{Task})"/> so that it runs on the thread pool
+    /// instead of on the calling thread, and only then does this method block
+    /// on the result. That keeps the blocking wait from sitting on a thread
+    /// whose captured <see cref="SynchronizationContext"/> the async
+    /// continuation would otherwise need — the usual sync-over-async deadlock.
+    /// </remarks>
+    public void Dispose()
+    {
+        var disposal = Task.Run(() => DisposeAsync().AsTask());
+        disposal.GetAwaiter().GetResult();
+    }
+
+    /// <summary>Asynchronously disposes the audit writer and releases resources.</summary>
+    /// <remarks>
+    /// Shutdown happens in three steps: (1) under <c>_writeLock</c> the write
+    /// queue's writer is completed and the writer-loop task is captured;
+    /// (2) outside the lock the loop is awaited for at most five seconds, with a
+    /// timeout or fault logged rather than rethrown; (3) under <c>_writeLock</c>
+    /// the disposed flag is set and the write connection disposed, after which
+    /// the read connection is disposed under <c>_readLock</c>. A call made after
+    /// the disposed flag has been set returns without doing anything.
+    /// </remarks>
+    /// <returns>A task-like value that completes when disposal has finished.</returns>
+    public async ValueTask DisposeAsync()
+    {
+        Task? writerLoop;
+        lock (_writeLock)
+        {
+            if (_disposed) return;
+            // Stop accepting new events. Completing the channel writer is the
+            // shutdown signal: WriteAsync calls observe the completion and
+            // fault, and the writer loop drains any already-buffered items
+            // before exiting. _disposed is intentionally NOT set here — it
+            // flips only after the loop has fully drained (second lock block
+            // below), so FlushBatch's existing _disposed check guards the
+            // post-drain window when the connection is about to close.
+            _writeQueue.Writer.TryComplete();
+            writerLoop = _writerLoop;
+        }
+
+        // Wait outside the lock — the loop reacquires it for each batch.
+        try
+        {
+            if (writerLoop is not null)
+            {
+                // Bounded wait: a stuck loop must not hang disposal forever.
+                await writerLoop.WaitAsync(TimeSpan.FromSeconds(5)).ConfigureAwait(false);
+            }
+        }
+        catch (TimeoutException)
+        {
+            // Disposal continues regardless; the connection is closed below.
+            _logger.LogWarning("SqliteAuditWriter writer loop did not drain within 5s of dispose.");
+        }
+        catch (Exception ex)
+        {
+            // The loop's per-batch try/catch already routed individual failures
+            // to pending TCSes; a top-level fault here is unexpected.
+            _logger.LogError(ex, "SqliteAuditWriter writer loop faulted during dispose.");
+        }
+
+        lock (_writeLock)
+        {
+            // Re-check: a concurrent DisposeAsync call may have finished first.
+            if (_disposed) return;
+            _disposed = true;
+            _connection.Dispose();
+        }
+
+        // AuditLog-005: dispose the dedicated read connection after the writer
+        // is fully drained and closed. _readLock is taken to fence out any
+        // in-flight read caller that grabbed the lock before _disposed flipped
+        // — they observe ObjectDisposedException on the next attempt.
+        lock (_readLock)
+        {
+            _readConnection.Dispose();
+        }
+    }
+
+    /// <summary>
+    /// Queue entry pairing an audit event that awaits persistence by the
+    /// background writer with the signal used to report that its write finished.
+    /// </summary>
+    private sealed class PendingAuditEvent
+    {
+        /// <summary>Creates the queue entry for <paramref name="evt"/>.</summary>
+        /// <param name="evt">The audit event to persist.</param>
+        public PendingAuditEvent(AuditEvent evt) => Event = evt;
+
+        /// <summary>The audit event to persist.</summary>
+        public AuditEvent Event { get; }
+
+        /// <summary>
+        /// Completion source for signalling the outcome of the write. It is
+        /// created with <see cref="TaskCreationOptions.RunContinuationsAsynchronously"/>.
+        /// </summary>
+        public TaskCompletionSource Completion { get; } =
+            new(TaskCreationOptions.RunContinuationsAsynchronously);
+    }
+}
@@ -0,0 +1,27 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
+
+/// <summary>
+/// Options for the site-side SQLite hot-path audit writer.
+/// Mirrors the ZB.MOM.WW.ScadaBridge.SiteEventLogging pattern: a single SQLite connection
+/// fed by a background writer task draining a bounded
+/// <see cref="System.Threading.Channels.Channel{T}"/> so script-thread enqueues
+/// never block on disk I/O.
+/// </summary>
+/// <remarks>
+/// Every property is a plain settable value with a default, so an instance
+/// created without any configuration is already usable.
+/// </remarks>
+public sealed class SqliteAuditWriterOptions
+{
+    /// <summary>
+    /// SQLite database path (or in-memory URI for tests).
+    /// Defaults to <c>auditlog.db</c>, a relative path.
+    /// </summary>
+    public string DatabasePath { get; set; } = "auditlog.db";
+
+    /// <summary>
+    /// Capacity of the bounded write queue. Set high enough that ordinary
+    /// script bursts never fill it; <see cref="System.Threading.Channels.BoundedChannelFullMode.Wait"/>
+    /// applies when the writer falls behind. Defaults to 4096.
+    /// </summary>
+    public int ChannelCapacity { get; set; } = 4096;
+
+    /// <summary>
+    /// Max number of pending events the writer drains in one transaction.
+    /// Defaults to 256.
+    /// </summary>
+    public int BatchSize { get; set; } = 256;
+
+    /// <summary>
+    /// Soft flush interval the writer enforces when fewer than BatchSize events are queued.
+    /// Defaults to 50; presumably milliseconds, going by the <c>Ms</c> suffix.
+    /// </summary>
+    public int FlushIntervalMs { get; set; } = 50;
+}
@@ -0,0 +1,236 @@
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
+
+/// <summary>
+/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): translates per-attempt
+/// notifications from the store-and-forward retry loop into one (or two)
+/// <see cref="CachedCallTelemetry"/> packets and pushes them through
+/// <see cref="ICachedCallTelemetryForwarder"/>.
+/// </summary>
+/// <remarks>
+/// <para>
+/// The S&amp;F loop's <see cref="ICachedCallLifecycleObserver"/> reports a
+/// single coarse outcome per attempt; the audit pipeline however models the
+/// lifecycle as TWO rows on terminal outcomes — a per-attempt
+/// (<see cref="AuditKind.ApiCallCached"/> / <see cref="AuditKind.DbWriteCached"/>)
+/// row capturing the per-attempt mechanics, plus a <see cref="AuditKind.CachedResolve"/>
+/// row marking the terminal state for downstream consumers. The per-attempt
+/// row always carries <see cref="AuditStatus.Attempted"/>; only the
+/// <c>CachedResolve</c> row carries the terminal status. The bridge fans out
+/// per outcome:
+/// </para>
+/// <list type="bullet">
+///   <item><description><c>TransientFailure</c> -> the per-attempt row only.</description></item>
+///   <item><description><c>Delivered</c> -> per-attempt row + CachedResolve(Delivered).</description></item>
+///   <item><description><c>PermanentFailure</c> -> per-attempt row + CachedResolve(Parked).</description></item>
+///   <item><description><c>ParkedMaxRetries</c> -> per-attempt row + CachedResolve(Parked).</description></item>
+/// </list>
+/// <para>
+/// <b>Best-effort emission (alog.md §7):</b> apart from rejecting a null
+/// context, the bridge never propagates an exception. Each of the two
+/// emissions is guarded independently, so a failure while emitting the
+/// per-attempt row cannot suppress the terminal <c>CachedResolve</c> row. The
+/// underlying forwarder swallows + logs its own failures.
+/// </para>
+/// </remarks>
+public sealed class CachedCallLifecycleBridge : ICachedCallLifecycleObserver
+{
+    private readonly ICachedCallTelemetryForwarder _forwarder;
+    private readonly ILogger<CachedCallLifecycleBridge> _logger;
+
+    /// <summary>
+    /// SourceNode-stamping (Task 14): the local node identity provider used to
+    /// stamp <c>SiteCallOperational.SourceNode</c> on every cached-call
+    /// lifecycle row this bridge emits. Optional — when null (legacy hosts /
+    /// tests that don't register the provider) SourceNode stays null and
+    /// central persists the <c>SiteCalls</c> row with SourceNode NULL.
+    /// </summary>
+    private readonly INodeIdentityProvider? _nodeIdentity;
+
+    /// <summary>Initializes a new <see cref="CachedCallLifecycleBridge"/> with the given telemetry forwarder, logger, and optional node identity provider.</summary>
+    /// <param name="forwarder">The telemetry forwarder used to ship cached-call lifecycle events to central.</param>
+    /// <param name="logger">Logger for bridge diagnostics.</param>
+    /// <param name="nodeIdentity">Optional node identity provider used to stamp <c>SourceNode</c> on emitted telemetry rows.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="forwarder"/> or <paramref name="logger"/> is null.</exception>
+    public CachedCallLifecycleBridge(
+        ICachedCallTelemetryForwarder forwarder,
+        ILogger<CachedCallLifecycleBridge> logger,
+        INodeIdentityProvider? nodeIdentity = null)
+    {
+        _forwarder = forwarder ?? throw new ArgumentNullException(nameof(forwarder));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+        _nodeIdentity = nodeIdentity;
+    }
+
+    /// <inheritdoc/>
+    public async Task OnAttemptCompletedAsync(
+        CachedCallAttemptContext context, CancellationToken ct = default)
+    {
+        ArgumentNullException.ThrowIfNull(context);
+
+        // The two emissions are guarded separately: the terminal row is what
+        // marks the operation resolved, so it must still be attempted when
+        // the per-attempt row failed to emit.
+        await TryEmitAsync(EmitAttemptedAsync, context, ct).ConfigureAwait(false);
+
+        if (IsTerminal(context.Outcome))
+        {
+            await TryEmitAsync(EmitResolveAsync, context, ct).ConfigureAwait(false);
+        }
+    }
+
+    /// <summary>
+    /// Runs one emission step and logs — rather than propagates — anything it throws.
+    /// </summary>
+    private async Task TryEmitAsync(
+        Func<CachedCallAttemptContext, CancellationToken, Task> emit,
+        CachedCallAttemptContext context,
+        CancellationToken ct)
+    {
+        try
+        {
+            await emit(context, ct).ConfigureAwait(false);
+        }
+        catch (Exception ex)
+        {
+            // Defensive — both EmitX paths call the forwarder which is itself
+            // best-effort. A throw here is unexpected, but the alog.md §7
+            // contract requires we never propagate.
+            _logger.LogWarning(ex,
+                "CachedCallLifecycleBridge: unexpected throw for {TrackedOperationId} (Outcome {Outcome})",
+                context.TrackedOperationId, context.Outcome);
+        }
+    }
+
+    /// <summary>Builds and forwards the per-attempt row for <paramref name="context"/>.</summary>
+    private async Task EmitAttemptedAsync(CachedCallAttemptContext context, CancellationToken ct)
+    {
+        // Per-attempt row: kind discriminates channel; status is always
+        // Attempted regardless of outcome (success vs. failure is captured
+        // by the companion HttpStatus / ErrorMessage fields, NOT by flipping
+        // the status — CachedResolve carries the terminal Status). Per the
+        // M3 brief and alog.md §4.
+        var kind = ChannelToAttemptKind(context.Channel);
+        var status = AuditStatus.Attempted;
+
+        var packet = BuildPacket(
+            context,
+            kind: kind,
+            status: status,
+            // Operational status mirror — for the per-attempt row the
+            // operational state is the running status; the bridge always
+            // writes "Attempted" so reconciliation can't roll back.
+            operationalStatus: "Attempted",
+            terminalAtUtc: null,
+            lastError: context.LastError,
+            httpStatus: context.HttpStatus);
+
+        await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
+    }
+
+    /// <summary>Builds and forwards the terminal <see cref="AuditKind.CachedResolve"/> row.</summary>
+    private async Task EmitResolveAsync(CachedCallAttemptContext context, CancellationToken ct)
+    {
+        var (auditStatus, operationalStatus) = TerminalOutcomeToStatuses(context.Outcome);
+
+        var packet = BuildPacket(
+            context,
+            kind: AuditKind.CachedResolve,
+            status: auditStatus,
+            operationalStatus: operationalStatus,
+            // The attempt's own timestamp doubles as the terminal timestamp.
+            terminalAtUtc: context.OccurredAtUtc,
+            lastError: context.LastError,
+            httpStatus: context.HttpStatus);
+
+        await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// Assembles the combined audit + operational packet for one lifecycle row.
+    /// Every call mints a fresh <c>EventId</c>, and all timestamps are stamped
+    /// with <see cref="DateTimeKind.Utc"/> without shifting their clock reading.
+    /// </summary>
+    private CachedCallTelemetry BuildPacket(
+        CachedCallAttemptContext context,
+        AuditKind kind,
+        AuditStatus status,
+        string operationalStatus,
+        DateTime? terminalAtUtc,
+        string? lastError,
+        int? httpStatus)
+    {
+        var channel = ChannelStringToEnum(context.Channel);
+
+        return new CachedCallTelemetry(
+            Audit: new AuditEvent
+            {
+                EventId = Guid.NewGuid(),
+                OccurredAtUtc = DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
+                Channel = channel,
+                Kind = kind,
+                CorrelationId = context.TrackedOperationId.Value,
+                // Audit Log #23 (ExecutionId Task 4): the originating script
+                // execution's per-run correlation id, threaded through the S&F
+                // buffer; null on rows buffered before Task 4 (back-compat).
+                ExecutionId = context.ExecutionId,
+                // Audit Log #23 (ParentExecutionId Task 6): the spawning
+                // inbound-API request's ExecutionId, threaded through the S&F
+                // buffer alongside ExecutionId so the retry-loop cached rows
+                // correlate back to the cross-execution chain. Null for a
+                // non-routed run and on rows buffered before Task 6.
+                ParentExecutionId = context.ParentExecutionId,
+                SourceSiteId = string.IsNullOrEmpty(context.SourceSite) ? null : context.SourceSite,
+                SourceInstanceId = context.SourceInstanceId,
+                // Audit Log #23 (ExecutionId Task 4): SourceScript is now
+                // threaded through the S&F buffer alongside ExecutionId — the
+                // retry-loop cached rows carry the same provenance the
+                // script-side cached rows do. Null on pre-Task-4 buffered rows.
+                SourceScript = context.SourceScript,
+                Target = context.Target,
+                Status = status,
+                HttpStatus = httpStatus,
+                DurationMs = context.DurationMs,
+                ErrorMessage = lastError,
+                ForwardState = AuditForwardState.Pending,
+            },
+            Operational: new SiteCallOperational(
+                TrackedOperationId: context.TrackedOperationId,
+                Channel: context.Channel,
+                Target: context.Target,
+                SourceSite: context.SourceSite,
+                // SourceNode-stamping (Task 14): the local cluster node name
+                // (node-a/node-b on a site). Stamped from the injected
+                // INodeIdentityProvider; null when no provider was wired so
+                // central persists SiteCalls.SourceNode as NULL.
+                SourceNode: _nodeIdentity?.NodeName,
+                Status: operationalStatus,
+                RetryCount: context.RetryCount,
+                LastError: lastError,
+                HttpStatus: httpStatus,
+                CreatedAtUtc: DateTime.SpecifyKind(context.CreatedAtUtc, DateTimeKind.Utc),
+                UpdatedAtUtc: DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
+                TerminalAtUtc: terminalAtUtc is null
+                    ? null
+                    : DateTime.SpecifyKind(terminalAtUtc.Value, DateTimeKind.Utc)));
+    }
+
+    /// <summary>Maps the channel name to the per-attempt audit kind; unknown names fall back to the API kind.</summary>
+    private static AuditKind ChannelToAttemptKind(string channel) => channel switch
+    {
+        "ApiOutbound" => AuditKind.ApiCallCached,
+        "DbOutbound" => AuditKind.DbWriteCached,
+        // Defensive default — the S&F observer is filtered to cached-call
+        // categories so this branch shouldn't fire in practice.
+        _ => AuditKind.ApiCallCached,
+    };
+
+    /// <summary>Maps the channel name to its <see cref="AuditChannel"/>; unknown names fall back to <c>ApiOutbound</c>.</summary>
+    private static AuditChannel ChannelStringToEnum(string channel) => channel switch
+    {
+        "ApiOutbound" => AuditChannel.ApiOutbound,
+        "DbOutbound" => AuditChannel.DbOutbound,
+        _ => AuditChannel.ApiOutbound,
+    };
+
+    /// <summary>Maps an attempt outcome to the audit status and operational status string of its resolve row.</summary>
+    private static (AuditStatus auditStatus, string operationalStatus) TerminalOutcomeToStatuses(
+        CachedCallAttemptOutcome outcome) => outcome switch
+    {
+        CachedCallAttemptOutcome.Delivered =>
+            (AuditStatus.Delivered, "Delivered"),
+        CachedCallAttemptOutcome.PermanentFailure =>
+            (AuditStatus.Parked, "Parked"),
+        CachedCallAttemptOutcome.ParkedMaxRetries =>
+            (AuditStatus.Parked, "Parked"),
+        // TransientFailure isn't terminal — see IsTerminal — but the switch
+        // is exhaustive so we route it through Failed for safety.
+        CachedCallAttemptOutcome.TransientFailure =>
+            (AuditStatus.Failed, "Failed"),
+        _ => (AuditStatus.Failed, "Failed"),
+    };
+
+    /// <summary>Tells whether an outcome ends the lifecycle; unknown outcomes are treated as non-terminal.</summary>
+    private static bool IsTerminal(CachedCallAttemptOutcome outcome) => outcome switch
+    {
+        CachedCallAttemptOutcome.Delivered => true,
+        CachedCallAttemptOutcome.PermanentFailure => true,
+        CachedCallAttemptOutcome.ParkedMaxRetries => true,
+        CachedCallAttemptOutcome.TransientFailure => false,
+        _ => false,
+    };
+}
@@ -0,0 +1,194 @@
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
+
+/// <summary>
+/// Site-side dual emitter for cached-call lifecycle telemetry (Audit Log #23 /
+/// M3), the sister of <see cref="SiteAuditTelemetryActor"/>. Given one combined
+/// <see cref="CachedCallTelemetry"/> packet it performs two site-local writes:
+/// <list type="bullet">
+///   <item><description>the <see cref="AuditEvent"/> half goes to
+///   <see cref="IAuditWriter"/> (the site <c>FallbackAuditWriter</c> +
+///   <c>SqliteAuditWriter</c> chain established in M2);</description></item>
+///   <item><description>the <see cref="SiteCallOperational"/> half updates the
+///   site-local <c>OperationTracking</c> store through
+///   <see cref="IOperationTrackingStore"/>, where the audit row's
+///   <see cref="AuditKind"/> picks the lifecycle method
+///   (<c>Enqueue</c> / <c>Attempt</c> / <c>Terminal</c>).</description></item>
+/// </list>
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Best-effort contract (alog.md §7):</b> neither a throwing writer nor a
+/// throwing tracking store may reach the calling script. Each half has its own
+/// try/catch, so an outage on one side cannot starve the other; the failure is
+/// logged and the call returns normally.
+/// </para>
+/// <para>
+/// <b>Local-write only — the wire push is the drain actor's job.</b> This
+/// forwarder writes to the two site-local SQLite stores and never pushes to
+/// central itself. The site→central transport is live:
+/// <c>ClusterClientSiteAuditClient</c> is the production binding of
+/// <see cref="ISiteStreamAuditClient"/> on site roles, while
+/// <c>NoOpSiteStreamAuditClient</c> remains only for central/test composition
+/// roots. The push is out-of-band: <see cref="SiteAuditTelemetryActor"/> sweeps
+/// the <c>AuditEvent</c> rows written here — stored in SQLite tagged
+/// <see cref="AuditForwardState.Pending"/> — and drains them to central via
+/// that client, so one drain loop serves both audit-only and cached-call
+/// emissions.
+/// </para>
+/// </remarks>
+public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
+{
+    private readonly IAuditWriter _auditWriter;
+    private readonly IOperationTrackingStore? _trackingStore;
+    private readonly ILogger<CachedCallTelemetryForwarder> _logger;
+
+    /// <summary>
+    /// SourceNode-stamping (Task 14): provider of the local node name written
+    /// to the tracking row's <c>SourceNode</c> column by
+    /// <c>RecordEnqueueAsync</c>. When null (legacy / test hosts) that column
+    /// stays NULL.
+    /// </summary>
+    private readonly INodeIdentityProvider? _nodeIdentity;
+
+    /// <summary>
+    /// Creates the forwarder. A null <paramref name="trackingStore"/> is
+    /// allowed: only the audit half of each packet is then emitted. That
+    /// matches the M3 Bundle F composition-root contract on Central nodes,
+    /// where the forwarder is registered unconditionally (mirroring the
+    /// IAuditWriter chain) but the site-only tracking store has no
+    /// registration. Production site nodes wire both.
+    /// </summary>
+    /// <param name="auditWriter">Writer that persists the audit half of each packet.</param>
+    /// <param name="trackingStore">Store updated with the operational half; null on central nodes.</param>
+    /// <param name="logger">Logger for this forwarder.</param>
+    /// <param name="nodeIdentity">Optional provider of the current node name stamped on emitted rows.</param>
+    public CachedCallTelemetryForwarder(
+        IAuditWriter auditWriter,
+        IOperationTrackingStore? trackingStore,
+        ILogger<CachedCallTelemetryForwarder> logger,
+        INodeIdentityProvider? nodeIdentity = null)
+    {
+        ArgumentNullException.ThrowIfNull(auditWriter);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _auditWriter = auditWriter;
+        _trackingStore = trackingStore;
+        _logger = logger;
+        _nodeIdentity = nodeIdentity;
+    }
+
+    /// <inheritdoc />
+    public async Task ForwardAsync(CachedCallTelemetry telemetry, CancellationToken ct = default)
+    {
+        ArgumentNullException.ThrowIfNull(telemetry);
+
+        // Audit first, tracking second. Each step traps its own exceptions, so
+        // the second always runs no matter how the first ended.
+        await TryEmitAuditAsync(telemetry, ct).ConfigureAwait(false);
+        await TryEmitTrackingAsync(telemetry, ct).ConfigureAwait(false);
+    }
+
+    /// <summary>Writes the audit half of the packet, logging instead of propagating any failure.</summary>
+    private async Task TryEmitAuditAsync(CachedCallTelemetry telemetry, CancellationToken ct)
+    {
+        try
+        {
+            var auditRow = telemetry.Audit;
+            await _auditWriter.WriteAsync(auditRow, ct).ConfigureAwait(false);
+        }
+        catch (Exception ex)
+        {
+            // alog.md §7 best-effort contract — log and swallow. Transient
+            // writer failures are expected to be absorbed by the writer chain
+            // itself (RingBufferFallback in the FallbackAuditWriter); reaching
+            // this handler means that swallow contract did not hold.
+            _logger.LogWarning(ex,
+                "CachedCallTelemetryForwarder: audit emission threw for EventId {EventId} (Kind {Kind}, Status {Status})",
+                telemetry.Audit.EventId, telemetry.Audit.Kind, telemetry.Audit.Status);
+        }
+    }
+
+    /// <summary>
+    /// Applies the operational half of the packet to the tracking store, if one
+    /// is wired, logging instead of propagating any failure.
+    /// </summary>
+    private async Task TryEmitTrackingAsync(CachedCallTelemetry telemetry, CancellationToken ct)
+    {
+        // No tracking store wired — Central composition root or an
+        // integration-test host that skipped AddSiteRuntime. The audit half
+        // has already been emitted; this half is simply a no-op.
+        if (_trackingStore is not { } store)
+        {
+            return;
+        }
+
+        try
+        {
+            var audit = telemetry.Audit;
+            var operational = telemetry.Operational;
+            var kind = audit.Kind;
+
+            if (kind is AuditKind.CachedSubmit)
+            {
+                // Enqueue — insert-if-not-exists, with the operational channel
+                // as the kind discriminator; the store's INSERT contract fixes
+                // RetryCount at 0. SourceNode-stamping (Task 14): the local
+                // node name comes from the injected INodeIdentityProvider and
+                // is null when none was wired.
+                await store.RecordEnqueueAsync(
+                    operational.TrackedOperationId,
+                    operational.Channel,
+                    operational.Target,
+                    audit.SourceInstanceId,
+                    audit.SourceScript,
+                    sourceNode: _nodeIdentity?.NodeName,
+                    ct).ConfigureAwait(false);
+            }
+            else if (kind is AuditKind.ApiCallCached or AuditKind.DbWriteCached)
+            {
+                // Attempt — advance retry counter + last-error/HTTP-status.
+                // Terminal rows are guarded by the store's WHERE clause.
+                await store.RecordAttemptAsync(
+                    operational.TrackedOperationId,
+                    operational.Status,
+                    operational.RetryCount,
+                    operational.LastError,
+                    operational.HttpStatus,
+                    ct).ConfigureAwait(false);
+            }
+            else if (kind is AuditKind.CachedResolve)
+            {
+                // Terminal — first-write-wins on the resolve flip.
+                await store.RecordTerminalAsync(
+                    operational.TrackedOperationId,
+                    operational.Status,
+                    operational.LastError,
+                    operational.HttpStatus,
+                    ct).ConfigureAwait(false);
+            }
+            else
+            {
+                // Only the four cached-lifecycle kinds belong on this path. A
+                // mis-routed packet is made visible in the log but never
+                // crashes the forwarder.
+                _logger.LogWarning(
+                    "CachedCallTelemetryForwarder: unexpected audit kind {Kind} on tracking emission for EventId {EventId}",
+                    telemetry.Audit.Kind, telemetry.Audit.EventId);
+            }
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex,
+                "CachedCallTelemetryForwarder: tracking-store emission threw for TrackedOperationId {Id} (Status {Status})",
+                telemetry.Operational.TrackedOperationId, telemetry.Operational.Status);
+        }
+    }
+}
@@ -0,0 +1,117 @@
+using Akka.Actor;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
+
+/// <summary>
+/// Production <see cref="ISiteStreamAuditClient"/> binding for site composition
+/// roots: pushes audit telemetry to central over Akka <c>ClusterClient</c> via
+/// the site's <c>SiteCommunicationActor</c>. The actor forwards the command to
+/// <c>/user/central-communication</c> and the central
+/// <c>CentralCommunicationActor</c> Asks the <c>AuditLogIngestActor</c> proxy —
+/// the same command/control transport notifications already use. Wired by the
+/// Host for site roles; central and test composition roots keep the
+/// <see cref="NoOpSiteStreamAuditClient"/> DI default (they have no
+/// <c>SiteCommunicationActor</c>).
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Throw-on-failure contract.</b> An Ask timeout or a faulted reply
+/// (<see cref="Status.Failure"/>) propagates as a thrown exception out of the
+/// <c>Ingest*Async</c> methods — it is NOT caught and turned into an empty ack.
+/// The <see cref="SiteAuditTelemetryActor"/> drain loop treats a thrown
+/// exception as transient and leaves the rows <c>Pending</c> for the next tick.
+/// Swallowing the fault into an empty ack would be indistinguishable from "zero
+/// rows accepted" and would silently lose the retry signal. Task 1 confirmed
+/// the central receiving end does not collapse an ingest fault into an empty
+/// ack either, so a site-side Ask through the whole path faults cleanly on a
+/// central-side timeout.
+/// </para>
+/// <para>
+/// The batches arrive as proto DTOs (<see cref="AuditEventBatch"/> /
+/// <see cref="CachedTelemetryBatch"/>) because the
+/// <see cref="SiteAuditTelemetryActor"/> builds them with
+/// <see cref="AuditEventDtoMapper.ToDto"/>. This client converts them back into
+/// the <see cref="AuditEvent"/> / <see cref="SiteCall"/> entities the Akka
+/// command messages carry — the same DTO→entity translation the
+/// <c>SiteStreamGrpcServer</c> performs for the gRPC reconciliation path.
+/// </para>
+/// </remarks>
+public sealed class ClusterClientSiteAuditClient : ISiteStreamAuditClient
+{
+    private readonly IActorRef _siteCommunicationActor;
+    private readonly TimeSpan _askTimeout;
+
+    /// <summary>
+    /// Creates a client that pushes ingest commands through the given actor.
+    /// </summary>
+    /// <param name="siteCommunicationActor">
+    /// The site's <c>SiteCommunicationActor</c>; it relays the ingest command
+    /// over the registered central ClusterClient and routes the reply back to
+    /// the Ask issued by this client.
+    /// </param>
+    /// <param name="askTimeout">
+    /// Round-trip Ask timeout. When it elapses the Ask throws
+    /// <see cref="Akka.Actor.AskTimeoutException"/>, which the drain loop
+    /// treats as transient (rows stay <c>Pending</c>).
+    /// </param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="siteCommunicationActor"/> is <see langword="null"/>.
+    /// </exception>
+    public ClusterClientSiteAuditClient(IActorRef siteCommunicationActor, TimeSpan askTimeout)
+    {
+        _siteCommunicationActor = siteCommunicationActor
+            ?? throw new ArgumentNullException(nameof(siteCommunicationActor));
+        _askTimeout = askTimeout;
+    }
+
+    /// <inheritdoc/>
+    public async Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
+    {
+        ArgumentNullException.ThrowIfNull(batch);
+
+        var command = new IngestAuditEventsCommand(MapAuditEvents(batch));
+
+        // No try/catch on purpose: an Ask timeout or a Status.Failure reply
+        // must surface as a thrown exception so the drain loop leaves the rows
+        // Pending instead of mistaking the fault for "nothing accepted".
+        var reply = await _siteCommunicationActor
+            .Ask<IngestAuditEventsReply>(command, _askTimeout, ct)
+            .ConfigureAwait(false);
+
+        return ToAck(reply.AcceptedEventIds);
+    }
+
+    /// <inheritdoc/>
+    public async Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
+    {
+        ArgumentNullException.ThrowIfNull(batch);
+
+        var command = new IngestCachedTelemetryCommand(MapCachedEntries(batch));
+
+        // Identical throw-on-failure contract to IngestAuditEventsAsync; only
+        // the reply type differs (IngestCachedTelemetryReply is the central
+        // dual-write reply, not IngestAuditEventsReply).
+        var reply = await _siteCommunicationActor
+            .Ask<IngestCachedTelemetryReply>(command, _askTimeout, ct)
+            .ConfigureAwait(false);
+
+        return ToAck(reply.AcceptedEventIds);
+    }
+
+    /// <summary>Translates every audit DTO in the batch back into its entity form.</summary>
+    private static List<AuditEvent> MapAuditEvents(AuditEventBatch batch)
+    {
+        var dtos = batch.Events;
+        var mapped = new List<AuditEvent>(dtos.Count);
+        for (var i = 0; i < dtos.Count; i++)
+        {
+            mapped.Add(AuditEventDtoMapper.FromDto(dtos[i]));
+        }
+        return mapped;
+    }
+
+    /// <summary>
+    /// Translates every combined packet into an entry pairing the audit entity
+    /// with its operational site-call entity.
+    /// </summary>
+    private static List<CachedTelemetryEntry> MapCachedEntries(CachedTelemetryBatch batch)
+    {
+        var packets = batch.Packets;
+        var mapped = new List<CachedTelemetryEntry>(packets.Count);
+        for (var i = 0; i < packets.Count; i++)
+        {
+            var current = packets[i];
+            var auditEntity = AuditEventDtoMapper.FromDto(current.AuditEvent);
+            var siteCallEntity = SiteCallDtoMapper.FromDto(current.Operational);
+            mapped.Add(new CachedTelemetryEntry(auditEntity, siteCallEntity));
+        }
+        return mapped;
+    }
+
+    /// <summary>Copies the accepted ids, as strings, into a new wire-level ack.</summary>
+    private static IngestAck ToAck(IReadOnlyList<Guid> acceptedEventIds)
+    {
+        var result = new IngestAck();
+        for (var i = 0; i < acceptedEventIds.Count; i++)
+        {
+            result.AcceptedEventIds.Add(acceptedEventIds[i].ToString());
+        }
+        return result;
+    }
+}
@@ -0,0 +1,46 @@
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
+
+/// <summary>
+/// Mockable abstraction over the central site-audit push surface that
+/// <see cref="SiteAuditTelemetryActor"/> uses to forward <see cref="AuditEventBatch"/>
+/// payloads. The production implementation is
+/// <see cref="ClusterClientSiteAuditClient"/> — a ClusterClient-based client,
+/// wired in the Host for site roles, that forwards batches to central via the
+/// site's <c>SiteCommunicationActor</c>. Unit tests substitute via NSubstitute
+/// against this interface so the actor never needs a live transport.
+/// </summary>
+public interface ISiteStreamAuditClient
+{
+    /// <summary>
+    /// Forwards <paramref name="batch"/> to the central audit-ingest path. The
+    /// returned <see cref="IngestAck"/> carries the <c>accepted_event_ids</c>
+    /// the actor will flip to
+    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
+    /// in the site SQLite queue.
+    /// </summary>
+    /// <remarks>
+    /// The production <see cref="ClusterClientSiteAuditClient"/> lets transport
+    /// faults propagate as exceptions rather than returning an empty ack; the
+    /// <see cref="SiteAuditTelemetryActor"/> drain catches them and leaves the
+    /// rows <c>Pending</c> for the next drain.
+    /// </remarks>
+    /// <param name="batch">The batch of audit events to forward.</param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>
+    /// An ack listing the event ids central accepted; an empty ack means no
+    /// rows are flipped.
+    /// </returns>
+    Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct);
+
+    /// <summary>
+    /// Forwards the combined <see cref="CachedTelemetryBatch"/> (Audit Log #23)
+    /// to the central cached-telemetry ingest path. Each packet carries both the
+    /// audit row and the operational <c>SiteCalls</c> upsert; central writes both
+    /// in a single MS SQL transaction. Returns the same <see cref="IngestAck"/>
+    /// shape as <see cref="IngestAuditEventsAsync"/> so the site-side forwarder
+    /// can flip the underlying audit rows to
+    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
+    /// once central has acknowledged them.
+    /// </summary>
+    /// <remarks>
+    /// The production <see cref="ClusterClientSiteAuditClient"/> forwards over
+    /// the ClusterClient transport; the <see cref="NoOpSiteStreamAuditClient"/>
+    /// DI default (used by central and test composition roots) returns an empty
+    /// ack so no rows are flipped.
+    /// </remarks>
+    /// <param name="batch">The batch of cached-call telemetry packets to forward.</param>
+    /// <param name="ct">Cancellation token for the operation.</param>
+    /// <returns>
+    /// An ack listing the audit event ids central accepted for this batch.
+    /// </returns>
+    Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct);
+}
@@ -0,0 +1,51 @@
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
+
+/// <summary>
+/// Default <see cref="ISiteStreamAuditClient"/> registered by
+/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.ServiceCollectionExtensions.AddAuditLog"/>.
+/// It is a no-op binding for composition roots that have no
+/// <c>SiteCommunicationActor</c> — central and test roots. Site roles override
+/// it in the Host with the ClusterClient-based
+/// <see cref="ClusterClientSiteAuditClient"/>, which actually forwards audit
+/// telemetry to central.
+/// </summary>
+/// <remarks>
+/// <para>
+/// Returns an empty <see cref="IngestAck"/> so the
+/// <see cref="SiteAuditTelemetryActor"/> doesn't flip any rows to
+/// <c>Forwarded</c> when this NoOp is in effect — rows stay <c>Pending</c>
+/// until a real client (or a test stub) takes over.
+/// </para>
+/// <para>
+/// Audit-write paths are best-effort by contract: a NoOp client keeps the
+/// host running cleanly and is consistent with "audit-write failures never
+/// abort the user-facing action".
+/// </para>
+/// </remarks>
+public sealed class NoOpSiteStreamAuditClient : ISiteStreamAuditClient
+{
+    /// <inheritdoc/>
+    public Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
+    {
+        ArgumentNullException.ThrowIfNull(batch);
+        // Empty ack — no EventIds will be flipped to Forwarded, so rows stay
+        // Pending until the real ClusterClientSiteAuditClient (or a test stub)
+        // takes over.
+        return Task.FromResult(CreateEmptyAck());
+    }
+
+    /// <inheritdoc/>
+    public Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
+    {
+        ArgumentNullException.ThrowIfNull(batch);
+        // Empty ack — same rationale as IngestAuditEventsAsync. The site still
+        // writes the audit + tracking rows to its SQLite stores authoritatively;
+        // central-side state only materialises once the real
+        // ClusterClientSiteAuditClient (or a test stub) is wired in.
+        return Task.FromResult(CreateEmptyAck());
+    }
+
+    /// <summary>
+    /// Builds a new, empty <see cref="IngestAck"/> for each call.
+    /// </summary>
+    /// <remarks>
+    /// <see cref="IngestAck"/> exposes a mutable <c>AcceptedEventIds</c>
+    /// collection, so handing the same static instance to every caller would
+    /// let one caller's mutation leak into every later "empty" ack. A fresh
+    /// instance per call keeps the no-op result genuinely empty.
+    /// </remarks>
+    /// <returns>An ack with no accepted event ids.</returns>
+    private static IngestAck CreateEmptyAck() => new();
+}
@@ -0,0 +1,464 @@
+using Akka.Actor;
+using Google.Protobuf.WellKnownTypes;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
+
+/// <summary>
+/// Site-side actor that drains the local SQLite audit queue and pushes Pending
+/// rows to central via two parallel transports:
+/// <list type="bullet">
+///   <item><description><c>IngestAuditEvents</c> for the audit-only path —
+///   sync ApiCall/DbWrite, NotifySend, InboundRequest and similar single-row
+///   lifecycle events.</description></item>
+///   <item><description><c>IngestCachedTelemetry</c> for the combined-telemetry
+///   path — cached-call lifecycle rows (<c>CachedSubmit</c>,
+///   <c>ApiCallCached</c>/<c>DbWriteCached</c>, <c>CachedResolve</c>) joined
+///   with the matching <c>OperationTracking</c> row, written at central as a
+///   single dual-write transaction (AuditLog + SiteCalls).</description></item>
+/// </list>
+/// </summary>
+/// <remarks>
+/// <para>
+/// The drain self-ticks via two private messages — <c>Drain</c> for the
+/// audit-only path and <c>CachedDrain</c> for the combined path — each
+/// scheduled independently. Cadence is options-driven:
+/// <c>BusyIntervalSeconds</c> when the previous drain found rows (or faulted —
+/// we want quick recovery), <c>IdleIntervalSeconds</c> when the queue was empty.
+/// The two drains share the same cadence configuration but advance their own
+/// timers so a stall on one path does not block the other.
+/// </para>
+/// <para>
+/// Collaborators are injected as interfaces (<see cref="ISiteAuditQueue"/>,
+/// <see cref="ISiteStreamAuditClient"/>, optional
+/// <see cref="IOperationTrackingStore"/>) so unit tests substitute with
+/// NSubstitute and never touch real SQLite or gRPC. The
+/// <see cref="IOperationTrackingStore"/> is optional — central composition
+/// roots and tests that don't exercise the cached path can leave it null, in
+/// which case the cached-drain scheduler is never armed.
+/// </para>
+/// <para>
+/// Per Bundle D's brief, audit-write paths must be fail-safe — a thrown
+/// exception inside the actor MUST NOT crash it. Both Drain handlers wrap
+/// their pipelines in a top-level try/catch that logs and re-schedules; the
+/// actor's <see cref="SupervisorStrategy"/> defaults to
+/// <see cref="Akka.Actor.SupervisorStrategy.DefaultStrategy"/>'s Restart for
+/// child actors — but this actor has no children, so the catch is what
+/// matters.
+/// </para>
+/// <para>
+/// AuditLog-001: wires the previously-unreachable combined-telemetry transport.
+/// Prior to this the cached audit rows flowed through the audit-only drain via
+/// <c>IngestAuditEventsAsync</c> and the central <c>OnCachedTelemetryAsync</c>
+/// dual-write handler was dead production code; the operational <c>SiteCalls</c>
+/// half was never sent to central.
+/// </para>
+/// </remarks>
+public class SiteAuditTelemetryActor : ReceiveActor
+{
+    private readonly ISiteAuditQueue _queue;
+    private readonly ISiteStreamAuditClient _client;
+    private readonly IOperationTrackingStore? _trackingStore;
+    private readonly SiteAuditTelemetryOptions _options;
+    private readonly ILogger<SiteAuditTelemetryActor> _logger;
+    private ICancelable? _pendingTick;
+    private ICancelable? _pendingCachedTick;
+    // AuditLog-010: per-actor lifecycle CTS so an in-flight drain (queue read,
+    // push to central, mark-forwarded write) is actually cancelled when the
+    // actor is stopped — without it, a stuck IngestAuditEventsAsync would hold
+    // the continuation through CoordinatedShutdown's actor-system terminate
+    // window. Cancelled in PostStop; never reset (the actor is
+    // single-lifetime). The same CTS gates the cached-drain pipeline (queue
+    // read + tracking lookup + push) so both paths observe shutdown
+    // cooperatively.
+    private readonly CancellationTokenSource _lifecycleCts = new();
+
+    /// <summary>Initializes the actor with its drain queue, push client, options, and logger.</summary>
+    /// <param name="queue">The site-local SQLite audit queue to drain.</param>
+    /// <param name="client">The client used to push audit telemetry to central.</param>
+    /// <param name="options">Telemetry options controlling drain intervals and batch size.</param>
+    /// <param name="logger">Logger instance.</param>
+    /// <param name="trackingStore">
+    /// Optional site-local operation tracking store. When supplied the actor
+    /// runs the combined-telemetry cached-drain in parallel with the audit-only
+    /// drain; when null (central composition roots, tests that don't exercise
+    /// cached calls) the cached scheduler is never armed and only the
+    /// audit-only drain runs.
+    /// </param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="queue"/>, <paramref name="client"/>,
+    /// <paramref name="options"/> or <paramref name="logger"/> is null.
+    /// </exception>
+    public SiteAuditTelemetryActor(
+        ISiteAuditQueue queue,
+        ISiteStreamAuditClient client,
+        IOptions<SiteAuditTelemetryOptions> options,
+        ILogger<SiteAuditTelemetryActor> logger,
+        IOperationTrackingStore? trackingStore = null)
+    {
+        ArgumentNullException.ThrowIfNull(queue);
+        ArgumentNullException.ThrowIfNull(client);
+        ArgumentNullException.ThrowIfNull(options);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _queue = queue;
+        _client = client;
+        _options = options.Value;
+        _logger = logger;
+        _trackingStore = trackingStore;
+
+        ReceiveAsync<Drain>(_ => OnDrainAsync());
+        ReceiveAsync<CachedDrain>(_ => OnCachedDrainAsync());
+    }
+
+    /// <inheritdoc />
+    protected override void PreStart()
+    {
+        base.PreStart();
+        // Initial ticks fire on the busy interval so both drains start polling
+        // soon after host startup. A subsequent empty drain will move to the
+        // idle interval naturally.
+        ScheduleNext(TimeSpan.FromSeconds(_options.BusyIntervalSeconds));
+        if (_trackingStore is not null)
+        {
+            ScheduleNextCached(TimeSpan.FromSeconds(_options.BusyIntervalSeconds));
+        }
+    }
+
+    /// <inheritdoc />
+    protected override void PostStop()
+    {
+        _pendingTick?.Cancel();
+        _pendingCachedTick?.Cancel();
+        // AuditLog-010: cancel any in-flight drain so a stuck queue read or
+        // push does not hold the continuation past actor stop.
+        try
+        {
+            _lifecycleCts.Cancel();
+        }
+        catch (ObjectDisposedException)
+        {
+            // PostStop may run after a prior Dispose path — benign.
+        }
+        _lifecycleCts.Dispose();
+        base.PostStop();
+    }
+
+    /// <summary>
+    /// Audit-only drain: reads up to <c>BatchSize</c> Pending rows, pushes them
+    /// via <see cref="ISiteStreamAuditClient.IngestAuditEventsAsync"/>, marks
+    /// the acknowledged ids as forwarded, and arms the next tick.
+    /// </summary>
+    /// <remarks>
+    /// The awaits deliberately do NOT use <c>ConfigureAwait(false)</c>. This
+    /// method runs inside a <c>ReceiveAsync</c> handler and its
+    /// <c>finally</c> block calls <see cref="ScheduleNext"/>, which reads
+    /// <c>Self</c> and <c>Context</c>. Those are only available while the
+    /// continuation runs on the actor's own task scheduler; escaping it would
+    /// make the re-schedule throw and fault the actor.
+    /// </remarks>
+    private async Task OnDrainAsync()
+    {
+        var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
+        // AuditLog-010: route every async dependency call through the
+        // per-actor lifecycle token so PostStop cancellation actually
+        // propagates into the queue read, the push, and the mark-forwarded
+        // write. An OperationCanceledException is absorbed by the catch
+        // blocks below.
+        var ct = _lifecycleCts.Token;
+        try
+        {
+            var pending = await _queue.ReadPendingAsync(_options.BatchSize, ct);
+            if (pending.Count == 0)
+            {
+                // No rows — settle into the idle cadence until the next write
+                // bumps us back into the busy cadence.
+                nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
+                return;
+            }
+
+            var batch = BuildBatch(pending);
+
+            IngestAck ack;
+            try
+            {
+                ack = await _client.IngestAuditEventsAsync(batch, ct);
+            }
+            catch (Exception ex)
+            {
+                // Transport fault — leave the rows in Pending so the next
+                // drain retries. Bundle D's brief: "On gRPC exception (any),
+                // log Warning, schedule next Drain in BusyIntervalSeconds."
+                _logger.LogWarning(ex,
+                    "IngestAuditEvents push failed for {Count} pending events; will retry next drain.",
+                    pending.Count);
+                return;
+            }
+
+            var acceptedIds = ParseAcceptedIds(ack);
+            if (acceptedIds.Count > 0)
+            {
+                await _queue.MarkForwardedAsync(acceptedIds, ct);
+            }
+        }
+        catch (Exception ex)
+        {
+            // Catch-all so a SQLite hiccup or mapper bug never crashes the
+            // actor. The next tick is still scheduled in the finally block.
+            _logger.LogError(ex, "Unexpected error during audit-log telemetry drain.");
+        }
+        finally
+        {
+            // AuditLog-010: if the actor is already shutting down, do not
+            // arm another tick — the scheduler would fire after PostStop and
+            // the message would land in dead letters.
+            if (!_lifecycleCts.IsCancellationRequested)
+            {
+                ScheduleNext(nextDelay);
+            }
+        }
+    }
+
+    /// <summary>
+    /// AuditLog-001: combined-telemetry drain. Reads cached-lifecycle audit
+    /// rows, joins each with the matching <see cref="IOperationTrackingStore"/>
+    /// snapshot, builds a <see cref="CachedTelemetryBatch"/>, and pushes via
+    /// <see cref="ISiteStreamAuditClient.IngestCachedTelemetryAsync"/>. Rows
+    /// whose tracking snapshot is missing (race with retention purge / late
+    /// audit row) are logged + skipped — the operational half will be
+    /// re-emitted on the next lifecycle event, and the audit row stays
+    /// <see cref="Commons.Types.Enums.AuditForwardState.Pending"/> so a later
+    /// drain (or reconciliation pull) can revisit it.
+    /// </summary>
+    /// <remarks>
+    /// As in <see cref="OnDrainAsync"/>, the awaits omit
+    /// <c>ConfigureAwait(false)</c> so the continuation stays on the actor's
+    /// task scheduler and <see cref="ScheduleNextCached"/> can safely touch
+    /// <c>Self</c> / <c>Context</c> in the <c>finally</c> block.
+    /// </remarks>
+    private async Task OnCachedDrainAsync()
+    {
+        var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
+        var ct = _lifecycleCts.Token;
+        try
+        {
+            // _trackingStore is non-null by construction here — the cached
+            // scheduler is only armed when it was supplied (see PreStart).
+            // Defensive check kept for clarity and to silence the compiler's
+            // null-flow analysis.
+            if (_trackingStore is null)
+            {
+                return;
+            }
+
+            var pending = await _queue
+                .ReadPendingCachedTelemetryAsync(_options.BatchSize, ct);
+            if (pending.Count == 0)
+            {
+                nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
+                return;
+            }
+
+            var batch = new CachedTelemetryBatch();
+
+            foreach (var auditRow in pending)
+            {
+                if (auditRow.CorrelationId is null)
+                {
+                    // CorrelationId carries the TrackedOperationId for cached
+                    // rows — see CachedCallLifecycleBridge.BuildPacket. Without
+                    // it we can't look up the tracking row; log + skip so the
+                    // bad row doesn't block the rest of the batch. The audit
+                    // row stays Pending (it is never added to the batch) and
+                    // central reconciliation will pick it up.
+                    _logger.LogWarning(
+                        "Cached-telemetry drain: audit row {EventId} ({Kind}) has no CorrelationId; skipping.",
+                        auditRow.EventId, auditRow.Kind);
+                    continue;
+                }
+
+                TrackingStatusSnapshot? snapshot;
+                try
+                {
+                    snapshot = await _trackingStore
+                        .GetStatusAsync(new TrackedOperationId(auditRow.CorrelationId.Value), ct);
+                }
+                catch (Exception ex)
+                {
+                    // A tracking-store throw must NOT abort the rest of the
+                    // batch — the audit half is best-effort. Log and skip
+                    // this row; it stays Pending for the next drain.
+                    _logger.LogWarning(ex,
+                        "Cached-telemetry drain: tracking lookup threw for {EventId} (TrackedOperationId {Tid}); skipping.",
+                        auditRow.EventId, auditRow.CorrelationId);
+                    continue;
+                }
+
+                if (snapshot is null)
+                {
+                    // No tracking row — possible if the audit row is older
+                    // than the tracking retention window, or the tracking
+                    // store was reset. The audit half remains valid and will
+                    // be picked up by central reconciliation; skip the
+                    // combined push for this row.
+                    _logger.LogWarning(
+                        "Cached-telemetry drain: no tracking snapshot for {EventId} (TrackedOperationId {Tid}); skipping.",
+                        auditRow.EventId, auditRow.CorrelationId);
+                    continue;
+                }
+
+                batch.Packets.Add(BuildCachedPacket(auditRow, snapshot));
+            }
+
+            if (batch.Packets.Count == 0)
+            {
+                // Every row in this read was skipped (no CorrelationId / no
+                // tracking snapshot). Leave them Pending and try again next
+                // drain — the underlying race normally resolves on its own.
+                return;
+            }
+
+            IngestAck ack;
+            try
+            {
+                ack = await _client.IngestCachedTelemetryAsync(batch, ct);
+            }
+            catch (Exception ex)
+            {
+                // Transport fault — rows stay Pending and are retried on the
+                // next (busy-interval) drain.
+                _logger.LogWarning(ex,
+                    "IngestCachedTelemetry push failed for {Count} cached events; will retry next drain.",
+                    batch.Packets.Count);
+                return;
+            }
+
+            var acceptedIds = ParseAcceptedIds(ack);
+            if (acceptedIds.Count > 0)
+            {
+                await _queue.MarkForwardedAsync(acceptedIds, ct);
+            }
+        }
+        catch (Exception ex)
+        {
+            // Catch-all so an unexpected failure never crashes the actor; the
+            // next tick is still armed in the finally block.
+            _logger.LogError(ex, "Unexpected error during cached-telemetry drain.");
+        }
+        finally
+        {
+            // Skip re-arming once shutdown has begun (the tick would land in
+            // dead letters) or when the cached path is not configured.
+            if (!_lifecycleCts.IsCancellationRequested && _trackingStore is not null)
+            {
+                ScheduleNextCached(nextDelay);
+            }
+        }
+    }
+
+    /// <summary>Maps the pending audit entities into a wire-level <see cref="AuditEventBatch"/>.</summary>
+    private static AuditEventBatch BuildBatch(IReadOnlyList<AuditEvent> events)
+    {
+        var batch = new AuditEventBatch();
+        foreach (var e in events)
+        {
+            batch.Events.Add(AuditEventDtoMapper.ToDto(e));
+        }
+        return batch;
+    }
+
+    /// <summary>
+    /// AuditLog-001: build the combined wire packet from one cached audit row
+    /// + its matching operational tracking snapshot. The operational state
+    /// reflects the latest tracking row at emission time (not the per-event
+    /// status the audit row implies) because central's <c>SiteCalls</c>
+    /// upsert is monotonic — it never rolls back. The audit row preserves
+    /// per-event lifecycle granularity for the audit trail.
+    /// </summary>
+    private static CachedTelemetryPacket BuildCachedPacket(
+        AuditEvent auditRow, TrackingStatusSnapshot snapshot)
+    {
+        var sourceSite = auditRow.SourceSiteId ?? string.Empty;
+        // Channel string form mirrors the AuditChannel-to-string convention used
+        // by SiteCallOperational + CachedCallLifecycleBridge.BuildPacket.
+        var channelString = auditRow.Channel.ToString();
+        var target = auditRow.Target ?? snapshot.TargetSummary ?? string.Empty;
+
+        var operationalDto = new SiteCallOperationalDto
+        {
+            TrackedOperationId = snapshot.Id.Value.ToString("D"),
+            Channel = channelString,
+            Target = target,
+            SourceSite = sourceSite,
+            SourceNode = snapshot.SourceNode ?? string.Empty,
+            Status = snapshot.Status,
+            RetryCount = snapshot.RetryCount,
+            LastError = snapshot.LastError ?? string.Empty,
+            CreatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.CreatedAtUtc)),
+            UpdatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.UpdatedAtUtc)),
+        };
+        // Optional fields are only assigned when the snapshot carries a value.
+        if (snapshot.HttpStatus.HasValue)
+        {
+            operationalDto.HttpStatus = snapshot.HttpStatus.Value;
+        }
+        if (snapshot.TerminalAtUtc.HasValue)
+        {
+            operationalDto.TerminalAtUtc =
+                Timestamp.FromDateTime(EnsureUtc(snapshot.TerminalAtUtc.Value));
+        }
+
+        return new CachedTelemetryPacket
+        {
+            AuditEvent = AuditEventDtoMapper.ToDto(auditRow),
+            Operational = operationalDto,
+        };
+    }
+
+    /// <summary>
+    /// Normalises a snapshot timestamp to <see cref="DateTimeKind.Utc"/> so it
+    /// can be passed to <c>Timestamp.FromDateTime</c>.
+    /// </summary>
+    /// <remarks>
+    /// The callers pass fields named <c>*Utc</c>, so an
+    /// <see cref="DateTimeKind.Unspecified"/> value is stamped as UTC without
+    /// shifting. Calling <see cref="DateTime.ToUniversalTime"/> on it would
+    /// treat it as local time and move it by the machine's UTC offset. Only
+    /// genuinely <see cref="DateTimeKind.Local"/> values are converted.
+    /// NOTE(review): assumes the tracking store can return Unspecified-kind
+    /// values that are already UTC — confirm against the store's reader.
+    /// </remarks>
+    private static DateTime EnsureUtc(DateTime value) =>
+        value.Kind switch
+        {
+            DateTimeKind.Utc => value,
+            DateTimeKind.Local => value.ToUniversalTime(),
+            _ => DateTime.SpecifyKind(value, DateTimeKind.Utc),
+        };
+
+    /// <summary>
+    /// Parses the string ids in <paramref name="ack"/> into GUIDs, dropping any
+    /// that do not parse.
+    /// </summary>
+    private static IReadOnlyList<Guid> ParseAcceptedIds(IngestAck ack)
+    {
+        if (ack.AcceptedEventIds.Count == 0)
+        {
+            return Array.Empty<Guid>();
+        }
+
+        var list = new List<Guid>(ack.AcceptedEventIds.Count);
+        foreach (var raw in ack.AcceptedEventIds)
+        {
+            if (Guid.TryParse(raw, out var id))
+            {
+                list.Add(id);
+            }
+            // Malformed ids are ignored — central should never emit them, but
+            // we refuse to crash the actor over a bad string.
+        }
+        return list;
+    }
+
+    /// <summary>
+    /// Replaces any pending audit-only tick with a new one due after
+    /// <paramref name="delay"/>. Must be called from within the actor context.
+    /// </summary>
+    private void ScheduleNext(TimeSpan delay)
+    {
+        _pendingTick?.Cancel();
+        _pendingTick = Context.System.Scheduler.ScheduleTellOnceCancelable(
+            delay,
+            Self,
+            Drain.Instance,
+            Self);
+    }
+
+    /// <summary>
+    /// Replaces any pending cached-drain tick with a new one due after
+    /// <paramref name="delay"/>. Must be called from within the actor context.
+    /// </summary>
+    private void ScheduleNextCached(TimeSpan delay)
+    {
+        _pendingCachedTick?.Cancel();
+        _pendingCachedTick = Context.System.Scheduler.ScheduleTellOnceCancelable(
+            delay,
+            Self,
+            CachedDrain.Instance,
+            Self);
+    }
+
+    /// <summary>Self-tick message that triggers an audit-only drain cycle.</summary>
+    private sealed class Drain
+    {
+        public static readonly Drain Instance = new();
+        private Drain() { }
+    }
+
+    /// <summary>
+    /// Self-tick message that triggers a combined-telemetry drain cycle.
+    /// AuditLog-001: introduced alongside the cached-drain to keep the two
+    /// paths' cadences independent — a stall on one does not block the other.
+    /// </summary>
+    private sealed class CachedDrain
+    {
+        public static readonly CachedDrain Instance = new();
+        private CachedDrain() { }
+    }
+}
@@ -0,0 +1,28 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
+
+/// <summary>
+/// Tuning knobs for the site-side <see cref="SiteAuditTelemetryActor"/> drain
+/// loop. Defaults mirror Bundle D's plan: drain every 5 s while rows are
+/// flowing (busy), every 30 s when the queue is empty (idle).
+/// </summary>
+public sealed class SiteAuditTelemetryOptions
+{
+    /// <summary>
+    /// Maximum number of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
+    /// rows read from the site SQLite queue and pushed to central in a single
+    /// batch. The actor passes this limit to both its audit-only and its
+    /// cached-telemetry queue reads. Defaults to 256.
+    /// </summary>
+    public int BatchSize { get; set; } = 256;
+
+    /// <summary>
+    /// Delay, in seconds, between drains when the previous drain found at
+    /// least one Pending row OR the previous push faulted. Re-drain quickly to
+    /// keep telemetry flowing and to retry transient push errors. Also used
+    /// for the first tick armed when the actor starts. Defaults to 5.
+    /// </summary>
+    public int BusyIntervalSeconds { get; set; } = 5;
+
+    /// <summary>
+    /// Delay, in seconds, between drains when the previous drain found no
+    /// Pending rows. The longer interval avoids hammering an idle SQLite queue
+    /// and transport. Defaults to 30.
+    /// </summary>
+    public int IdleIntervalSeconds { get; set; } = 30;
+}