ScadaBridge/src/ScadaLink.AuditLog/Site/FallbackAuditWriter.cs

using Microsoft.Extensions.Logging;
using ScadaLink.AuditLog.Payload;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Interfaces.Services;

namespace ScadaLink.AuditLog.Site;

/// <summary>
/// Composes the primary <see cref="SqliteAuditWriter"/> with a drop-oldest
/// <see cref="RingBufferFallback"/>. Audit writes are best-effort by contract
/// (see <see cref="IAuditWriter"/>) — a primary failure must NEVER bubble out
/// to the calling script. Failed events are stashed in the ring; on the next
/// successful primary write the ring is drained back through the primary in
/// FIFO order.
/// </summary>
/// <remarks>
/// <para>
/// Each primary failure increments <see cref="IAuditWriteFailureCounter"/> so
/// Site Health Monitoring can surface a sustained outage as
/// <c>SiteAuditWriteFailures</c> (Bundle G).
/// </para>
/// <para>
/// Errors raised by the ring drain on recovery are logged and swallowed so we
/// don't loop the failure mode — the trigger event itself succeeded. The
/// event that failed to replay is re-enqueued at the tail of the ring, and
/// retrying the drain on the NEXT successful write is the recovery path.
/// </para>
/// <para>
/// Cancellation never surfaces from the drain path: the drain gate is
/// acquired without observing the caller's token, and a cancelled token stops
/// the drain before the next event is dequeued so the backlog is left intact
/// and in order.
/// </para>
/// </remarks>
public sealed class FallbackAuditWriter : IAuditWriter
{
    private readonly IAuditWriter _primary;
    private readonly RingBufferFallback _ring;
    private readonly IAuditWriteFailureCounter _failureCounter;
    private readonly ILogger<FallbackAuditWriter> _logger;
    private readonly IAuditPayloadFilter? _filter;

    // Single-permit gate: at most one drain runs at a time. Acquired with a
    // zero timeout, so contending writers skip the drain instead of queueing.
    private readonly SemaphoreSlim _drainGate = new(1, 1);

    /// <summary>
    /// Bundle C (M5-T6) wires the singleton <see cref="IAuditPayloadFilter"/>
    /// here so every event written via the site hot path is truncated +
    /// header/body/SQL-param redacted before it hits both the primary SQLite
    /// writer AND the ring fallback. The parameter is optional (defaults to
    /// no filtering) so the long tail of test composition roots that don't
    /// care about the filter need no change — the production
    /// <see cref="ServiceCollectionExtensions.AddAuditLog"/> registration
    /// always passes the real filter through.
    /// </summary>
    /// <param name="primary">Writer every event is attempted against first.</param>
    /// <param name="ring">Fallback buffer that receives events the primary rejected.</param>
    /// <param name="failureCounter">Incremented once per failed primary write (including failed replays).</param>
    /// <param name="logger">Receives a warning for each failed primary write.</param>
    /// <param name="filter">Optional payload filter; <c>null</c> means events pass through unmodified.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="primary"/>, <paramref name="ring"/>,
    /// <paramref name="failureCounter"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    public FallbackAuditWriter(
        IAuditWriter primary,
        RingBufferFallback ring,
        IAuditWriteFailureCounter failureCounter,
        ILogger<FallbackAuditWriter> logger,
        IAuditPayloadFilter? filter = null)
    {
        _primary = primary ?? throw new ArgumentNullException(nameof(primary));
        _ring = ring ?? throw new ArgumentNullException(nameof(ring));
        _failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _filter = filter; // null = no-op pass-through; see WriteAsync.
    }

    /// <summary>
    /// Filters <paramref name="evt"/>, attempts the primary write, and on any
    /// primary exception (cancellation included, since every
    /// <see cref="Exception"/> is caught) stashes the filtered event in the
    /// ring instead of throwing. After a successful primary write, any
    /// buffered backlog is opportunistically replayed.
    /// </summary>
    /// <param name="evt">The audit event to record.</param>
    /// <param name="ct">Token forwarded to the primary writer.</param>
    /// <exception cref="ArgumentNullException"><paramref name="evt"/> is <c>null</c>.</exception>
    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(evt);

        // Filter once, up-front. The filtered event flows BOTH to the primary
        // and (on failure) to the ring buffer — so a primary outage that
        // drains later still hands the SqliteAuditWriter a row that has
        // already been truncated and redacted. The filter contract is
        // "MUST NOT throw"; the null-coalesce keeps test composition roots
        // that don't wire a filter working unchanged.
        var filtered = _filter?.Apply(evt) ?? evt;

        try
        {
            await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Primary down: record the failure, stash in the ring, return
            // success to the caller. Audit-write failures NEVER abort the
            // user-facing action (alog.md §7). DO NOT attempt the ring drain
            // here — primary is throwing, draining would just scramble FIFO
            // order across re-enqueues.
            _failureCounter.Increment();
            _logger.LogWarning(ex,
                "Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
                filtered.EventId);
            // Ring stores the filtered copy so the eventual drain replays a
            // payload that has already been capped/redacted — no second
            // filter pass needed on recovery, and no risk of the ring
            // holding the raw oversized blob in memory.
            _ring.TryEnqueue(filtered);
            return;
        }

        // Primary succeeded — opportunistically drain anything that piled up
        // in the ring during the outage. Best-effort: a failure during the
        // drain re-enqueues the popped event and is logged; the next
        // successful write will retry. Drain order in the audit log is
        // therefore: <triggering event>, <backlog FIFO>.
        if (_ring.Count > 0)
        {
            await TryDrainRingAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Replays the events currently buffered in the ring through the primary
    /// writer. Returns immediately if another drain already holds the gate.
    /// Stops at the first replay failure (re-enqueuing that event) or as soon
    /// as <paramref name="ct"/> is observed cancelled.
    /// </summary>
    /// <param name="ct">Token forwarded to the primary writer for each replayed event.</param>
    private async Task TryDrainRingAsync(CancellationToken ct)
    {
        // Serialise drains so two concurrent recoveries don't double-replay.
        // The try-acquire deliberately ignores the caller's token: passing an
        // already-cancelled token to WaitAsync throws
        // OperationCanceledException, which would escape WriteAsync even
        // though the triggering write succeeded.
        if (!await _drainGate.WaitAsync(0, CancellationToken.None).ConfigureAwait(false))
        {
            return;
        }

        try
        {
            // Pull only what is currently buffered; do NOT wait for new events.
            // We iterate with a snapshot of Count so we never starve under
            // concurrent enqueues.
            var pending = _ring.Count;
            for (var i = 0; i < pending; i++)
            {
                // Check BEFORE dequeuing: a replay attempted with a cancelled
                // token would only bounce the head event to the tail (breaking
                // FIFO) and record a failure that is not a primary outage.
                if (ct.IsCancellationRequested)
                {
                    break;
                }

                if (!_ring.TryDequeue(out var queued))
                {
                    break;
                }

                try
                {
                    await _primary.WriteAsync(queued, ct).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    // Primary fell over again. Putting the event back at the
                    // head of the queue is impossible with Channel<T>, so
                    // route it to the tail (drop-oldest preserves the
                    // most-recent picture) and stop draining.
                    _failureCounter.Increment();
                    _logger.LogWarning(ex,
                        "Ring drain re-throw on EventId {EventId}; re-enqueuing.",
                        queued.EventId);
                    _ring.TryEnqueue(queued);
                    break;
                }
            }
        }
        finally
        {
            _drainGate.Release();
        }
    }
}