using Microsoft.Extensions.Logging;
using ScadaLink.AuditLog.Payload;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Interfaces.Services;

namespace ScadaLink.AuditLog.Site;

/// <summary>
/// Composes the primary <see cref="IAuditWriter"/> with a drop-oldest
/// <see cref="RingBufferFallback"/>. Audit writes are best-effort by contract:
/// a primary failure must NEVER bubble out to the calling script. Failed
/// events are stashed in the ring; on the next successful primary write the
/// ring is drained back through the primary in FIFO order.
/// </summary>
/// <remarks>
/// <para>
/// Each primary failure increments the <see cref="IAuditWriteFailureCounter"/>
/// so Site Health Monitoring can surface a sustained outage as
/// <c>SiteAuditWriteFailures</c> (Bundle G).
/// </para>
/// <para>
/// Errors raised by the ring drain on recovery are logged and silently dropped
/// so we don't loop the failure mode — the trigger event itself succeeded, and
/// retrying the drain on the NEXT successful write is the recovery path. The
/// same applies to cancellation: a cancelled token observed while draining
/// defers the drain instead of throwing to the caller.
/// </para>
/// </remarks>
public sealed class FallbackAuditWriter : IAuditWriter
{
    private readonly IAuditWriter _primary;
    private readonly RingBufferFallback _ring;
    private readonly IAuditWriteFailureCounter _failureCounter;
    private readonly ILogger _logger;
    private readonly IAuditPayloadFilter? _filter;

    // Single-permit gate: at most one drain runs at a time.
    private readonly SemaphoreSlim _drainGate = new(1, 1);

    /// <summary>
    /// Creates the fallback writer.
    /// </summary>
    /// <remarks>
    /// Bundle C (M5-T6) wires the singleton <see cref="IAuditPayloadFilter"/>
    /// here so every event written via the site hot path is truncated +
    /// header/body/SQL-param redacted before it hits both the primary SQLite
    /// writer AND the ring fallback. The parameter is optional (defaults to
    /// no filtering) so the long tail of test composition roots that don't
    /// care about the filter need no change — the production registration
    /// always passes the real filter through.
    /// </remarks>
    /// <param name="primary">The primary audit writer (typically the SQLite writer).</param>
    /// <param name="ring">Drop-oldest ring buffer used to stash events when the primary fails.</param>
    /// <param name="failureCounter">Counter incremented on each primary failure for health reporting.</param>
    /// <param name="logger">Logger for diagnostics.</param>
    /// <param name="filter">Optional payload filter applied before writing; null means no filtering.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="primary"/>, <paramref name="ring"/>,
    /// <paramref name="failureCounter"/> or <paramref name="logger"/> is null.
    /// </exception>
    public FallbackAuditWriter(
        IAuditWriter primary,
        RingBufferFallback ring,
        IAuditWriteFailureCounter failureCounter,
        ILogger logger,
        IAuditPayloadFilter? filter = null)
    {
        _primary = primary ?? throw new ArgumentNullException(nameof(primary));
        _ring = ring ?? throw new ArgumentNullException(nameof(ring));
        _failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _filter = filter; // null = no-op pass-through; see WriteAsync.
    }

    /// <inheritdoc />
    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(evt);

        // Filter once, up-front. The filtered event flows BOTH to the primary
        // and (on failure) to the ring buffer — so a primary outage that
        // drains later still hands the primary a row that has already been
        // truncated and redacted. The filter contract is "MUST NOT throw";
        // the null-coalesce keeps composition roots that don't wire a filter
        // working unchanged.
        var filtered = _filter?.Apply(evt) ?? evt;

        try
        {
            await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Primary down: record the failure, stash in the ring, return
            // success to the caller. Audit-write failures NEVER abort the
            // user-facing action (alog.md §7). DO NOT attempt the ring drain
            // here — primary is throwing, draining would just scramble FIFO
            // order across re-enqueues.
            _failureCounter.Increment();
            _logger.LogWarning(ex,
                "Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
                filtered.EventId);

            // Ring stores the filtered copy so the eventual drain replays a
            // payload that has already been capped/redacted — no second
            // filter pass needed on recovery, and no risk of the ring
            // holding the raw oversized blob in memory.
            _ring.TryEnqueue(filtered);
            return;
        }

        // Primary succeeded — opportunistically drain anything that piled up
        // in the ring during the outage. Best-effort: a failure during the
        // drain re-enqueues the popped event and is logged; the next
        // successful write will retry. Order in the audit log is therefore:
        // the triggering event first, then the buffered backlog in FIFO order.
        if (_ring.Count > 0)
        {
            await TryDrainRingAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Replays the events currently buffered in the ring through the primary
    /// writer. Returns immediately if another drain already holds the gate.
    /// Never throws for cancellation: a cancelled <paramref name="ct"/> stops
    /// the drain and leaves the remaining events in the ring.
    /// </summary>
    /// <param name="ct">Token forwarded to each replayed primary write.</param>
    private async Task TryDrainRingAsync(CancellationToken ct)
    {
        // Serialise drains so two concurrent recoveries don't double-replay.
        // Zero-timeout try-enter WITHOUT the token: passing an already
        // cancelled token to WaitAsync yields a cancelled task, which would
        // surface an OperationCanceledException to the caller even though the
        // triggering write already succeeded.
        if (!await _drainGate.WaitAsync(0).ConfigureAwait(false))
        {
            return;
        }

        try
        {
            // Pull only what is currently buffered; do NOT wait for new events.
            // Iterating over a snapshot of Count bounds the loop even when
            // other callers keep enqueuing concurrently.
            var pending = _ring.Count;
            for (var i = 0; i < pending; i++)
            {
                // Caller gave up: stop BEFORE dequeuing so the backlog keeps
                // its FIFO order and a doomed write is not miscounted as a
                // primary failure. The next successful write retries.
                if (ct.IsCancellationRequested)
                {
                    break;
                }

                if (!_ring.TryDequeue(out var queued))
                {
                    break;
                }

                try
                {
                    await _primary.WriteAsync(queued, ct).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    // Primary fell over again. Putting the event back at the
                    // head of the queue is impossible with Channel; route to
                    // the tail (drop-oldest preserves the most-recent picture).
                    _failureCounter.Increment();
                    _logger.LogWarning(ex,
                        "Ring drain re-throw on EventId {EventId}; re-enqueuing.",
                        queued.EventId);
                    _ring.TryEnqueue(queued);
                    break;
                }
            }
        }
        finally
        {
            _drainGate.Release();
        }
    }
}