using Microsoft.Extensions.Logging;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.AuditLog.Redaction;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using IAuditWriter = ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.IAuditWriter;

namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;

/// <summary>
/// Composes the primary <see cref="IAuditWriter"/> with a drop-oldest
/// <see cref="RingBufferFallback"/>. Audit writes are best-effort by contract
/// (see <see cref="IAuditWriter"/>) — a primary failure must NEVER bubble out
/// to the calling script. Failed events are stashed in the ring; on the next
/// successful primary write the ring is drained back through the primary in
/// FIFO order.
/// </summary>
/// <remarks>
/// <para>
/// Each primary failure increments <see cref="IAuditWriteFailureCounter"/> so
/// Site Health Monitoring can surface a sustained outage as
/// <c>SiteAuditWriteFailures</c> (Bundle G).
/// </para>
/// <para>
/// Errors raised by the ring drain on recovery are logged and silently dropped
/// so we don't loop the failure mode — the trigger event itself succeeded, and
/// retrying the drain on the NEXT successful write is the recovery path.
/// </para>
/// </remarks>
public sealed class FallbackAuditWriter : IAuditWriter
{
    private readonly IAuditWriter _primary;
    private readonly RingBufferFallback _ring;
    private readonly IAuditWriteFailureCounter _failureCounter;
    private readonly ILogger _logger;
    private readonly IAuditRedactor _redactor;

    // Single-permit gate: at most one drain runs at a time.
    private readonly SemaphoreSlim _drainGate = new(1, 1);

    /// <summary>
    /// Bundle C (M5-T6) wires the singleton <see cref="IAuditRedactor"/>
    /// here so every event written via the site hot path is truncated +
    /// header/body/SQL-param redacted before it hits both the primary SQLite
    /// writer AND the ring fallback. The parameter is optional (defaults to
    /// the always-safe <see cref="SafeDefaultAuditRedactor"/>) so the long
    /// tail of test composition roots that don't care about the redactor need
    /// no change — the production AddAuditLog registration always passes the
    /// real redactor through.
    /// </summary>
    /// <param name="primary">The primary audit writer (typically the SQLite writer).</param>
    /// <param name="ring">Drop-oldest ring buffer used to stash events when the primary fails.</param>
    /// <param name="failureCounter">Counter incremented on each primary failure for health reporting.</param>
    /// <param name="logger">Logger for diagnostics.</param>
    /// <param name="redactor">Optional canonical redactor applied before writing; null means the always-safe default.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="primary"/>, <paramref name="ring"/>,
    /// <paramref name="failureCounter"/> or <paramref name="logger"/> is null.
    /// </exception>
    public FallbackAuditWriter(
        IAuditWriter primary,
        RingBufferFallback ring,
        IAuditWriteFailureCounter failureCounter,
        ILogger logger,
        IAuditRedactor? redactor = null)
    {
        _primary = primary ?? throw new ArgumentNullException(nameof(primary));
        _ring = ring ?? throw new ArgumentNullException(nameof(ring));
        _failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));

        // AuditLog-008: never default to a null redactor — over-redact instead.
        // C3 (Task 2.5): wired via the canonical IAuditRedactor seam.
        // SafeDefaultAuditRedactor performs HTTP header redaction with the
        // hard-coded sensitive defaults (Authorization, X-Api-Key, Cookie,
        // Set-Cookie) on the DetailsJson summaries so a test composition root
        // that doesn't bind the real options never persists those headers
        // verbatim. The full ScadaBridgeAuditRedactor (truncation + body /
        // SQL-param redaction) is wired by AddAuditLog and takes precedence.
        _redactor = redactor ?? SafeDefaultAuditRedactor.Instance;
    }

    /// <inheritdoc/>
    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(evt);

        // Redact once, up-front. The redacted event flows BOTH to the primary
        // and (on failure) to the ring buffer — so a primary outage that
        // drains later still hands the SqliteAuditWriter a row that has
        // already been truncated and redacted. The redactor contract is
        // "MUST NOT throw". AuditLog-008: _redactor is now non-null (defaults
        // to SafeDefaultAuditRedactor so header redaction is always applied
        // even in composition roots that don't wire the real redactor).
        var filtered = _redactor.Apply(evt);

        try
        {
            await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Primary down: record the failure, stash in the ring, return
            // success to the caller. Audit-write failures NEVER abort the
            // user-facing action (alog.md §7). DO NOT attempt the ring drain
            // here — primary is throwing, draining would just scramble FIFO
            // order across re-enqueues.
            _failureCounter.Increment();
            _logger.LogWarning(
                ex,
                "Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
                filtered.EventId);

            // Ring stores the filtered copy so the eventual drain replays a
            // payload that has already been capped/redacted — no second
            // filter pass needed on recovery, and no risk of the ring
            // holding the raw oversized blob in memory.
            _ring.TryEnqueue(filtered);
            return;
        }

        // Primary succeeded — opportunistically drain anything that piled up
        // in the ring during the outage. Best-effort: a failure during the
        // drain re-enqueues the popped event and is logged; the next
        // successful write will retry. Order in the audit log is therefore:
        // the triggering event first, then the buffered backlog.
        if (_ring.Count > 0)
        {
            await TryDrainRingAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Replays the events currently buffered in the ring through the primary
    /// writer. Never throws for gate contention or cancellation: if another
    /// drain is already running, or <paramref name="ct"/> is cancelled, the
    /// backlog is left in place for the next successful write to retry.
    /// </summary>
    /// <param name="ct">Token forwarded to each replayed primary write.</param>
    private async Task TryDrainRingAsync(CancellationToken ct)
    {
        // The triggering write already succeeded; a cancelled token must not
        // turn this best-effort drain into an exception for the caller.
        if (ct.IsCancellationRequested)
        {
            return;
        }

        // Serialise drains so two concurrent recoveries don't double-replay.
        // A zero-timeout probe never blocks, so no token is passed here:
        // WaitAsync(0, ct) would surface an OperationCanceledException to the
        // caller of WriteAsync if the token were cancelled at this point.
        if (!_drainGate.Wait(0))
        {
            return;
        }

        try
        {
            // Pull only what is currently buffered; do NOT wait for new events.
            // We iterate with a snapshot of Count so we never starve under
            // concurrent enqueues.
            var pending = _ring.Count;
            for (var i = 0; i < pending; i++)
            {
                // Stop BEFORE popping once cancelled: dequeuing an event only
                // to have the write cancel would bounce it from head to tail
                // and count a failure the primary did not cause.
                if (ct.IsCancellationRequested)
                {
                    break;
                }

                if (!_ring.TryDequeue(out var queued))
                {
                    break;
                }

                try
                {
                    await _primary.WriteAsync(queued, ct).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    // Primary fell over again. Putting the event back at the
                    // head of the queue is impossible with Channel, so route
                    // it to the tail (drop-oldest preserves the most-recent
                    // picture) and stop draining until the next recovery.
                    _failureCounter.Increment();
                    _logger.LogWarning(
                        ex,
                        "Ring drain re-throw on EventId {EventId}; re-enqueuing.",
                        queued.EventId);
                    _ring.TryEnqueue(queued);
                    break;
                }
            }
        }
        finally
        {
            _drainGate.Release();
        }
    }
}