refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj,
namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated.
ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated.
SQL roles/logins, LDAP domains, CLI command name, and CLI config dir
(~/.scadalink → ~/.scadabridge) also renamed.

Build green; 5 Host.Tests fail awaiting SQL login rename in next commit.
Pre-existing StaleTagMonitor timing flakes unchanged.

Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
Joseph Doherty
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,81 @@
using System.Collections.Concurrent;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T8, T9) — central singleton implementation of
/// <see cref="IAuditCentralHealthSnapshot"/>. Owns thread-safe
/// <see cref="System.Threading.Interlocked"/> counters for
/// <c>CentralAuditWriteFailures</c> + <c>AuditRedactionFailure</c> and a
/// per-site latched stalled-state map fed by the
/// <see cref="SiteAuditTelemetryStalledTracker"/>. Also implements the
/// writer surfaces (<see cref="ICentralAuditWriteFailureCounter"/> +
/// <see cref="IAuditRedactionFailureCounter"/>) so a single concrete object
/// is the source of truth — DI binds those two interfaces to this same
/// singleton instance on the central composition root.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why one type for read + write.</b> The writer interfaces are tiny
/// (<c>Increment()</c>) and the read surface needs visibility of those
/// counters anyway — having a single class own both means the
/// <c>Interlocked</c> field IS the snapshot value, no extra plumbing needed.
/// Mirrors the
/// <see cref="ZB.MOM.WW.ScadaBridge.HealthMonitoring.SiteHealthCollector"/> pattern where
/// the collector both receives and exposes the metric.
/// </para>
/// <para>
/// <b>Stalled-state plumbing.</b> The per-site stalled latch lives directly
/// on this snapshot. <see cref="SiteAuditTelemetryStalledTracker"/> is the
/// EventStream subscriber that pushes
/// <see cref="SiteAuditTelemetryStalledChanged"/> publications in via
/// <see cref="ApplyStalled"/>. Keeping the dictionary on this type (rather
/// than reading the tracker on every access) lets the snapshot be constructed
/// without an <see cref="Akka.Actor.ActorSystem"/> dependency — the tracker
/// is wired up later from the Akka bootstrap, once the system is built.
/// </para>
/// </remarks>
public sealed class AuditCentralHealthSnapshot
: IAuditCentralHealthSnapshot,
ICentralAuditWriteFailureCounter,
IAuditRedactionFailureCounter
{
private int _centralAuditWriteFailures;
private int _auditRedactionFailure;
private readonly ConcurrentDictionary<string, bool> _stalled = new();
/// <inheritdoc/>
public int CentralAuditWriteFailures =>
Interlocked.CompareExchange(ref _centralAuditWriteFailures, 0, 0);
/// <inheritdoc/>
public int AuditRedactionFailure =>
Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0);
/// <inheritdoc/>
public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled =>
new Dictionary<string, bool>(_stalled);
/// <summary>
/// Apply a <see cref="SiteAuditTelemetryStalledChanged"/> publication
/// observed by <see cref="SiteAuditTelemetryStalledTracker"/>. Public
/// so the tracker (which lives in the same assembly but is constructed
/// later from the Akka host) can push without a friend reference;
/// readers should call <see cref="SiteAuditTelemetryStalled"/>.
/// </summary>
/// <param name="evt">The event carrying the site ID and new stalled state.</param>
public void ApplyStalled(SiteAuditTelemetryStalledChanged evt)
{
if (evt is null) return;
_stalled[evt.SiteId] = evt.Stalled;
}
/// <inheritdoc/>
void ICentralAuditWriteFailureCounter.Increment() =>
Interlocked.Increment(ref _centralAuditWriteFailures);
/// <inheritdoc/>
void IAuditRedactionFailureCounter.Increment() =>
Interlocked.Increment(ref _auditRedactionFailure);
}
@@ -0,0 +1,306 @@
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central-side singleton (per Bundle E wiring) that ingests batches of
/// <see cref="AuditEvent"/> rows pushed from sites via the
/// <c>IngestAuditEvents</c> gRPC RPC. Each row is stamped with the central-side
/// <see cref="AuditEvent.IngestedAtUtc"/> and inserted idempotently via
/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
/// silently swallowed (first-write-wins per Bundle A's hardening).
/// </summary>
/// <remarks>
/// <para>
/// Idempotency is the contract: a row that already exists at central counts
/// as "accepted" for the purposes of the reply, because the storage state is
/// consistent and the site is free to flip its local row to <c>Forwarded</c>.
/// </para>
/// <para>
/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing
/// action. The actor wraps each repository call in its own try/catch so a
/// single bad row cannot cause the rest of the batch to be lost — that
/// per-row catch is what keeps this actor alive across handler throws, not
/// the supervisor strategy. The <see cref="SupervisorStrategy"/> override
/// returns the Akka default decider (Restart for most exceptions) and
/// governs children only; this actor has no children today, so the override
/// is a forward-compat placeholder.
/// </para>
/// <para>
/// Two constructors exist for a deliberate reason: Bundle D's tests inject a
/// concrete <see cref="IAuditLogRepository"/> against a per-test MSSQL fixture
/// (the only way to verify the IngestedAtUtc stamp + duplicate-key idempotency
/// end to end), while Bundle E's host wiring registers the actor as a cluster
/// singleton and must therefore resolve the repository — which is a scoped EF
/// Core service — from a fresh DI scope per message. Mirroring the Notification
/// Outbox actor's pattern.
/// </para>
/// </remarks>
public class AuditLogIngestActor : ReceiveActor
{
private readonly IServiceProvider? _serviceProvider;
private readonly IAuditLogRepository? _injectedRepository;
private readonly ILogger<AuditLogIngestActor> _logger;
/// <summary>
/// Test-mode constructor — injects a concrete repository instance whose
/// lifetime exceeds the test, so the actor reuses the same instance across
/// every message. Used by Bundle D's MSSQL-backed TestKit fixture.
/// </summary>
/// <param name="repository">Audit log repository instance shared across all messages.</param>
/// <param name="logger">Logger for ingest diagnostics.</param>
public AuditLogIngestActor(
IAuditLogRepository repository,
ILogger<AuditLogIngestActor> logger)
{
ArgumentNullException.ThrowIfNull(repository);
ArgumentNullException.ThrowIfNull(logger);
_injectedRepository = repository;
_logger = logger;
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
// The single-repository test ctor cannot service the M3 dual-write —
// it has no SiteCalls repo and no DbContext. The handler still
// registers (so callers don't dead-letter) but replies empty so the
// test surface stays explicit about what this ctor supports.
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryWithoutDualWriteAsync);
}
/// <summary>
/// Production constructor — resolves <see cref="IAuditLogRepository"/> from
/// a fresh DI scope per message because the repository is a scoped EF Core
/// service registered by <c>AddConfigurationDatabase</c>. The actor itself
/// is a long-lived cluster singleton, so it cannot hold a scope across
/// messages.
/// </summary>
/// <param name="serviceProvider">Root service provider used to open a fresh scope per message.</param>
/// <param name="logger">Logger for ingest diagnostics.</param>
public AuditLogIngestActor(
IServiceProvider serviceProvider,
ILogger<AuditLogIngestActor> logger)
{
ArgumentNullException.ThrowIfNull(serviceProvider);
ArgumentNullException.ThrowIfNull(logger);
_serviceProvider = serviceProvider;
_logger = logger;
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryAsync);
}
/// <inheritdoc />
protected override SupervisorStrategy SupervisorStrategy()
{
return new OneForOneStrategy(maxNrOfRetries: 0, withinTimeRange: TimeSpan.Zero, decider:
Akka.Actor.SupervisorStrategy.DefaultDecider);
}
private async Task OnIngestAsync(IngestAuditEventsCommand cmd)
{
// Sender is captured before the first await — Akka resets Sender
// between message dispatches, so a post-await Tell would go to
// DeadLetters.
var replyTo = Sender;
var nowUtc = DateTime.UtcNow;
var accepted = new List<Guid>(cmd.Events.Count);
// Resolve the repository for the whole batch — one DbContext per
// message, mirroring NotificationOutboxActor. The injected-repository
// mode (Bundle D tests) skips the scope entirely.
// Bundle C (M5-T6): the IAuditPayloadFilter is also resolved from the
// per-message scope when one is available so the row is truncated +
// redacted before InsertIfNotExistsAsync. The single-repository test
// ctor has no service provider — it falls through with no filter,
// which preserves the small-payload assumptions baked into the
// existing D2 fixtures.
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
// services (IAsyncDisposable DbContexts) dispose asynchronously
// without blocking on sync Dispose() of pending connection cleanup.
if (_injectedRepository is not null)
{
await IngestWithRepositoryAsync(_injectedRepository, filter: null, failureCounter: null, cmd, nowUtc, accepted)
.ConfigureAwait(false);
}
else
{
await using var scope = _serviceProvider!.CreateAsyncScope();
var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
// M6 Bundle E (T8): central health counter is best-effort —
// unregistered (test composition roots) means the per-row catch
// simply logs without surfacing on the health dashboard.
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
await IngestWithRepositoryAsync(repository, filter, failureCounter, cmd, nowUtc, accepted)
.ConfigureAwait(false);
}
replyTo.Tell(new IngestAuditEventsReply(accepted));
}
private async Task IngestWithRepositoryAsync(
IAuditLogRepository repository,
IAuditPayloadFilter? filter,
ICentralAuditWriteFailureCounter? failureCounter,
IngestAuditEventsCommand cmd,
DateTime nowUtc,
List<Guid> accepted)
{
foreach (var evt in cmd.Events)
{
try
{
// Stamp IngestedAtUtc here, not at the site. Bundle A's
// repository hardening already swallows duplicate-key races,
// so the same id arriving twice (site retry, reconciliation)
// is a silent no-op.
// Filter BEFORE the IngestedAtUtc stamp so the redacted
// copy carries the central-side ingest timestamp. Filter
// is contract-bound to never throw. AuditLog-008: a null
// filter (test composition root, no IAuditPayloadFilter
// registered) now falls back to the SafeDefault rather than
// pass-through, so HTTP header redaction always runs.
var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
var filtered = safeFilter.Apply(evt);
var ingested = filtered with { IngestedAtUtc = nowUtc };
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
accepted.Add(evt.EventId);
}
catch (Exception ex)
{
// Per-row catch — one bad row never sinks the whole batch.
// The row stays Pending at the site; the next drain retries.
// M6 Bundle E (T8): bump the central health counter so a
// sustained insert-throw failure surfaces on the dashboard.
try { failureCounter?.Increment(); }
catch { /* counter must never throw — defence in depth */ }
_logger.LogError(ex,
"Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.",
evt.EventId);
}
}
}
/// <summary>
/// M3 dual-write handler. For every <see cref="CachedTelemetryEntry"/> the
/// actor opens a fresh MS SQL transaction, inserts the AuditLog row
/// idempotently AND upserts the SiteCalls row monotonically. Both succeed
/// or both roll back, so the audit and operational mirrors never drift
/// mid-row. The IngestedAtUtc stamp is unified between the two rows so a
/// downstream join lines up cleanly.
/// </summary>
/// <remarks>
/// Per-entry isolation — one entry's failed transaction does NOT abort
/// other entries in the batch (each gets its own
/// <see cref="Microsoft.EntityFrameworkCore.RelationalDatabaseFacadeExtensions.BeginTransactionAsync"/>
/// scope and a try/catch around it). Audit-write failure NEVER aborts the
/// user-facing action — the site keeps the row Pending and retries on the
/// next drain.
/// </remarks>
private async Task OnCachedTelemetryAsync(IngestCachedTelemetryCommand cmd)
{
var replyTo = Sender;
var accepted = new List<Guid>(cmd.Entries.Count);
try
{
await using var scope = _serviceProvider!.CreateAsyncScope();
var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaBridgeDbContext>();
// Bundle C (M5-T6): resolve the filter for the whole batch from
// the scope; null = pass-through for test composition roots that
// skip the filter registration. The filter is contract-bound to
// never throw, so we can apply it inside the per-entry try
// without risking an unbounded blast radius.
var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
// M6 Bundle E (T8): same best-effort central health counter as
// the OnIngestAsync path — null on test composition roots that
// skip the registration.
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
foreach (var entry in cmd.Entries)
{
try
{
await using var tx = await dbContext.Database
.BeginTransactionAsync()
.ConfigureAwait(false);
// Stamp IngestedAtUtc on both rows from a single
// central-side instant so a join on the two tables sees
// matching timestamps (debugging convenience, not a
// correctness invariant).
var ingestedAt = DateTime.UtcNow;
// Filter the audit half BEFORE the dual-write — only the
// AuditLog row's payload columns are filterable; SiteCalls
// carries operational state only (status, retry count) and
// is left untouched. AuditLog-008: null filter falls back
// to SafeDefault so header redaction always runs.
var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
var filteredAudit = safeFilter.Apply(entry.Audit);
var auditStamped = filteredAudit with { IngestedAtUtc = ingestedAt };
var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };
await auditRepo.InsertIfNotExistsAsync(auditStamped)
.ConfigureAwait(false);
await siteCallRepo.UpsertAsync(siteCallStamped)
.ConfigureAwait(false);
await tx.CommitAsync().ConfigureAwait(false);
accepted.Add(entry.Audit.EventId);
}
catch (Exception ex)
{
// Both rows rolled back via the disposing transaction. The
// EventId is NOT added to `accepted` so the site keeps its
// row Pending and retries on the next drain. Other entries
// in the batch continue with their own transactions.
// M6 Bundle E (T8): bump the central health counter so a
// sustained dual-write failure surfaces on the dashboard.
try { failureCounter?.Increment(); }
catch { /* counter must never throw — defence in depth */ }
_logger.LogError(
ex,
"Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.",
entry.Audit.EventId,
entry.SiteCall.TrackedOperationId);
}
}
}
catch (Exception ex)
{
// Resolving the scope itself threw (e.g. DI mis-wiring). Log and
// reply with whatever we managed to accept (likely empty) — the
// central singleton MUST stay alive.
_logger.LogError(
ex,
"Combined telemetry batch ingest failed before per-entry processing.");
}
replyTo.Tell(new IngestCachedTelemetryReply(accepted));
}
/// <summary>
/// Fallback handler installed on the single-repository test ctor — that
/// ctor has no DbContext and no <see cref="ISiteCallAuditRepository"/>, so
/// it cannot service the dual-write. Logs a warning and replies with an
/// empty ack so callers fall through to their retry path.
/// </summary>
private Task OnCachedTelemetryWithoutDualWriteAsync(IngestCachedTelemetryCommand cmd)
{
_logger.LogWarning(
"AuditLogIngestActor received IngestCachedTelemetryCommand on the single-repository ctor; dual-write requires the IServiceProvider ctor. Replying with empty ack ({Count} entries).",
cmd.Entries.Count);
Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
return Task.CompletedTask;
}
}
@@ -0,0 +1,37 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Tuning knobs for the central
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service (M6-T5).
/// Defaults: once every 24 hours, keep at least one future monthly
/// boundary ahead of <see cref="DateTime.UtcNow"/>.
/// </summary>
/// <remarks>
/// <para>
/// The hosted service drives a daily roll-forward of
/// <c>pf_AuditLog_Month</c>: each tick reads the current max boundary and
/// SPLITs new monthly boundaries until at least
/// <see cref="LookaheadMonths"/> future months are covered. The 1-month
/// default is intentionally conservative — anything less risks an
/// end-of-month race where inserts land in the unbounded tail partition;
/// anything more wastes nothing but represents premature commitment.
/// </para>
/// <para>
/// The 24-hour cadence is the cheapest interval that still guarantees
/// at-most-one missed boundary in steady state (even a hard failover the
/// hosted service can recover on its very next tick). Lowering this below
/// an hour would generate more metadata churn than it saves.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceOptions
{
/// <summary>Period of the maintenance tick in seconds (default 86 400 = 24 h).</summary>
public int IntervalSeconds { get; set; } = 86_400;
/// <summary>
/// Minimum number of future months that <c>pf_AuditLog_Month</c> must
/// cover after each tick. Default 1 — i.e. as of mid-May the partition
/// for the next full month (June) must already be present.
/// </summary>
public int LookaheadMonths { get; set; } = 1;
}
@@ -0,0 +1,151 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
/// <c>pf_AuditLog_Month</c> forward once a day. Each tick opens a fresh DI
/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> to SPLIT any
/// missing future boundaries — the partition function must always cover at
/// least <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>
/// future months, otherwise inserts past the highest boundary accumulate in
/// a single unbounded tail partition that <c>SwitchOutPartitionAsync</c>
/// cannot purge cleanly.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not an actor.</b> Bundle C's
/// <see cref="AuditLogPurgeActor"/> sits inside the central singleton
/// because it needs supervised lifecycle alongside the rest of the
/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day
/// chore with no cross-actor coordination, so we use the much simpler
/// hosted-service pattern: <c>Task.Run</c> on start, <c>Task.Delay</c>
/// between ticks, cancellation on stop. Reusing
/// <see cref="IPartitionMaintenance"/> from the central node-only DI graph
/// keeps the contract testable without any actor framework involvement.
/// </para>
/// <para>
/// <b>Failure containment.</b> The tick body wraps the maintenance call in
/// a try/catch so a transient SQL Server error never tears down the hosted
/// service — the next tick simply retries. The exception is logged with
/// the original stack trace at <c>Error</c> level; ops surfaces (M6 Bundle
/// E's central health collector) can subscribe to the logger to alert on
/// repeated failures.
/// </para>
/// <para>
/// <b>Startup ordering.</b> A first tick fires immediately at
/// <see cref="StartAsync"/> so a fresh deployment doesn't need to wait
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> for
/// the partition function to come up to spec. This is also what the brief
/// asks for ("Run once on startup").
/// </para>
/// <para>
/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is scoped
/// (alongside the rest of the EF repositories) because the implementation
/// reuses the per-scope <c>ScadaBridgeDbContext</c>. A hosted service is a
/// singleton, so it must open and dispose a scope around each tick — the
/// same pattern <see cref="AuditLogPurgeActor"/> uses.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
{
private readonly IServiceScopeFactory _scopeFactory;
private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;
private CancellationTokenSource? _cts;
private Task? _loop;
/// <summary>
/// Initializes the maintenance service with its required dependencies.
/// </summary>
/// <param name="scopeFactory">Scope factory used to open DI scopes for each maintenance run.</param>
/// <param name="options">Partition maintenance options (retention period, purge interval, etc.).</param>
/// <param name="logger">Logger for this service.</param>
public AuditLogPartitionMaintenanceService(
IServiceScopeFactory scopeFactory,
IOptions<AuditLogPartitionMaintenanceOptions> options,
ILogger<AuditLogPartitionMaintenanceService> logger)
{
_scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
public Task StartAsync(CancellationToken ct)
{
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
// token both terminate the loop; either side firing aborts the
// pending Task.Delay.
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
return Task.CompletedTask;
}
private async Task RunLoopAsync(CancellationToken ct)
{
// Run once on startup so a fresh deployment isn't gated on the
// IntervalSeconds initial wait — the brief calls this out explicitly.
await SafeMaintainAsync(ct).ConfigureAwait(false);
while (!ct.IsCancellationRequested)
{
try
{
await Task.Delay(TimeSpan.FromSeconds(_options.Value.IntervalSeconds), ct)
.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
break;
}
await SafeMaintainAsync(ct).ConfigureAwait(false);
}
}
private async Task SafeMaintainAsync(CancellationToken ct)
{
try
{
await using var scope = _scopeFactory.CreateAsyncScope();
var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
var added = await maintenance
.EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
.ConfigureAwait(false);
if (added.Count > 0)
{
_logger.LogInformation(
"AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
added.Count,
string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd"))));
}
}
catch (Exception ex)
{
// Catch-all is deliberate: the hosted service must survive every
// class of tick failure (transient SQL, DI resolution, etc.) so
// the next tick gets a chance. The brief's contract is
// "exception logged, not propagated".
_logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
}
}
/// <inheritdoc />
public Task StopAsync(CancellationToken ct)
{
_cts?.Cancel();
return _loop ?? Task.CompletedTask;
}
/// <inheritdoc />
public void Dispose()
{
_cts?.Dispose();
}
}
@@ -0,0 +1,213 @@
using System.Diagnostics;
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central singleton (M6 Bundle C) that drives the daily AuditLog partition
/// purge. On a configurable timer (default 24 hours) the actor:
/// <list type="number">
/// <item>Queries <see cref="IAuditLogRepository.GetPartitionBoundariesOlderThanAsync"/>
/// for monthly boundaries whose latest <c>OccurredAtUtc</c> is older
/// than <c>DateTime.UtcNow - RetentionDays</c>.</item>
/// <item>For each eligible boundary, calls
/// <see cref="IAuditLogRepository.SwitchOutPartitionAsync"/> which runs
/// the drop-and-rebuild dance around <c>UX_AuditLog_EventId</c>.</item>
/// <item>Publishes <see cref="AuditLogPurgedEvent"/> on the actor-system
/// EventStream so the Bundle E central health collector + ops surfaces
/// can subscribe without coupling to this actor.</item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// <b>Daily cadence.</b> Partition switch is metadata-only but the
/// drop-and-rebuild dance briefly removes <c>UX_AuditLog_EventId</c>; running
/// more often than necessary trades unique-index rebuild outages for
/// negligible freshness wins. The default 24-hour interval matches
/// alog.md §10's retention policy.
/// </para>
/// <para>
/// <b>Continue-on-error.</b> A single boundary that throws (transient SQL
/// failure, contention with backup, missing object) must NOT prevent the
/// other eligible boundaries from being purged on the same tick. Per-boundary
/// work runs inside its own try/catch — that per-boundary catch is what
/// keeps the singleton alive across handler throws. The
/// <see cref="SupervisorStrategy"/> override returns the Akka default
/// decider (Restart) and governs children only; this actor has no children
/// today, so the override is a forward-compat placeholder.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
/// service registered by <c>AddConfigurationDatabase</c>. The singleton
/// opens one DI scope per tick and reuses the same repository across every
/// boundary in that tick — mirrors the
/// <see cref="SiteAuditReconciliationActor"/> pattern.
/// </para>
/// <para>
/// <b>EventStream.</b> Publishing <see cref="AuditLogPurgedEvent"/> through
/// the EventStream rather than direct messaging avoids coupling this actor
/// to its consumers; M6 Bundle E will subscribe a central health-counter
/// bridge that surfaces purge progress on the central health report.
/// </para>
/// </remarks>
public class AuditLogPurgeActor : ReceiveActor
{
private readonly IServiceProvider _services;
private readonly AuditLogPurgeOptions _purgeOptions;
private readonly AuditLogOptions _auditOptions;
private readonly ILogger<AuditLogPurgeActor> _logger;
private ICancelable? _timer;
/// <summary>Initializes a new instance of <see cref="AuditLogPurgeActor"/> and registers the tick handler.</summary>
/// <param name="services">DI service provider used to create scoped repository instances per tick.</param>
/// <param name="purgeOptions">Options controlling the purge interval.</param>
/// <param name="auditOptions">Options controlling retention policy (RetentionDays).</param>
/// <param name="logger">Logger instance.</param>
public AuditLogPurgeActor(
IServiceProvider services,
IOptions<AuditLogPurgeOptions> purgeOptions,
IOptions<AuditLogOptions> auditOptions,
ILogger<AuditLogPurgeActor> logger)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(purgeOptions);
ArgumentNullException.ThrowIfNull(auditOptions);
ArgumentNullException.ThrowIfNull(logger);
_services = services;
_purgeOptions = purgeOptions.Value;
_auditOptions = auditOptions.Value;
_logger = logger;
ReceiveAsync<PurgeTick>(_ => OnTickAsync());
}
/// <inheritdoc />
protected override void PreStart()
{
base.PreStart();
var interval = _purgeOptions.Interval;
_timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
initialDelay: interval,
interval: interval,
receiver: Self,
message: PurgeTick.Instance,
sender: Self);
}
/// <inheritdoc />
protected override void PostStop()
{
_timer?.Cancel();
base.PostStop();
}
/// <inheritdoc />
protected override SupervisorStrategy SupervisorStrategy()
{
return new OneForOneStrategy(
maxNrOfRetries: 0,
withinTimeRange: TimeSpan.Zero,
decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
}
private async Task OnTickAsync()
{
// Capture EventStream BEFORE the first await. Accessing Context (and
// therefore Context.System) after an await is unsafe because Akka's
// ActorBase.Context throws "no active ActorContext" once the
// continuation runs on a thread that isn't currently dispatching this
// actor — mirrors the same Sender-capture pattern in
// AuditLogIngestActor.OnIngestAsync.
var eventStream = Context.System.EventStream;
// Compute the retention threshold from AuditLogOptions.RetentionDays
// each tick — the options class supports hot reload via
// IOptionsMonitor for the redaction policy and similar settings; we
// read the snapshot per-tick so an operator who lowers RetentionDays
// sees the change applied on the next purge without an actor
// restart.
var threshold = DateTime.UtcNow - TimeSpan.FromDays(_auditOptions.RetentionDays);
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
// services (IAsyncDisposable DbContexts) dispose asynchronously
// without blocking on sync Dispose() of pending connection cleanup.
await using var scope = _services.CreateAsyncScope();
IAuditLogRepository repository;
try
{
repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick.");
return;
}
IReadOnlyList<DateTime> boundaries;
try
{
boundaries = await repository
.GetPartitionBoundariesOlderThanAsync(threshold)
.ConfigureAwait(false);
}
catch (Exception ex)
{
_logger.LogError(
ex,
"Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.",
threshold);
return;
}
if (boundaries.Count == 0)
{
return;
}
foreach (var boundary in boundaries)
{
// Per-boundary try/catch: one bad partition (transient SQL
// failure, missing object, contention with backup) does NOT
// abandon the rest of the tick.
var sw = Stopwatch.StartNew();
try
{
var rowsDeleted = await repository
.SwitchOutPartitionAsync(boundary)
.ConfigureAwait(false);
sw.Stop();
eventStream.Publish(
new AuditLogPurgedEvent(boundary, rowsDeleted, sw.ElapsedMilliseconds));
_logger.LogInformation(
"Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.",
boundary,
rowsDeleted,
sw.ElapsedMilliseconds);
}
catch (Exception ex)
{
sw.Stop();
_logger.LogError(
ex,
"Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.",
boundary,
sw.ElapsedMilliseconds);
}
}
}
/// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
internal sealed class PurgeTick
{
public static readonly PurgeTick Instance = new();
private PurgeTick() { }
}
}
@@ -0,0 +1,43 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Tuning knobs for the central <see cref="AuditLogPurgeActor"/> singleton.
/// Default cadence is 24 hours per the M6 plan; the retention window itself
/// is sourced from <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.RetentionDays"/>
/// (default 365) so operators tune retention from a single section.
/// </summary>
/// <remarks>
/// <para>
/// The purge actor is a daily-cadence singleton, not a hot-loop, because
/// partition-switch I/O is metadata-only but the drop-and-rebuild dance
/// briefly removes the <c>UX_AuditLog_EventId</c> unique index — running
/// more often than necessary trades index-rebuild outages for marginal
/// freshness gains. Lower this only when an operator can prove they need
/// sub-daily purge granularity.
/// </para>
/// <para>
/// <see cref="IntervalOverride"/> exists for tests to drop the cadence to
/// milliseconds without polluting the production config surface; production
/// binds <see cref="IntervalHours"/> only.
/// </para>
/// </remarks>
public sealed class AuditLogPurgeOptions
{
/// <summary>Period of the purge tick in hours (default 24).</summary>
public int IntervalHours { get; set; } = 24;
/// <summary>
/// Test-only override for finer control over the tick cadence than
/// whole-hour resolution allows. When non-null, takes precedence over
/// <see cref="IntervalHours"/>. Not bound from config — production
/// config exposes <see cref="IntervalHours"/> only.
/// </summary>
public TimeSpan? IntervalOverride { get; set; }
/// <summary>
/// Resolves the effective tick interval, honouring the test override
/// when set. Falls back to <see cref="IntervalHours"/>.
/// </summary>
public TimeSpan Interval =>
IntervalOverride ?? TimeSpan.FromHours(IntervalHours);
}
@@ -0,0 +1,29 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Published on the actor-system EventStream by <see cref="AuditLogPurgeActor"/>
/// after each successful partition switch-out. Downstream consumers (Bundle E
/// central health collector, ops dashboards, audit trails) subscribe so a
/// purge action is observable without the actor needing to know about any
/// specific subscriber.
/// </summary>
/// <param name="MonthBoundary">
/// The pf_AuditLog_Month lower-bound boundary that was switched out — i.e.
/// the first instant of the purged month in UTC.
/// </param>
/// <param name="RowsDeleted">
/// Approximate row count purged from the partition, sampled BEFORE the
/// switch. Exact accounting would require a post-switch scan of the staging
/// table, which the dance drops immediately, so this is the closest
/// observable proxy. Zero is a valid value when the actor's enumerator
/// included a partition the operator subsequently emptied by hand.
/// </param>
/// <param name="DurationMs">
/// Wall-clock time spent inside <c>SwitchOutPartitionAsync</c> for this
/// boundary, in milliseconds. Useful for spotting the rare slow purge
/// without spinning up dedicated telemetry.
/// </param>
public sealed record AuditLogPurgedEvent(
DateTime MonthBoundary,
long RowsDeleted,
long DurationMs);
@@ -0,0 +1,61 @@
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T9) — bridges
/// <see cref="IAuditRedactionFailureCounter"/> (incremented by
/// <see cref="DefaultAuditPayloadFilter"/> every time a header / body / SQL
/// parameter redactor stage throws and the filter has to over-redact the
/// offending field) into <see cref="AuditCentralHealthSnapshot"/> so the
/// failure surfaces on the central health surface as
/// <c>AuditCentralHealthSnapshot.AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Site vs central.</b> M5 Bundle C wired the SITE-side bridge
/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>),
/// which routes increments into the site health report payload's
/// <c>AuditRedactionFailure</c> field. That handles redactor failures on the
/// site SQLite hot-path (FallbackAuditWriter). M6 Bundle E (T9) adds the
/// MIRROR bridge here so the same payload filter — when it runs on the
/// central <see cref="CentralAuditWriter"/> /
/// <see cref="AuditLogIngestActor"/> paths — surfaces its failures on the
/// central dashboard rather than disappearing into a NoOp.
/// </para>
/// <para>
/// <b>Registration shape.</b> Site composition roots call
/// <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>,
/// which overrides the binding with the site bridge. Central composition
/// roots call <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>,
/// which overrides with this central bridge. A node never wears both hats —
/// site and central are distinct host roles — so the two bridges never
/// fight over the same binding at runtime.
/// </para>
/// <para>
/// <b>Why not a thin wrapper around the snapshot directly?</b> The snapshot
/// itself <i>could</i> be the bound implementation (it already implements
/// <see cref="IAuditRedactionFailureCounter"/>), but a dedicated class makes
/// the central-vs-site asymmetry explicit at the DI boundary — readers of
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>
/// see "site → site bridge, central → central bridge", matching the
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>
/// shape one-for-one.
/// </para>
/// </remarks>
public sealed class CentralAuditRedactionFailureCounter : IAuditRedactionFailureCounter
{
private readonly AuditCentralHealthSnapshot _snapshot;
/// <summary>
/// Initializes a new <see cref="CentralAuditRedactionFailureCounter"/> backed by the supplied snapshot.
/// </summary>
/// <param name="snapshot">The central health snapshot that accumulates the redaction failure count.</param>
public CentralAuditRedactionFailureCounter(AuditCentralHealthSnapshot snapshot)
{
_snapshot = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
}
/// <inheritdoc/>
public void Increment() => ((IAuditRedactionFailureCounter)_snapshot).Increment();
}
@@ -0,0 +1,159 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central-only direct-write implementation of <see cref="ICentralAuditWriter"/>.
/// Wraps <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> as a best-effort
/// audit emission path for components that originate audit events ON the central
/// node (Notification Outbox dispatch, Inbound API) — NOT for site telemetry
/// ingest (that path is the SiteAudit → AuditLogIngestActor batched flow).
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort contract.</b> Audit-write failures NEVER abort the user-facing
/// action (alog.md §13). The writer catches every exception thrown by repository
/// resolution or the insert call, logs at warning, and returns successfully.
/// Callers may still wrap the call in their own try/catch (defensive — the writer
/// is supposed to swallow).
/// </para>
/// <para>
/// <b>Scope-per-call resolution.</b> <see cref="IAuditLogRepository"/> is a SCOPED
/// EF Core service (registered by <c>ZB.MOM.WW.ScadaBridge.ConfigurationDatabase</c>). The
/// writer itself is registered as a singleton (so all callers share one instance),
/// so it cannot hold a scope across calls — it opens a fresh
/// <see cref="IServiceScope"/> per <see cref="WriteAsync"/> invocation, mirroring
/// the per-message scope pattern used by <c>AuditLogIngestActor</c> and
/// <c>NotificationOutboxActor</c>.
/// </para>
/// <para>
/// <b>Idempotency.</b> Persistence is via <c>InsertIfNotExistsAsync</c>, so a
/// double-emitted event (same <see cref="AuditEvent.EventId"/>) is a silent
/// no-op — the writer is safe to call from any number of dispatch paths.
/// </para>
/// </remarks>
public sealed class CentralAuditWriter : ICentralAuditWriter
{
private readonly IServiceProvider _services;
private readonly ILogger<CentralAuditWriter> _logger;
private readonly IAuditPayloadFilter _filter;
private readonly ICentralAuditWriteFailureCounter _failureCounter;
private readonly INodeIdentityProvider? _nodeIdentity;
/// <summary>
/// Bundle C (M5-T6) — the central direct-write path used by the
/// NotificationOutboxActor dispatch and the Inbound API middleware also
/// needs to truncate + redact before the row hits MS SQL. The filter is
/// optional so the M4 test composition roots that don't pass one keep
/// working (they only ever write small payloads); production DI registers
/// the real filter via <see cref="ServiceCollectionExtensions.AddAuditLog"/>.
/// M6 Bundle E (T8) — adds the optional
/// <see cref="ICentralAuditWriteFailureCounter"/> so a swallowed repository
/// throw bumps the central health surface's
/// <c>CentralAuditWriteFailures</c> counter. Defaults to a NoOp so test
/// composition roots that don't wire the counter keep their current
/// behaviour. SourceNode-stamping (Task 12) — adds the optional
/// <see cref="INodeIdentityProvider"/> so central-origin rows (Notification
/// Outbox dispatch, Inbound API) carry the writing central node's
/// identifier when the caller hasn't already supplied one. Optional /
/// defaulting-to-null so M4 test composition roots that don't pass a
/// provider keep working — the caller-wins discipline means an absent
/// provider simply leaves SourceNode at whatever the caller set (often
/// null, which is the legacy behaviour).
/// </summary>
/// <param name="services">Service provider used to open a per-call scope for the scoped repository.</param>
/// <param name="logger">Logger for swallowed write-failure diagnostics.</param>
/// <param name="filter">Optional payload filter for truncation and redaction; defaults to a pass-through.</param>
/// <param name="failureCounter">Optional counter incremented on swallowed repository failures; defaults to a no-op.</param>
/// <param name="nodeIdentity">Optional node identity provider for stamping <c>SourceNode</c> on central-origin rows.</param>
public CentralAuditWriter(
IServiceProvider services,
ILogger<CentralAuditWriter> logger,
IAuditPayloadFilter? filter = null,
ICentralAuditWriteFailureCounter? failureCounter = null,
INodeIdentityProvider? nodeIdentity = null)
{
_services = services ?? throw new ArgumentNullException(nameof(services));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
// AuditLog-008: never default to null — over-redact instead.
// SafeDefaultAuditPayloadFilter applies HTTP header redaction with
// hard-coded sensitive defaults so a composition root that omits the
// real filter still scrubs Authorization / X-Api-Key / Cookie /
// Set-Cookie before persistence.
_filter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
_failureCounter = failureCounter ?? new NoOpCentralAuditWriteFailureCounter();
_nodeIdentity = nodeIdentity;
}
/// <inheritdoc />
public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
{
if (evt is null)
{
// Defensive — a null event is a programming bug at the caller and
// produces no meaningful audit row. Log and return.
_logger.LogWarning("CentralAuditWriter.WriteAsync received null event; ignoring.");
return;
}
try
{
// Filter BEFORE stamping IngestedAtUtc + handing to the repo. The
// filter contract is "never throws". AuditLog-008: _filter is now
// non-null (SafeDefaultAuditPayloadFilter fallback) so header
// redaction always runs even in composition roots that omit the
// real filter.
var filtered = _filter.Apply(evt);
// SourceNode-stamping (Task 12): caller-provided value wins
// (supports any future direct-write callsite that already has its
// own node id); otherwise stamp from the local
// INodeIdentityProvider, when one is wired. Production DI on
// central nodes always supplies the provider; legacy test
// composition roots that don't pass it leave SourceNode at
// whatever the caller set (often null), preserving back-compat.
if (filtered.SourceNode is null && _nodeIdentity?.NodeName is { } nodeName)
{
filtered = filtered with { SourceNode = nodeName };
}
await using var scope = _services.CreateAsyncScope();
var repo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var stamped = filtered with { IngestedAtUtc = DateTime.UtcNow };
await repo.InsertIfNotExistsAsync(stamped, ct).ConfigureAwait(false);
}
catch (Exception ex)
{
// Audit failure NEVER aborts the user-facing action — swallow and log.
// M6 Bundle E (T8): also surface the failure on the central health
// counter so a sustained audit-write outage is visible on the
// health dashboard rather than disappearing into the log file.
try
{
_failureCounter.Increment();
}
catch
{
// Counter must NEVER throw — defence in depth. Even if a
// misbehaving custom counter does, swallowing here keeps the
// best-effort contract intact.
}
// Log the input event's identifying fields. These three (EventId,
// Kind, Status) are immutable across the filter+stamp chain — the
// `with` clones above touch only SourceNode and IngestedAtUtc — so
// referencing `evt` here is intentional and equivalent to the
// stamped record for diagnostics. If you add a field here that the
// stamp chain DOES mutate (e.g., SourceNode), reference the latest
// post-stamp record name instead, not `evt`.
_logger.LogWarning(
ex,
"CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})",
evt.EventId, evt.Kind, evt.Status);
}
}
}
@@ -0,0 +1,62 @@
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E read-side surface exposing the central-side
/// audit-health counters: <see cref="CentralAuditWriteFailures"/> (every
/// repository insert throw from <see cref="CentralAuditWriter"/> /
/// <see cref="AuditLogIngestActor"/>), <see cref="AuditRedactionFailure"/>
/// (every payload-filter redactor throw on the central path), and
/// <see cref="SiteAuditTelemetryStalled"/> (per-site latched state from the
/// <see cref="SiteAuditTelemetryStalledTracker"/>).
/// </summary>
/// <remarks>
/// <para>
/// <b>Read-only contract.</b> Implementations expose a point-in-time snapshot
/// — increments and tracker updates happen through the dedicated counter /
/// tracker interfaces, not through this surface. Consumers (M7+ central
/// health pages) read these properties; they never mutate.
/// </para>
/// <para>
/// <b>Why a parallel surface from <see cref="ICentralHealthAggregator"/>.</b>
/// <see cref="ICentralHealthAggregator"/> aggregates per-site
/// <c>SiteHealthState</c> reports the SITE emits. The central audit-write
/// failure / redaction-failure counters originate ON central (no site report
/// carries them), so they live on a dedicated snapshot rather than being
/// retro-fitted into a per-site state. The two surfaces will be composed at
/// the M7 dashboard layer.
/// </para>
/// </remarks>
public interface IAuditCentralHealthSnapshot
{
/// <summary>
/// Count of central-side audit-write failures since process start.
/// Incremented by every <see cref="CentralAuditWriter"/> /
/// <see cref="AuditLogIngestActor"/> repository insert that throws.
/// </summary>
int CentralAuditWriteFailures { get; }
/// <summary>
/// Count of central-side payload-filter redactor over-redactions since
/// process start. Incremented by every header / body / SQL-parameter
/// redactor stage that throws (the filter falls back to the
/// <c>&lt;redacted: redactor error&gt;</c> marker and never aborts the
/// user-facing action). Sites have their own counter
/// (<see cref="IAuditRedactionFailureCounter"/>-backed
/// <c>SiteHealthReport.AuditRedactionFailure</c>) and the central
/// composition root's binding routes ALL central redactor throws
/// (CentralAuditWriter + AuditLogIngestActor paths) into this counter.
/// </summary>
int AuditRedactionFailure { get; }
/// <summary>
/// Per-site latched stalled state: <c>true</c> when the
/// <see cref="SiteAuditReconciliationActor"/> has observed two
/// consecutive non-draining cycles for that site, <c>false</c> after the
/// first draining cycle. Sites absent from the map are interpreted as
/// healthy (<c>Stalled=false</c> default). Snapshot is a defensive
/// copy — readers must not mutate.
/// </summary>
IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled { get; }
}
@@ -0,0 +1,23 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T8) counter sink invoked by central-side audit
/// writers (<see cref="CentralAuditWriter"/>, <see cref="AuditLogIngestActor"/>)
/// every time a repository <c>InsertIfNotExistsAsync</c> throws. Mirrors the
/// site-side <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.IAuditWriteFailureCounter"/>
/// shape one-for-one — same one-method contract, same NoOp default, same
/// must-never-abort-the-user-facing-action invariant.
/// </summary>
/// <remarks>
/// Audit-write failures NEVER abort the user-facing action (alog.md §13) —
/// the writer swallows the exception and surfaces the failure via this counter
/// instead. A NoOp default is the correct safe fallback while the central
/// health surface is being wired in; <see cref="AuditCentralHealthSnapshot"/>
/// is the production binding that routes increments into the aggregated
/// central health snapshot consumed by future M7+ pages.
/// </remarks>
public interface ICentralAuditWriteFailureCounter
{
/// <summary>Increment the central audit-write failure counter by one.</summary>
void Increment();
}
@@ -0,0 +1,49 @@
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Mockable abstraction over the central-side <c>PullAuditEvents</c> gRPC
/// client surface that <see cref="SiteAuditReconciliationActor"/> uses to
/// fetch the next reconciliation batch from a specific site. Extracted so the
/// actor can be unit-tested against an in-memory stub without standing up a
/// real <c>GrpcChannel</c> per site.
/// </summary>
/// <remarks>
/// <para>
/// The production implementation (host wiring task) wraps the auto-generated
/// <c>SiteStreamService.SiteStreamServiceClient</c>, multiplexing one
/// <c>GrpcChannel</c> per site keyed on
/// <see cref="SiteEntry.GrpcEndpoint"/>. Until that wiring lands the DI
/// composition root binds a NoOp default that returns an empty response — the
/// reconciliation tick is still scheduled and the cursor logic still runs, so
/// regressions in the actor itself are caught even before the real client
/// arrives.
/// </para>
/// <para>
/// Implementations MUST NOT throw on transport faults that the actor can
/// tolerate (connection refused, deadline exceeded). The actor's contract is
/// "one site's failure doesn't sink the rest of the tick"; an exception still
/// won't crash the actor (the per-site try/catch catches it), but returning
/// an empty response on a known-recoverable error keeps the logs cleaner.
/// </para>
/// </remarks>
public interface IPullAuditEventsClient
{
/// <summary>
/// Issues a <c>PullAuditEvents</c> RPC against the site whose endpoint
/// is registered against <paramref name="siteId"/>. Returns the next
/// batch of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
/// rows ordered oldest-first AND a <c>MoreAvailable</c> flag the actor
/// uses to decide whether to fire another pull immediately.
/// </summary>
/// <param name="siteId">The identifier of the site to pull audit events from.</param>
/// <param name="sinceUtc">Only events with an <c>OccurredAtUtc</c> at or after this cursor time are returned.</param>
/// <param name="batchSize">Maximum number of events to return per call.</param>
/// <param name="ct">Cancellation token.</param>
Task<PullAuditEventsResponse> PullAsync(
string siteId,
DateTime sinceUtc,
int batchSize,
CancellationToken ct);
}
@@ -0,0 +1,35 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Enumeration surface consumed by <see cref="SiteAuditReconciliationActor"/> to
/// discover which sites to poll on each reconciliation tick. Extracted so the
/// actor can be unit-tested against a static list without depending on the
/// production <c>ISiteRepository</c> + EF Core DbContext.
/// </summary>
/// <remarks>
/// The production implementation wraps <c>ISiteRepository.GetAllSitesAsync</c>
/// and projects each <c>Site</c> to a <see cref="SiteEntry"/> using the
/// site's configured <c>GrpcNodeAAddress</c> (falling back to
/// <c>GrpcNodeBAddress</c> when NodeA is unset). Sites with NO gRPC address
/// configured are silently skipped — the reconciliation pull cannot reach
/// them, but absence of an address is a configuration decision, not a runtime
/// error.
/// </remarks>
public interface ISiteEnumerator
{
/// <summary>
/// Returns the current set of sites the reconciliation puller should visit
/// on the next tick. Implementations should reflect adds/removes promptly
/// — the actor calls this once per tick.
/// </summary>
/// <param name="ct">Cancellation token for the async enumeration.</param>
Task<IReadOnlyList<SiteEntry>> EnumerateAsync(CancellationToken ct = default);
}
/// <summary>
/// One reconciliation target: the site identifier the actor uses as the
/// cursor key and the gRPC endpoint <see cref="IPullAuditEventsClient"/> dials
/// to issue the pull. Endpoint is the bare authority (e.g. <c>http://siteA:8083</c>);
/// transport selection (TLS, keepalive, etc.) is the client's concern.
/// </summary>
public sealed record SiteEntry(string SiteId, string GrpcEndpoint);
@@ -0,0 +1,17 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Default <see cref="ICentralAuditWriteFailureCounter"/> binding used when
/// the central health surface (<see cref="AuditCentralHealthSnapshot"/>) has
/// not been wired (test composition roots, site-only hosts that incidentally
/// resolve a <see cref="CentralAuditWriter"/>). Drops every increment on the
/// floor. Mirrors <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.NoOpAuditWriteFailureCounter"/>.
/// </summary>
public sealed class NoOpCentralAuditWriteFailureCounter : ICentralAuditWriteFailureCounter
{
/// <inheritdoc/>
public void Increment()
{
// intentional no-op
}
}
@@ -0,0 +1,387 @@
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation
/// pull loop. On a configurable timer (default 5 minutes) the actor walks every
/// known site, asks the site for any <see cref="AuditEvent"/> rows with
/// <see cref="AuditEvent.OccurredAtUtc"/> &gt;= the site's last reconciled
/// cursor, ingests them idempotently into the central
/// <see cref="IAuditLogRepository"/>, and advances the cursor.
/// </summary>
/// <remarks>
/// <para>
/// <b>Self-healing telemetry, not a dispatcher.</b> The push path
/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor"/> +
/// <c>IngestAuditEvents</c>) is the primary mechanism. This actor exists so a
/// missed push (gRPC blip, central restart, site offline) is eventually
/// repaired by central re-pulling whatever the site still has in
/// <c>Pending</c>/<c>Forwarded</c> state. Idempotency on
/// <see cref="AuditEvent.EventId"/> (M2 Bundle A's race-fix) makes duplicate
/// arrivals from both paths a silent no-op.
/// </para>
/// <para>
/// <b>Cursor lifetime.</b> The per-site <c>LastReconciledAt</c> watermark is
/// kept in-memory for the actor's lifetime. The cluster singleton normally
/// survives the host process; on a deliberate failover OR a singleton restart
/// the cursors reset to <see cref="DateTime.MinValue"/>. That is conservative
/// but correct — the next tick simply asks for everything the site still has,
/// and idempotent ingest swallows the dupes. Persisting cursors to MS SQL was
/// considered and rejected for M6: the cost of a write per tick outweighs the
/// rare benefit of avoiding one over-broad pull after a restart.
/// </para>
/// <para>
/// <b>Stalled detection.</b> The brief calls a site "stalled" when two
/// consecutive pull cycles BOTH return non-empty AND <c>MoreAvailable=true</c>
/// — i.e. the backlog isn't draining. The actor publishes
/// <see cref="SiteAuditTelemetryStalledChanged"/> on the actor system's
/// EventStream so a future <c>ICentralHealthCollector</c> bridge (M6 Bundle E)
/// can flip the health metric without coupling this actor to the health
/// collection surface today.
/// </para>
/// <para>
/// <b>Failure isolation.</b> A single site that throws (DNS, transport,
/// repository write) must NOT prevent other sites from being polled on the
/// same tick. The per-site work runs inside its own try/catch — that
/// per-site catch is what keeps the actor running across handler throws.
/// The <see cref="SupervisorStrategy"/> override returns
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart
/// semantics) and governs children only; this actor has no children today,
/// so the override is a forward-compat placeholder. If it ever did fire,
/// restart would reset the in-memory cursors — but as noted above that's
/// a safe (over-pull, idempotent) recovery.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
/// service registered by <c>AddConfigurationDatabase</c>. The singleton actor
/// opens one DI scope per tick and reuses the same repository across all
/// sites in that tick — one DbContext per tick mirrors the
/// <c>AuditLogIngestActor</c> + <c>NotificationOutboxActor</c> pattern.
/// </para>
/// </remarks>
public class SiteAuditReconciliationActor : ReceiveActor
{
private readonly ISiteEnumerator _sites;
private readonly IPullAuditEventsClient _client;
private readonly IServiceProvider _services;
private readonly SiteAuditReconciliationOptions _options;
private readonly ILogger<SiteAuditReconciliationActor> _logger;
/// <summary>
/// Per-site reconciliation watermark — the highest
/// <see cref="AuditEvent.OccurredAtUtc"/> seen for that site on a previous
/// tick. Asking for <c>OccurredAtUtc &gt;= cursor</c> rather than &gt;
/// is the site contract (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ISiteAuditQueue.ReadPendingSinceAsync"/>);
/// duplicate-with-same-timestamp rows are filtered out by the idempotent
/// repository write.
/// </summary>
private readonly Dictionary<string, DateTime> _cursors = new();
/// <summary>
/// Per-site count of consecutive non-draining cycles. Resets to zero on the
/// first draining (or empty) cycle.
/// </summary>
private readonly Dictionary<string, int> _nonDrainingCycles = new();
/// <summary>
/// Per-site latched stalled state — used so the actor only publishes a
/// <see cref="SiteAuditTelemetryStalledChanged"/> transition when the
/// stalled flag actually changes, not on every tick while stalled.
/// </summary>
private readonly Dictionary<string, bool> _stalled = new();
/// <summary>
/// AuditLog-004: per-EventId retry counter for rows whose central insert
/// threw. While a row keeps failing AND is below
/// <see cref="MaxPermanentInsertAttempts"/>, the cursor is held back so the
/// next reconciliation tick re-pulls and retries the row. Crossing the
/// threshold logs Critical and permanently abandons the row (cursor
/// advances past it) so a truly broken row cannot block all subsequent
/// progress for a site. The counter is in-memory only — singleton restart
/// resets it, which is safe because the cursor also resets on restart and
/// the next tick re-pulls everything.
/// </summary>
private readonly Dictionary<Guid, int> _failedInsertAttempts = new();
/// <summary>
/// AuditLog-004: number of consecutive central-insert failures before a row
/// is permanently abandoned with a Critical log entry and the cursor is
/// allowed to advance past it. Five attempts at the 5-minute default tick
/// is ~25 min of retry budget before a stuck row stops blocking progress.
/// </summary>
private const int MaxPermanentInsertAttempts = 5;
private ICancelable? _timer;
/// <summary>
/// Initializes the reconciliation actor with its dependencies and registers the tick handler.
/// </summary>
/// <param name="sites">Enumerates the known sites to reconcile.</param>
/// <param name="client">Client used to pull audit events from individual sites.</param>
/// <param name="services">Root service provider for opening a per-tick DI scope.</param>
/// <param name="options">Reconciliation configuration (interval, page size).</param>
/// <param name="logger">Logger for reconciliation diagnostics.</param>
public SiteAuditReconciliationActor(
ISiteEnumerator sites,
IPullAuditEventsClient client,
IServiceProvider services,
IOptions<SiteAuditReconciliationOptions> options,
ILogger<SiteAuditReconciliationActor> logger)
{
ArgumentNullException.ThrowIfNull(sites);
ArgumentNullException.ThrowIfNull(client);
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(options);
ArgumentNullException.ThrowIfNull(logger);
_sites = sites;
_client = client;
_services = services;
_options = options.Value;
_logger = logger;
ReceiveAsync<ReconciliationTick>(_ => OnTickAsync());
}
/// <inheritdoc />
protected override void PreStart()
{
base.PreStart();
var interval = _options.ReconciliationInterval;
_timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
initialDelay: interval,
interval: interval,
receiver: Self,
message: ReconciliationTick.Instance,
sender: Self);
}
/// <inheritdoc />
protected override void PostStop()
{
_timer?.Cancel();
base.PostStop();
}
private async Task OnTickAsync()
{
// Capture EventStream BEFORE the first await. Accessing Context (and
// therefore Context.System) after an await is unsafe because Akka's
// ActorBase.Context throws "no active ActorContext" once the
// continuation runs on a thread that isn't currently dispatching this
// actor — mirrors the AuditLogPurgeActor.OnTickAsync fix and the
// AuditLogIngestActor.OnIngestAsync Sender-capture pattern.
var eventStream = Context.System.EventStream;
IReadOnlyList<SiteEntry> sites;
try
{
sites = await _sites.EnumerateAsync().ConfigureAwait(false);
}
catch (Exception ex)
{
_logger.LogError(ex, "Site enumeration failed; skipping reconciliation tick.");
return;
}
if (sites.Count == 0)
{
return;
}
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
// services (IAsyncDisposable DbContexts) dispose asynchronously
// without blocking on sync Dispose() of pending connection cleanup.
await using var scope = _services.CreateAsyncScope();
IAuditLogRepository repository;
try
{
repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to resolve IAuditLogRepository for reconciliation tick.");
return;
}
foreach (var site in sites)
{
try
{
await PullSiteAsync(site, repository, eventStream).ConfigureAwait(false);
}
catch (Exception ex)
{
// Catch-all per the failure-isolation invariant: one site's
// fault must not sink the rest of the tick. The cursor for
// the failing site is left at its previous value so the
// next tick retries the same window.
_logger.LogWarning(
ex,
"Reconciliation pull failed for site {SiteId}; other sites continue.",
site.SiteId);
}
}
}
/// <summary>
/// Issues one <c>PullAuditEvents</c> RPC against the site, ingests the
/// returned rows idempotently into the central repository, and advances
/// the cursor based on the maximum <see cref="AuditEvent.OccurredAtUtc"/>
/// observed. The brief's "saturate until backlog clears" intent is met by
/// the natural cadence — each tick issues one pull, and a backed-up site
/// drains across consecutive ticks. The stalled signal (two non-draining
/// ticks in a row) surfaces when that drain isn't keeping up.
/// </summary>
private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository, Akka.Event.EventStream eventStream)
{
var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
var response = await _client.PullAsync(
site.SiteId, since, _options.BatchSize, CancellationToken.None)
.ConfigureAwait(false);
var maxOccurred = since;
var hasUnresolvedFailure = false;
var nowUtc = DateTime.UtcNow;
foreach (var evt in response.Events)
{
var advanceForThisRow = false;
try
{
// Idempotent repository write: duplicate EventIds (from a
// concurrent push, or a retry of this very pull) collapse to
// a no-op courtesy of M2 Bundle A's race-fix on
// InsertIfNotExistsAsync.
var ingested = evt with { IngestedAtUtc = nowUtc };
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
_failedInsertAttempts.Remove(evt.EventId);
advanceForThisRow = true;
}
catch (Exception ex)
{
// AuditLog-004: per-row catch so one bad event does not abandon
// the rest of the batch. Track the failure count per EventId —
// below MaxPermanentInsertAttempts the cursor is HELD BACK so
// the next tick re-pulls and retries; at the threshold the row
// is permanently abandoned (LogCritical + cursor advances past)
// to keep a truly broken row from blocking all subsequent
// progress for the site.
var attempts = _failedInsertAttempts.GetValueOrDefault(evt.EventId) + 1;
_failedInsertAttempts[evt.EventId] = attempts;
if (attempts >= MaxPermanentInsertAttempts)
{
_logger.LogCritical(
ex,
"Permanently abandoning AuditEvent {EventId} from site {SiteId} after {Attempts} consecutive insert failures; cursor will advance past it.",
evt.EventId,
site.SiteId,
attempts);
_failedInsertAttempts.Remove(evt.EventId);
advanceForThisRow = true;
}
else
{
_logger.LogError(
ex,
"Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId} (attempt {Attempts}/{Max}); cursor held back for retry.",
evt.EventId,
site.SiteId,
attempts,
MaxPermanentInsertAttempts);
hasUnresolvedFailure = true;
}
}
if (advanceForThisRow && evt.OccurredAtUtc > maxOccurred)
{
maxOccurred = evt.OccurredAtUtc;
}
}
// AuditLog-004: only advance the persisted cursor if no event in this
// batch is still being retried. Leaving the cursor at `since` re-pulls
// the whole batch next tick — successful rows are no-ops thanks to
// InsertIfNotExistsAsync's idempotency, and the failing row gets
// another attempt. Once it succeeds (or hits the permanent-abandon
// threshold) the cursor unblocks naturally.
_cursors[site.SiteId] = hasUnresolvedFailure ? since : maxOccurred;
var nonDraining = response.MoreAvailable && response.Events.Count > 0;
UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream);
}
/// <summary>
/// Flips the per-site stalled flag based on whether this tick drained the
/// queue. A "draining" cycle is one where the server reported no more rows
/// available OR returned zero events. A "non-draining" cycle is the
/// inverse (events returned AND <c>MoreAvailable=true</c>).
/// </summary>
/// <remarks>
/// The state machine: counter increments on each consecutive non-draining
/// tick. On reaching <see cref="SiteAuditReconciliationOptions.StalledAfterNonDrainingCycles"/>
/// the actor latches <c>Stalled=true</c> and publishes the transition; on
/// any subsequent draining tick the counter resets to zero AND, if the
/// latch is currently true, the actor publishes <c>Stalled=false</c>. Only
/// transitions are published — repeated ticks in the same state are
/// silent so a downstream subscriber doesn't see a flood of redundant
/// notifications.
/// </remarks>
private void UpdateStalledState(string siteId, bool draining, Akka.Event.EventStream eventStream)
{
var wasStalled = _stalled.TryGetValue(siteId, out var prior) && prior;
if (draining)
{
_nonDrainingCycles[siteId] = 0;
if (wasStalled)
{
_stalled[siteId] = false;
eventStream.Publish(
new SiteAuditTelemetryStalledChanged(siteId, Stalled: false));
}
return;
}
var consecutive = _nonDrainingCycles.GetValueOrDefault(siteId) + 1;
_nonDrainingCycles[siteId] = consecutive;
if (consecutive >= _options.StalledAfterNonDrainingCycles && !wasStalled)
{
_stalled[siteId] = true;
eventStream.Publish(
new SiteAuditTelemetryStalledChanged(siteId, Stalled: true));
}
}
/// <inheritdoc />
protected override SupervisorStrategy SupervisorStrategy()
{
return new OneForOneStrategy(
maxNrOfRetries: 0,
withinTimeRange: TimeSpan.Zero,
decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
}
/// <summary>Self-tick triggering a reconciliation pass across all sites.</summary>
internal sealed class ReconciliationTick
{
public static readonly ReconciliationTick Instance = new();
private ReconciliationTick() { }
}
}
/// <summary>
/// Published on the actor system EventStream when a site's reconciliation
/// puller transitions into or out of the "stalled" state (backlog not
/// draining across multiple cycles). The M6 Bundle E central health collector
/// will subscribe to this and surface
/// <c>SiteAuditTelemetryStalled</c> on the health-report payload.
/// </summary>
public sealed record SiteAuditTelemetryStalledChanged(string SiteId, bool Stalled);
@@ -0,0 +1,60 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Tuning knobs for the central <see cref="SiteAuditReconciliationActor"/> singleton.
/// Defaults mirror the M6 Bundle B brief: pull every 5 minutes per site, 256 rows per
/// batch, declare a site "stalled" after two consecutive pull cycles return non-empty
/// AND <c>MoreAvailable=true</c> (the backlog is not draining).
/// </summary>
/// <remarks>
/// <para>
/// Per the M6 plan the reconciliation actor is the fallback when push telemetry is
/// lost; it is intentionally low-frequency. Lowering
/// <see cref="ReconciliationIntervalSeconds"/> in production trades MS SQL load for
/// fresher self-healing — keep the default unless a deployment can prove the extra
/// load is acceptable.
/// </para>
/// <para>
/// <see cref="StalledAfterNonDrainingCycles"/> = 2 because a single non-draining
/// cycle can happen on a surge (e.g. a backed-up site replays its hot queue); the
/// stalled signal should only fire when the backlog persists across cycles, which is
/// the symptom the central health surface is asking us to detect.
/// </para>
/// </remarks>
public sealed class SiteAuditReconciliationOptions
{
/// <summary>
/// Period of the reconciliation tick. Each tick visits every known site once.
/// </summary>
public int ReconciliationIntervalSeconds { get; set; } = 300;
/// <summary>
/// Test-only override for finer control over the tick cadence than
/// whole-second resolution allows. When non-null, takes precedence over
/// <see cref="ReconciliationIntervalSeconds"/>. Not bound from config —
/// production config exposes <see cref="ReconciliationIntervalSeconds"/>
/// only.
/// </summary>
public TimeSpan? ReconciliationIntervalOverride { get; set; }
/// <summary>
/// Resolves the effective tick interval, honouring the test override when
/// set. Falls back to <see cref="ReconciliationIntervalSeconds"/>.
/// </summary>
public TimeSpan ReconciliationInterval =>
ReconciliationIntervalOverride ?? TimeSpan.FromSeconds(ReconciliationIntervalSeconds);
/// <summary>
/// Maximum number of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
/// rows requested in a single <c>PullAuditEvents</c> RPC call.
/// </summary>
public int BatchSize { get; set; } = 256;
/// <summary>
/// Number of consecutive non-draining cycles (events returned AND
/// <c>MoreAvailable=true</c>) that must accumulate for a site before the actor
/// publishes <c>SiteAuditTelemetryStalledChanged(Stalled: true)</c> on the
/// EventStream.
/// </summary>
public int StalledAfterNonDrainingCycles { get; set; } = 2;
}
@@ -0,0 +1,203 @@
using System.Collections.Concurrent;
using Akka.Actor;
using Akka.Event;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T7) — central singleton that subscribes to the
/// actor system's EventStream for <see cref="SiteAuditTelemetryStalledChanged"/>
/// publications and maintains a per-site latched stalled-state map readable
/// via <see cref="Snapshot"/>. Consumed by the M6 Bundle E
/// <see cref="AuditCentralHealthSnapshot"/> aggregator so the central health
/// surface can surface per-site "reconciliation isn't draining" without
/// coupling the publisher (<see cref="SiteAuditReconciliationActor"/>) to the
/// health collection plumbing.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why an internal actor.</b> Akka.NET's <see cref="EventStream"/> only
/// supports <see cref="IActorRef"/> subscribers — there is no callback or
/// channel-based overload. The tracker therefore spawns a small subscriber
/// actor that forwards each event into the shared
/// <see cref="ConcurrentDictionary{TKey,TValue}"/> on the actor's thread, and
/// readers (<see cref="Snapshot"/>) take a copy off that dictionary on any
/// thread. Mirrors the <c>DeadLetterMonitorActor</c> shape — subscribe in
/// <see cref="ActorBase.PreStart"/>, unsubscribe in
/// <see cref="ActorBase.PostStop"/>, which the tracker triggers via a Stop
/// at <see cref="Dispose"/>.
/// </para>
/// <para>
/// <b>Per-site latching.</b> The publisher (<see cref="SiteAuditReconciliationActor"/>)
/// only publishes on stalled-state transitions, so the dictionary is the
/// authoritative latched state. Sites that have never published are absent
/// from the snapshot — the consumer surface treats absence as
/// <c>Stalled=false</c> (default healthy), the same default the reconciliation
/// actor's own internal latch uses.
/// </para>
/// <para>
/// <b>Singleton lifecycle.</b> Registered as a singleton via
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>;
/// <see cref="Dispose"/> tears the internal subscriber down at host shutdown.
/// </para>
/// </remarks>
public sealed class SiteAuditTelemetryStalledTracker : IDisposable
{
private readonly EventStream _eventStream;
private readonly ConcurrentDictionary<string, bool> _state = new();
private readonly IActorRef? _subscriber;
private readonly AuditCentralHealthSnapshot? _snapshot;
private bool _disposed;
/// <summary>
/// Construct around a bare <see cref="EventStream"/>. Intended for unit
/// tests where the caller wants to publish events without standing up an
/// actor system — the tracker registers a transient subscriber actor only
/// if the supplied stream is backed by an actor system. In the bare-stream
/// mode (no actor system) the tracker still exposes the
/// <see cref="Snapshot"/> surface but cannot self-subscribe; production
/// callers always go through <see cref="SiteAuditTelemetryStalledTracker(ActorSystem)"/>.
/// </summary>
/// <remarks>
/// Subscribing to <see cref="EventStream"/> requires an <see cref="IActorRef"/>,
/// which can only be created from an <see cref="ActorSystem"/>. The bare-
/// stream ctor therefore can NOT itself wire the subscriber — tests that
/// want event-driven updates must use the ActorSystem ctor (or push state
/// directly via <see cref="Apply"/>). The tests in
/// <c>SiteAuditTelemetryStalledTrackerTests</c> use the ActorSystem ctor
/// via Akka.TestKit so they exercise the production subscribe path.
/// </remarks>
/// <param name="eventStream">The actor system event stream to observe.</param>
public SiteAuditTelemetryStalledTracker(EventStream eventStream)
: this(eventStream, snapshot: null)
{
}
/// <summary>
/// Bare-stream ctor with an optional snapshot sink — the central
/// composition root passes the singleton
/// <see cref="AuditCentralHealthSnapshot"/> so every dictionary update
/// also lands on the central health surface. The bare ctor still cannot
/// subscribe (no actor system), but tests that drive the tracker via
/// <see cref="Apply"/> get the snapshot push for free.
/// </summary>
/// <param name="eventStream">The actor system event stream to observe.</param>
/// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
public SiteAuditTelemetryStalledTracker(EventStream eventStream, AuditCentralHealthSnapshot? snapshot)
{
_eventStream = eventStream ?? throw new ArgumentNullException(nameof(eventStream));
// No subscriber actor — see the remarks on the parameterless overload.
_subscriber = null;
_snapshot = snapshot;
}
/// <summary>
/// Production ctor: subscribes a small internal actor to the supplied
/// system's EventStream so every published
/// <see cref="SiteAuditTelemetryStalledChanged"/> updates the latched
/// per-site map. <see cref="Dispose"/> tears the subscriber down.
/// </summary>
/// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem)
: this(actorSystem, snapshot: null)
{
}
/// <summary>
/// Production ctor with a snapshot sink — every observed
/// <see cref="SiteAuditTelemetryStalledChanged"/> is mirrored onto the
/// shared <see cref="AuditCentralHealthSnapshot"/> so the central health
/// surface sees per-site stalled state without re-reading the tracker.
/// </summary>
/// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
/// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem, AuditCentralHealthSnapshot? snapshot)
{
ArgumentNullException.ThrowIfNull(actorSystem);
_eventStream = actorSystem.EventStream;
_snapshot = snapshot;
// Anonymous subscriber actor scoped to the system; props build it
// with a callback into THIS tracker's Apply method so the actor's
// single-threaded receive serialises every dictionary write.
_subscriber = actorSystem.ActorOf(
Props.Create(() => new StalledChangedSubscriber(this)),
name: $"site-audit-stalled-tracker-{Guid.NewGuid():N}");
// Subscribe synchronously from the ctor so the subscription is in
// place before the tracker is returned to the caller — the actor's
// own PreStart runs asynchronously and would otherwise race the
// first publish. EventStream.Subscribe is thread-safe.
_eventStream.Subscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged));
}
/// <summary>
/// Returns a defensive copy of the per-site latched stalled state.
/// Absent sites are interpreted as <c>Stalled=false</c> by consumers.
/// </summary>
public IReadOnlyDictionary<string, bool> Snapshot() =>
new Dictionary<string, bool>(_state);
/// <summary>
/// Applied by the internal subscriber actor on every
/// <see cref="SiteAuditTelemetryStalledChanged"/> publication. Exposed
/// internally so tests against the bare-stream ctor can still drive the
/// tracker, but the production path always goes through the actor.
/// </summary>
/// <param name="evt">The stalled-state change event to apply.</param>
internal void Apply(SiteAuditTelemetryStalledChanged evt)
{
if (evt is null) return;
_state[evt.SiteId] = evt.Stalled;
// Mirror into the central health snapshot if wired so a reader of
// IAuditCentralHealthSnapshot sees the same per-site state without
// a second lookup. Snapshot is optional (test composition roots may
// skip it) so the null-coalesce is the safe path.
_snapshot?.ApplyStalled(evt);
}
/// <summary>
/// Disposes the tracker and tears down the internal subscriber actor.
/// </summary>
public void Dispose()
{
if (_disposed) return;
_disposed = true;
if (_subscriber is not null)
{
// Unsubscribe runs in PostStop on the subscriber actor; Stop is
// fire-and-forget but the actor's PostStop hook is guaranteed to
// run before its mailbox is collected.
_subscriber.Tell(PoisonPill.Instance);
}
}
/// <summary>
/// Internal subscriber actor — receives every
/// <see cref="SiteAuditTelemetryStalledChanged"/> off the EventStream and
/// forwards it into the parent <see cref="SiteAuditTelemetryStalledTracker"/>.
/// Unlike <c>DeadLetterMonitorActor</c>, the subscription is registered by
/// the tracker constructor BEFORE this actor begins processing messages so
/// publishes that arrive between actor creation and PreStart cannot be
/// missed. Unsubscribe still runs in <see cref="PostStop"/>.
/// </summary>
private sealed class StalledChangedSubscriber : ReceiveActor
{
private readonly SiteAuditTelemetryStalledTracker _parent;
/// <summary>
/// Initializes a new subscriber actor that forwards events to the given tracker.
/// </summary>
/// <param name="parent">The parent tracker whose <see cref="Apply"/> method will be called for each event.</param>
public StalledChangedSubscriber(SiteAuditTelemetryStalledTracker parent)
{
_parent = parent;
Receive<SiteAuditTelemetryStalledChanged>(evt => _parent.Apply(evt));
}
/// <inheritdoc />
protected override void PostStop()
{
Context.System.EventStream.Unsubscribe(Self, typeof(SiteAuditTelemetryStalledChanged));
base.PostStop();
}
}
}
@@ -0,0 +1,49 @@
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
/// <summary>
/// Configuration for Audit Log (#23). Bound from the <c>AuditLog</c> section of
/// <c>appsettings.json</c>. Defaults reflect the design (alog.md §6, §10): an
/// 8 KiB payload-summary cap, a 64 KiB cap on error rows, and a 365-day central
/// retention window with monthly partition-switch purge. The default
/// header-redact list covers HTTP auth headers; per-target overrides extend
/// (never replace) the global redactor set.
/// </summary>
public sealed class AuditLogOptions
{
/// <summary>Default payload-summary cap in bytes (default 8 KiB).</summary>
public int DefaultCapBytes { get; set; } = 8192;
/// <summary>Payload-summary cap on error rows in bytes (default 64 KiB).</summary>
public int ErrorCapBytes { get; set; } = 65536;
/// <summary>HTTP headers redacted by default before persistence.</summary>
public List<string> HeaderRedactList { get; set; } = new()
{
"Authorization",
"X-Api-Key",
"Cookie",
"Set-Cookie",
};
/// <summary>Body-content redactors applied globally (regex patterns).</summary>
public List<string> GlobalBodyRedactors { get; set; } = new();
/// <summary>Per-target redaction overrides keyed by target identifier.</summary>
public Dictionary<string, PerTargetRedactionOverride> PerTargetOverrides { get; set; } = new();
/// <summary>Central retention window in days (default 365, range [30, 3650]).</summary>
public int RetentionDays { get; set; } = 365;
/// <summary>
/// Per-body byte ceiling applied to <see cref="AuditEvent.RequestSummary"/> and
/// <see cref="AuditEvent.ResponseSummary"/> for <see cref="AuditChannel.ApiInbound"/> rows
/// (default 1 MiB). The 8 KiB / 64 KiB default/error caps that apply to other channels
/// do not apply here — inbound traffic captures verbatim up to this ceiling and only
/// then sets <see cref="AuditEvent.PayloadTruncated"/>. See
/// <c>docs/plans/2026-05-23-inbound-api-full-response-audit-design.md</c>.
/// </summary>
public int InboundMaxBytes { get; set; } = 1_048_576;
}
@@ -0,0 +1,70 @@
using Microsoft.Extensions.Options;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
/// <summary>
/// Validates <see cref="AuditLogOptions"/> on startup. The caps drive payload
/// truncation in the M2+ writers, so an unset/zero cap would let arbitrarily
/// large blobs into the central <c>AuditLog</c> table. <see cref="AuditLogOptions.ErrorCapBytes"/>
/// must be at least as large as <see cref="AuditLogOptions.DefaultCapBytes"/>
/// because the error cap is meant to capture <em>more</em> detail than the
/// happy-path summary, not less. <see cref="AuditLogOptions.RetentionDays"/> is
/// bounded to <c>[30, 3650]</c> to keep purge windows sane: too short would
/// drop in-flight investigations, too long would defeat the partition-switch
/// purge's purpose.
/// </summary>
public sealed class AuditLogOptionsValidator : IValidateOptions<AuditLogOptions>
{
/// <summary>Inclusive lower bound for <see cref="AuditLogOptions.RetentionDays"/>.</summary>
public const int MinRetentionDays = 30;
/// <summary>Inclusive upper bound for <see cref="AuditLogOptions.RetentionDays"/>.</summary>
public const int MaxRetentionDays = 3650;
/// <summary>Inclusive lower bound for <see cref="AuditLogOptions.InboundMaxBytes"/> (8 KiB).</summary>
public const int MinInboundMaxBytes = 8_192;
/// <summary>Inclusive upper bound for <see cref="AuditLogOptions.InboundMaxBytes"/> (16 MiB).</summary>
public const int MaxInboundMaxBytes = 16_777_216;
/// <inheritdoc />
public ValidateOptionsResult Validate(string? name, AuditLogOptions options)
{
ArgumentNullException.ThrowIfNull(options);
var failures = new List<string>();
if (options.DefaultCapBytes <= 0)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}) " +
"must be > 0; it drives payload-summary truncation in audit writers.");
}
if (options.ErrorCapBytes < options.DefaultCapBytes)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.ErrorCapBytes)} ({options.ErrorCapBytes}) " +
$"must be >= {nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}); " +
"the error-row cap is intended to capture more detail than the happy-path summary.");
}
if (options.RetentionDays < MinRetentionDays || options.RetentionDays > MaxRetentionDays)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.RetentionDays)} ({options.RetentionDays}) " +
$"must be in [{MinRetentionDays}, {MaxRetentionDays}] days.");
}
if (options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
$"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
}
return failures.Count == 0
? ValidateOptionsResult.Success
: ValidateOptionsResult.Fail(failures);
}
}
@@ -0,0 +1,28 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
/// <summary>
/// Per-target redaction override applied additively on top of
/// <see cref="AuditLogOptions.GlobalBodyRedactors"/> and the
/// <see cref="AuditLogOptions.DefaultCapBytes"/> / <see cref="AuditLogOptions.ErrorCapBytes"/>
/// caps. Targets are identified by the script-facing external-system /
/// database / notification-list / inbound-API-key name.
/// </summary>
public sealed class PerTargetRedactionOverride
{
/// <summary>Optional payload cap override (bytes); null inherits the global cap.</summary>
public int? CapBytes { get; set; }
/// <summary>Additional body redactor regex patterns (appended to the global list).</summary>
public List<string>? AdditionalBodyRedactors { get; set; }
/// <summary>
/// Opt-in SQL parameter redaction: case-insensitive regex matched against
/// each SQL parameter NAME in the M4 <c>AuditingDbCommand</c> RequestSummary
/// JSON (<c>{"sql":"...","parameters":{"@name":"value", ...}}</c>); values
/// whose name matches are replaced with <c>&lt;redacted&gt;</c>. Null (the
/// default) means parameter values are captured verbatim. Only applied to
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditChannel.DbOutbound"/>
/// rows.
/// </summary>
public string? RedactSqlParamsMatching { get; set; }
}
@@ -0,0 +1,587 @@
using System.Collections.Concurrent;
using System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
using System.Text.Json.Nodes;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Default <see cref="IAuditPayloadFilter"/>. Bundle A established the
/// truncation backbone; Bundle B chains HTTP header redaction (M5-T3) BEFORE
/// truncation so redactors operate on the full payload and the cap then trims
/// the redacted result.
/// </summary>
/// <remarks>
/// <para>
/// Uses <see cref="IOptionsMonitor{TOptions}"/> (not <see cref="IOptions{TOptions}"/>)
/// so the M5-T8 hot-reload path sees fresh values without re-resolving the
/// singleton. <see cref="Apply"/> reads <see cref="IOptionsMonitor{T}.CurrentValue"/>
/// on every call, and the regex cache is keyed by pattern string — patterns
/// added via a live config change compile on first use of the next event;
/// patterns removed simply stop being looked up. No <c>OnChange</c> subscription
/// or explicit cache invalidation is required (the
/// <c>AuditLogOptionsBindingTests</c> fixture in <c>ZB.MOM.WW.ScadaBridge.AuditLog.Tests</c>
/// pins this behaviour).
/// </para>
/// <para>
/// "Error row" = <see cref="AuditEvent.Status"/> NOT IN (<c>Delivered</c>,
/// <c>Submitted</c>, <c>Forwarded</c>) — every other status, including the
/// non-terminal <c>Attempted</c>, the parked/discarded terminals, and the
/// short-circuit <c>Skipped</c>, receives the larger error cap so a verbose
/// error body survives.
/// </para>
/// <para>
/// Apply MUST NOT throw — on internal failure the filter over-redacts by
/// returning the input with <see cref="AuditEvent.PayloadTruncated"/> set and
/// increments the <c>AuditRedactionFailure</c> health metric via the injected
/// <see cref="IAuditRedactionFailureCounter"/>. Each redactor stage runs in
/// its own try/catch — a failure in (say) the header redactor still lets the
/// SQL parameter redactor and the truncator run on the remaining fields.
/// </para>
/// <para>
/// Stage order (each runs on every applicable field):
/// header redaction → body regex redaction → truncation. The SQL-parameter
/// stage piggybacks on the body-redactor path; both run BEFORE truncation so
/// the cap trims the redacted result, never bytes the redactor intended to
/// hide.
/// </para>
/// </remarks>
public sealed class DefaultAuditPayloadFilter : IAuditPayloadFilter
{
private const string RedactedMarker = "<redacted>";
private const string RedactorErrorMarker = "<redacted: redactor error>";
/// <summary>
/// Per-match regex timeout. Catastrophic-backtracking patterns trip a
/// <see cref="RegexMatchTimeoutException"/> when a single match takes
/// longer than this; the offending field is then over-redacted with
/// <see cref="RedactorErrorMarker"/> and the failure counter is bumped.
/// 50 ms is generous for normal patterns yet short enough that the
/// audit hot-path isn't held up by a misconfigured regex.
/// </summary>
private static readonly TimeSpan RegexMatchTimeout = TimeSpan.FromMilliseconds(50);
/// <summary>
/// JSON serializer options used to re-emit redacted summaries. The
/// UnsafeRelaxedJsonEscaping encoder is required so the redaction marker
/// (which contains <c>&lt;</c> / <c>&gt;</c>) survives unescaped — the
/// header-redaction tests grep for the literal marker, and the downstream
/// UI / log readers would rather see <c>&lt;redacted&gt;</c> than
/// <c><redacted></c>. The summaries are persisted to the audit
/// table and rendered in trusted-internal contexts only, so the relaxed
/// HTML-escaping rules do not introduce an XSS surface.
/// </summary>
private static readonly JsonSerializerOptions RedactedSummaryJsonOptions = new()
{
Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
};
private readonly IOptionsMonitor<AuditLogOptions> _options;
private readonly ILogger<DefaultAuditPayloadFilter> _logger;
private readonly IAuditRedactionFailureCounter _failureCounter;
/// <summary>
/// Compiled-regex cache keyed by pattern string. Lazy population: each
/// pattern is compiled on first use and cached forever (the entry's
/// <see cref="CompiledRegex"/> carries either the working <see cref="Regex"/>
/// or a sentinel marking the pattern as invalid so we don't retry the
/// failing compile on every call). ConcurrentDictionary is the right
/// thread-safety primitive here because the filter is a DI singleton
/// shared across the audit hot-path.
/// </summary>
private readonly ConcurrentDictionary<string, CompiledRegex> _regexCache = new();
/// <summary>
/// Primary constructor used by DI — pulls the optional redaction-failure
/// counter from the container; a NoOp default is registered in
/// <see cref="ServiceCollectionExtensions.AddAuditLog"/>.
/// </summary>
/// <param name="options">Live-reloadable audit log options.</param>
/// <param name="logger">Logger for redaction diagnostics.</param>
/// <param name="failureCounter">Optional counter incremented when a redaction operation fails; defaults to a no-op.</param>
public DefaultAuditPayloadFilter(
IOptionsMonitor<AuditLogOptions> options,
ILogger<DefaultAuditPayloadFilter> logger,
IAuditRedactionFailureCounter? failureCounter = null)
{
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_failureCounter = failureCounter ?? new NoOpAuditRedactionFailureCounter();
}
/// <inheritdoc />
public AuditEvent Apply(AuditEvent rawEvent)
{
try
{
var opts = _options.CurrentValue;
// Inbound API gets a dedicated, larger ceiling — request/response bodies are
// captured verbatim up to InboundMaxBytes (default 1 MiB) so support can
// replay exactly what the caller sent and what we returned. Other channels
// keep the global 8 KiB / 64 KiB policy.
// See docs/plans/2026-05-23-inbound-api-full-response-audit-design.md.
var cap = rawEvent.Channel == AuditChannel.ApiInbound
? opts.InboundMaxBytes
: (IsErrorStatus(rawEvent.Status) ? opts.ErrorCapBytes : opts.DefaultCapBytes);
// --- Header-redaction stage (runs BEFORE truncation) ----------
var request = RedactHeaders(rawEvent.RequestSummary, opts.HeaderRedactList);
var response = RedactHeaders(rawEvent.ResponseSummary, opts.HeaderRedactList);
var errorDetail = rawEvent.ErrorDetail;
var extra = rawEvent.Extra;
// --- Body-regex stage (also runs BEFORE truncation) -----------
// Resolves the active regex set per event so per-target overrides
// bound to AuditEvent.Target are picked up; effectively a no-op
// when neither GlobalBodyRedactors nor the per-target additions
// are configured.
var bodyRegexes = ResolveBodyRegexes(opts, rawEvent.Target);
if (bodyRegexes.Count > 0)
{
request = RedactBody(request, bodyRegexes);
response = RedactBody(response, bodyRegexes);
errorDetail = RedactBody(errorDetail, bodyRegexes);
extra = RedactBody(extra, bodyRegexes);
}
// --- SQL parameter redaction stage (DbOutbound only) ----------
// Parses the M4 AuditingDbCommand RequestSummary shape
// {"sql":"...","parameters":{...}} and redacts parameter VALUES
// whose NAME matches the per-connection regex. Opt-in: no
// PerTargetOverrides[connectionName].RedactSqlParamsMatching =>
// no-op. Channel-guarded so the same regex can never accidentally
// touch an ApiOutbound row.
if (rawEvent.Channel == AuditChannel.DbOutbound
&& TryGetSqlParamRedactor(opts, rawEvent.Target, out var sqlParamRegex))
{
request = RedactSqlParameters(request, sqlParamRegex!);
}
// --- Truncation stage -----------------------------------------
var truncated = false;
request = TruncateField(request, cap, ref truncated);
response = TruncateField(response, cap, ref truncated);
errorDetail = TruncateField(errorDetail, cap, ref truncated);
extra = TruncateField(extra, cap, ref truncated);
return rawEvent with
{
RequestSummary = request,
ResponseSummary = response,
ErrorDetail = errorDetail,
Extra = extra,
PayloadTruncated = rawEvent.PayloadTruncated || truncated,
};
}
catch (Exception ex)
{
// Audit is best-effort: over-redact rather than fail the caller.
// The per-stage try/catches above already handle redactor faults
// and increment the counter; this catch covers any unexpected
// surprise in the surrounding orchestration code.
_logger.LogWarning(
ex,
"Payload filter failed; returning raw event with PayloadTruncated=true");
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
return rawEvent with { PayloadTruncated = true };
}
}
/// <summary>
/// Parse <paramref name="json"/> as the documented
/// <c>{"headers": {...}, "body": ...}</c> shape and replace values whose
/// header NAME (case-insensitive) is in <paramref name="redactList"/> with
/// <see cref="RedactedMarker"/>. Re-serialises and returns the result.
/// </summary>
/// <remarks>
/// No-op pass-through for inputs that aren't JSON-shaped — emitters that
/// have not yet adopted the convention (the M2 site emitters today, which
/// leave RequestSummary null on outbound API calls) get a transparent
/// pass. If the redactor itself throws, we over-redact the whole field
/// with <see cref="RedactorErrorMarker"/> and bump the failure counter.
/// </remarks>
private string? RedactHeaders(string? json, IList<string> redactList)
{
if (json is null)
{
return null;
}
// Cheap structural pre-check: only attempt JSON parsing when the input
// actually looks like a JSON object. Saves the JsonDocument allocation
// on the (very common) non-JSON ErrorDetail / Extra fields.
var trimmed = json.AsSpan().TrimStart();
if (trimmed.Length == 0 || trimmed[0] != '{')
{
return json;
}
try
{
JsonNode? root;
try
{
root = JsonNode.Parse(json);
}
catch (JsonException)
{
// Not parseable JSON — leave the field alone (no error, no
// redaction). Emitters not yet using the documented shape get
// a transparent pass; Bundle C will update them.
return json;
}
if (root is not JsonObject obj || obj["headers"] is not JsonObject headers)
{
// No "headers" object at the top level — nothing to redact.
return json;
}
// Build a case-insensitive lookup of the redact list so we can do
// one O(1) check per header name without an inner Any() loop.
var redactSet = new HashSet<string>(redactList, StringComparer.OrdinalIgnoreCase);
// Take a snapshot of names first — we cannot mutate while
// enumerating the JsonObject.
var names = new List<string>(headers.Count);
foreach (var kvp in headers)
{
names.Add(kvp.Key);
}
foreach (var name in names)
{
if (redactSet.Contains(name))
{
headers[name] = JsonValue.Create(RedactedMarker);
}
}
return obj.ToJsonString(RedactedSummaryJsonOptions);
}
catch (Exception ex)
{
_logger.LogWarning(
ex,
"Header redactor faulted; over-redacting field with '{Marker}'",
RedactorErrorMarker);
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
return RedactorErrorMarker;
}
}
/// <summary>
/// Combine the global and per-target body-redactor lists for a single
/// event, returning the compiled-regex set to apply. Patterns that failed
/// compilation are silently skipped — the compile-time failure was logged
/// once on first encounter; we never let one bad pattern starve the rest.
/// </summary>
private IReadOnlyList<Regex> ResolveBodyRegexes(AuditLogOptions opts, string? target)
{
var hasGlobal = opts.GlobalBodyRedactors is { Count: > 0 };
var perTargetAdditions = (target != null
&& opts.PerTargetOverrides.TryGetValue(target, out var over)
&& over.AdditionalBodyRedactors is { Count: > 0 })
? over.AdditionalBodyRedactors
: null;
if (!hasGlobal && perTargetAdditions == null)
{
return Array.Empty<Regex>();
}
var result = new List<Regex>();
if (hasGlobal)
{
foreach (var pattern in opts.GlobalBodyRedactors)
{
if (TryGetCompiledRegex(pattern, out var rx))
{
result.Add(rx!);
}
}
}
if (perTargetAdditions != null)
{
foreach (var pattern in perTargetAdditions)
{
if (TryGetCompiledRegex(pattern, out var rx))
{
result.Add(rx!);
}
}
}
return result;
}
/// <summary>
/// Resolve a compiled regex from the cache, compiling it on first use.
/// Returns <c>false</c> for patterns that are invalid OR whose compile
/// took longer than 100 ms (the spec calls catastrophic-backtracking
/// guesses at compile time "invalid"); the failure is logged once and
/// the sentinel cache entry prevents repeat compile attempts.
/// </summary>
private bool TryGetCompiledRegex(string pattern, out Regex? regex)
{
var entry = _regexCache.GetOrAdd(pattern, CompileRegex);
regex = entry.Regex;
return entry.Regex != null;
}
private CompiledRegex CompileRegex(string pattern)
{
try
{
var swStart = System.Diagnostics.Stopwatch.GetTimestamp();
var rx = new Regex(pattern, RegexOptions.Compiled, RegexMatchTimeout);
var elapsedMs = (System.Diagnostics.Stopwatch.GetTimestamp() - swStart)
* 1000d / System.Diagnostics.Stopwatch.Frequency;
if (elapsedMs > 100)
{
_logger.LogWarning(
"Body redactor pattern compiled in {Elapsed}ms (> 100ms cap); rejecting '{Pattern}'",
elapsedMs, pattern);
return CompiledRegex.Invalid;
}
return new CompiledRegex(rx);
}
catch (Exception ex)
{
_logger.LogWarning(
ex,
"Body redactor pattern '{Pattern}' failed to compile; skipping",
pattern);
return CompiledRegex.Invalid;
}
}
/// <summary>
/// Apply each compiled body-redactor regex to <paramref name="value"/> in
/// turn, replacing every match with <see cref="RedactedMarker"/>. If any
/// single regex match throws (most commonly
/// <see cref="RegexMatchTimeoutException"/>) the field is over-redacted
/// with <see cref="RedactorErrorMarker"/> and the failure counter is
/// incremented — the user-facing action is never aborted.
/// </summary>
private string? RedactBody(string? value, IReadOnlyList<Regex> regexes)
{
if (value is null)
{
return null;
}
var current = value;
foreach (var rx in regexes)
{
try
{
current = rx.Replace(current, RedactedMarker);
}
catch (Exception ex)
{
_logger.LogWarning(
ex,
"Body redactor '{Pattern}' faulted; over-redacting field with '{Marker}'",
rx.ToString(), RedactorErrorMarker);
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
return RedactorErrorMarker;
}
}
return current;
}
/// <summary>
/// Resolve the per-connection SQL parameter redaction regex for the given
/// DbOutbound event target. Target shape (M4 AuditingDbCommand): the
/// connection name optionally followed by <c>.&lt;sql-snippet&gt;</c> for
/// disambiguation; the per-target dictionary is keyed by the connection
/// name alone, so we strip the snippet suffix before lookup. Patterns are
/// compiled with case-insensitive matching to match the documented
/// behaviour.
/// </summary>
private bool TryGetSqlParamRedactor(AuditLogOptions opts, string? target, out Regex? regex)
{
regex = null;
if (string.IsNullOrEmpty(target))
{
return false;
}
var dot = target.IndexOf('.');
var connectionKey = dot < 0 ? target : target[..dot];
if (!opts.PerTargetOverrides.TryGetValue(connectionKey, out var over)
|| string.IsNullOrEmpty(over.RedactSqlParamsMatching))
{
return false;
}
// Force case-insensitivity per the spec — even if the operator wrote
// the pattern without an IgnoreCase flag. The compile cache key folds
// the option to keep the entries unambiguous.
var cacheKey = "(?i)" + over.RedactSqlParamsMatching;
if (!TryGetCompiledRegex(cacheKey, out regex))
{
return false;
}
return true;
}
/// <summary>
/// Walk the M4 <c>{"sql":"...","parameters":{...}}</c> RequestSummary
/// shape; for each parameter whose NAME matches
/// <paramref name="paramNameRegex"/>, replace its value with
/// <see cref="RedactedMarker"/>. Re-serialise.
/// </summary>
/// <remarks>
/// No-op pass-through when the input isn't parseable JSON, isn't a JSON
/// object, or doesn't carry a top-level <c>"parameters"</c> object. On
/// any unexpected fault the field is over-redacted with
/// <see cref="RedactorErrorMarker"/> and the failure counter is bumped.
/// </remarks>
private string? RedactSqlParameters(string? json, Regex paramNameRegex)
{
if (json is null)
{
return null;
}
var trimmed = json.AsSpan().TrimStart();
if (trimmed.Length == 0 || trimmed[0] != '{')
{
return json;
}
try
{
JsonNode? root;
try
{
root = JsonNode.Parse(json);
}
catch (JsonException)
{
return json;
}
if (root is not JsonObject obj || obj["parameters"] is not JsonObject parameters)
{
return json;
}
// Snapshot the names — mutating during enumeration is unsupported.
var names = new List<string>(parameters.Count);
foreach (var kvp in parameters)
{
names.Add(kvp.Key);
}
var anyChanged = false;
foreach (var name in names)
{
bool matched;
try
{
matched = paramNameRegex.IsMatch(name);
}
catch (Exception ex)
{
_logger.LogWarning(
ex,
"SQL parameter redactor faulted; over-redacting field with '{Marker}'",
RedactorErrorMarker);
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
return RedactorErrorMarker;
}
if (matched)
{
parameters[name] = JsonValue.Create(RedactedMarker);
anyChanged = true;
}
}
// Avoid re-serialising (which would normalise whitespace / order)
// when no parameter matched — keeps the on-disk row byte-identical
// to the emitter's output on the no-match path.
return anyChanged ? obj.ToJsonString(RedactedSummaryJsonOptions) : json;
}
catch (Exception ex)
{
_logger.LogWarning(
ex,
"SQL parameter redactor faulted; over-redacting field with '{Marker}'",
RedactorErrorMarker);
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
return RedactorErrorMarker;
}
}
private static string? TruncateField(string? value, int cap, ref bool truncated)
{
if (value is null)
{
return null;
}
var result = TruncateUtf8(value, cap);
if (result.Length != value.Length)
{
truncated = true;
}
return result;
}
/// <summary>
/// UTF-8 byte-safe truncation. Encodes the input to UTF-8, walks back from
/// the cap position until the byte is NOT a continuation byte
/// (<c>byte &amp; 0xC0 == 0x80</c>), and decodes the resulting prefix —
/// guaranteeing the returned string never splits a multi-byte sequence.
/// </summary>
private static string TruncateUtf8(string value, int capBytes)
{
if (string.IsNullOrEmpty(value))
{
return value;
}
var bytes = Encoding.UTF8.GetBytes(value);
if (bytes.Length <= capBytes)
{
return value;
}
var boundary = capBytes;
while (boundary > 0 && (bytes[boundary] & 0xC0) == 0x80)
{
boundary--;
}
return Encoding.UTF8.GetString(bytes, 0, boundary);
}
private static bool IsErrorStatus(AuditStatus status) => status switch
{
AuditStatus.Delivered or AuditStatus.Submitted or AuditStatus.Forwarded => false,
_ => true,
};
/// <summary>
/// Cache entry for a body-redactor pattern. Carries the working
/// <see cref="Regex"/> on the success path, or the
/// <see cref="Invalid"/> sentinel for patterns that failed to compile
/// (or exceeded the 100 ms compile budget). The sentinel lets us skip
/// repeat compile attempts on every event without re-throwing on the
/// hot-path.
/// </summary>
private readonly struct CompiledRegex
{
public static readonly CompiledRegex Invalid = new(null);
/// <summary>Gets the compiled <see cref="System.Text.RegularExpressions.Regex"/>, or <c>null</c> when the pattern was invalid.</summary>
public Regex? Regex { get; }
/// <summary>Initializes a new <see cref="CompiledRegex"/> wrapping the given compiled regex instance.</summary>
/// <param name="regex">The pre-compiled regex, or <c>null</c> to represent an invalid pattern.</param>
public CompiledRegex(Regex? regex) => Regex = regex;
}
}
@@ -0,0 +1,31 @@
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Filters an <see cref="AuditEvent"/> between construction and persistence —
/// truncates oversized payload fields, applies header/body/SQL-parameter
/// redaction, sets <see cref="AuditEvent.PayloadTruncated"/>.
/// </summary>
/// <remarks>
/// <para>
/// Pure function: returns a filtered COPY of the input via <c>with</c>
/// expressions; never throws (over-redacts on internal failure and increments
/// the <c>AuditRedactionFailure</c> health metric).
/// </para>
/// <para>
/// Wired in M5 between event construction and the writer chain
/// (<c>FallbackAuditWriter.WriteAsync</c>, <c>CentralAuditWriter.WriteAsync</c>,
/// and the <c>AuditLogIngestActor</c> handlers).
/// </para>
/// </remarks>
public interface IAuditPayloadFilter
{
/// <summary>
/// Apply the configured truncation + redaction policy to <paramref name="rawEvent"/>
/// and return a filtered copy. MUST NOT throw — on internal failure, over-redact
/// and surface the failure via the audit-redaction-failure health metric.
/// </summary>
/// <param name="rawEvent">The unfiltered audit event to process.</param>
AuditEvent Apply(AuditEvent rawEvent);
}
@@ -0,0 +1,20 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Counter sink invoked by <see cref="DefaultAuditPayloadFilter"/> every time
/// a redactor (header / body regex / SQL parameter) throws and the filter has
/// to over-redact the offending field with the
/// <c>&lt;redacted: redactor error&gt;</c> marker. Bundle C bridges this into
/// the Site Health Monitoring report payload as <c>AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
/// Redaction failures must NEVER abort the user-facing action (alog.md §7) —
/// the filter over-redacts the field and surfaces the failure via this counter
/// instead. A NoOp default is the correct safe fallback while the health
/// metric is being wired in.
/// </remarks>
public interface IAuditRedactionFailureCounter
{
/// <summary>Increment the audit-redaction failure counter by one.</summary>
void Increment();
}
@@ -0,0 +1,17 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Default <see cref="IAuditRedactionFailureCounter"/> binding used when the
/// Site Health Monitoring bridge has not been wired yet. Bundle C replaces
/// this registration with the real counter that surfaces in the site health
/// report payload as <c>AuditRedactionFailure</c>.
/// </summary>
public sealed class NoOpAuditRedactionFailureCounter : IAuditRedactionFailureCounter
{
/// <inheritdoc/>
public void Increment()
{
// Intentionally empty — Bundle C overrides this binding with the real
// health-metric counter.
}
}
@@ -0,0 +1,79 @@
using System.Text.RegularExpressions;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// AuditLog-008: minimal always-safe fallback filter used by the writer chain
/// when no <see cref="IAuditPayloadFilter"/> is injected (test composition
/// roots, future composition roots that bypass <c>AddAuditLog</c>). Performs
/// HTTP header redaction for the always-sensitive defaults
/// (Authorization, X-Api-Key, Cookie, Set-Cookie) so a fixture that wires a
/// real <see cref="AuditEvent.RequestSummary"/> never persists those headers
/// in cleartext. Does NOT perform body-regex redaction, SQL-parameter
/// redaction, or truncation — those stages need
/// <see cref="DefaultAuditPayloadFilter"/> with live options. The contract is:
/// over-redact safely, never throw, never miss a header that's on the
/// default sensitive list.
/// </summary>
public sealed class SafeDefaultAuditPayloadFilter : IAuditPayloadFilter
{
/// <summary>Singleton instance — the filter is stateless and side-effect-free.</summary>
public static SafeDefaultAuditPayloadFilter Instance { get; } = new SafeDefaultAuditPayloadFilter();
private static readonly string[] DefaultHeaderRedactList =
{
"Authorization",
"X-Api-Key",
"Cookie",
"Set-Cookie",
};
private static readonly Regex HeaderRegex = new(
@"(?<name>[A-Za-z][A-Za-z0-9\-_]*)\s*:\s*(?<value>[^\r\n]*)",
RegexOptions.Compiled | RegexOptions.IgnoreCase);
private SafeDefaultAuditPayloadFilter() { }
/// <inheritdoc />
public AuditEvent Apply(AuditEvent rawEvent)
{
ArgumentNullException.ThrowIfNull(rawEvent);
try
{
return rawEvent with
{
RequestSummary = RedactHeaders(rawEvent.RequestSummary),
ResponseSummary = RedactHeaders(rawEvent.ResponseSummary),
};
}
catch
{
// Over-redact: drop both summaries entirely so a malformed parse
// path never leaks the original. The contract is "never throw."
return rawEvent with
{
RequestSummary = "[redacted by SafeDefaultAuditPayloadFilter]",
ResponseSummary = "[redacted by SafeDefaultAuditPayloadFilter]",
};
}
}
private static string? RedactHeaders(string? summary)
{
if (string.IsNullOrEmpty(summary)) return summary;
return HeaderRegex.Replace(summary, m =>
{
var name = m.Groups["name"].Value;
foreach (var sensitive in DefaultHeaderRedactList)
{
if (string.Equals(name, sensitive, StringComparison.OrdinalIgnoreCase))
{
return $"{name}: [REDACTED]";
}
}
return m.Value;
});
}
}
@@ -0,0 +1,362 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.AuditLog.Site;
using ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
namespace ZB.MOM.WW.ScadaBridge.AuditLog;
/// <summary>
/// Composition root for the Audit Log (#23) component.
/// </summary>
/// <remarks>
/// <para>
/// M1 registered <see cref="AuditLogOptions"/> + the validator. M2 Bundle E
/// extends the surface with the site-side writer chain
/// (<see cref="SqliteAuditWriter"/> + <see cref="RingBufferFallback"/> +
/// <see cref="FallbackAuditWriter"/>) and the telemetry collaborators
/// (<see cref="ISiteAuditQueue"/>, <see cref="ISiteStreamAuditClient"/>,
/// <see cref="IAuditWriteFailureCounter"/>, <see cref="SiteAuditTelemetryOptions"/>,
/// <see cref="SqliteAuditWriterOptions"/>).
/// </para>
/// <para>
/// Audit Log (#23) sits alongside Notification Outbox (#21) and Site Call
/// Audit (#22). <c>IAuditLogRepository</c> is registered by
/// <c>ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.ServiceCollectionExtensions.AddConfigurationDatabase</c>,
/// so the caller (the Host on the central node) must also call that.
/// </para>
/// </remarks>
public static class ServiceCollectionExtensions
{
/// <summary>Configuration section bound to <see cref="AuditLogOptions"/>.</summary>
public const string ConfigSectionName = "AuditLog";
/// <summary>Configuration section bound to <see cref="SqliteAuditWriterOptions"/>.</summary>
public const string SiteWriterSectionName = "AuditLog:SiteWriter";
/// <summary>Configuration section bound to <see cref="SiteAuditTelemetryOptions"/>.</summary>
public const string SiteTelemetrySectionName = "AuditLog:SiteTelemetry";
/// <summary>Configuration section bound to <see cref="AuditLogPartitionMaintenanceOptions"/>.</summary>
public const string PartitionMaintenanceSectionName = "AuditLog:PartitionMaintenance";
/// <summary>
/// Registers the Audit Log (#23) component services: options, the site
/// SQLite writer chain (primary + ring fallback + failure-counter sink),
/// and the site-→central telemetry collaborators. Idempotent re-registration
/// is not supported; call this exactly once per <see cref="IServiceCollection"/>.
/// </summary>
/// <param name="services">The service collection to register into.</param>
/// <param name="config">Application configuration used to bind <see cref="AuditLogOptions"/> and related options sections.</param>
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
public static IServiceCollection AddAuditLog(this IServiceCollection services, IConfiguration config)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(config);
// M1: top-level AuditLogOptions + validator (redaction policy, payload caps, etc.).
services.AddOptions<AuditLogOptions>()
.Bind(config.GetSection(ConfigSectionName))
.ValidateOnStart();
services.AddSingleton<IValidateOptions<AuditLogOptions>, AuditLogOptionsValidator>();
// M5 Bundle A: payload filter — truncates oversized RequestSummary /
// ResponseSummary / ErrorDetail / Extra fields between event
// construction and persistence. Bundle B layers header / body /
// SQL-parameter redaction onto the same singleton; Bundle C wires it
// into the FallbackAuditWriter / CentralAuditWriter / IngestActor
// paths. Singleton — the filter is stateless and the IOptionsMonitor
// dependency picks up M5-T8 hot reloads on its own.
services.AddSingleton<IAuditPayloadFilter, DefaultAuditPayloadFilter>();
// M5 Bundle B: per-stage redactor-failure counter. NoOp default;
// Bundle C replaces this binding with the Site Health Monitoring
// bridge that surfaces failures as AuditRedactionFailure on the site
// health report.
services.TryAddSingleton<IAuditRedactionFailureCounter, NoOpAuditRedactionFailureCounter>();
// M2 Bundle E: site writer + telemetry options bindings.
// BindConfiguration is not used because the configuration root supplied
// by the caller may not be the application root — we go through the
// section explicitly so a partial IConfiguration (e.g. a test stub
// anchored on the AuditLog section's parent) still works.
services.AddOptions<SqliteAuditWriterOptions>()
.Bind(config.GetSection(SiteWriterSectionName));
services.AddOptions<SiteAuditTelemetryOptions>()
.Bind(config.GetSection(SiteTelemetrySectionName));
// SqliteAuditWriter is a singleton with a single owned SqliteConnection
// and a background writer Task; multiple instances would race on the
// same file. Registered concretely so the ISiteAuditQueue + IAuditWriter
// forwards below resolve to the same instance — the actor must observe
// the writes made via the hot-path interface.
services.AddSingleton<SqliteAuditWriter>();
services.AddSingleton<ISiteAuditQueue>(sp => sp.GetRequiredService<SqliteAuditWriter>());
// RingBufferFallback: drop-oldest in-memory ring used by
// FallbackAuditWriter when the primary SQLite writer throws. Default
// capacity is fine for M2 (1024).
services.AddSingleton<RingBufferFallback>();
// IAuditWriteFailureCounter: NoOp default. Bundle G overrides this
// binding with the real Site Health Monitoring counter. Registered
// before FallbackAuditWriter so the factory can resolve it.
services.AddSingleton<IAuditWriteFailureCounter, NoOpAuditWriteFailureCounter>();
// The script-thread surface is FallbackAuditWriter (primary + ring +
// counter), not the raw SqliteAuditWriter — primary failures must NEVER
// abort the user-facing action.
// Bundle C (M5-T6): the IAuditPayloadFilter singleton above is wired
// through the factory so every event written through this surface is
// truncated + redacted before it hits SQLite (and the ring on
// failure).
services.AddSingleton<IAuditWriter>(sp => new FallbackAuditWriter(
primary: sp.GetRequiredService<SqliteAuditWriter>(),
ring: sp.GetRequiredService<RingBufferFallback>(),
failureCounter: sp.GetRequiredService<IAuditWriteFailureCounter>(),
logger: sp.GetRequiredService<ILogger<FallbackAuditWriter>>(),
filter: sp.GetRequiredService<IAuditPayloadFilter>()));
// ISiteStreamAuditClient: NoOp default. This binding remains correct for
// central/test composition roots that have no SiteCommunicationActor.
// The real implementation is ClusterClientSiteAuditClient, which pushes
// audit telemetry to central over Akka ClusterClient via the site's
// SiteCommunicationActor — the Host wires it directly into the
// SiteAuditTelemetryActor's Props.Create call for site roles (it cannot
// be a DI singleton because it needs the SiteCommunicationActor IActorRef,
// created during Akka bootstrap, not at DI-composition time).
services.AddSingleton<ISiteStreamAuditClient, NoOpSiteStreamAuditClient>();
// M3 Bundle F: site-side dual emitter for cached-call lifecycle
// telemetry. ScriptRuntimeContext.ExternalSystem.CachedCall /
// Database.CachedWrite resolves this through DI and pushes one combined
// packet per lifecycle event; the forwarder writes the audit half
// through IAuditWriter and the operational half through the
// IOperationTrackingStore. The audit writer is always wired (the M2
// chain above); the operational tracking store is SITE-ONLY (registered
// by ZB.MOM.WW.ScadaBridge.SiteRuntime). On a Central composition root the tracking
// store has no registration, so the factory resolves it with GetService
// (returning null) — the forwarder degrades to "audit-only" emission,
// mirroring the lazy IAuditWriter chain established in M2.
services.AddSingleton<ICachedCallTelemetryForwarder>(sp =>
new CachedCallTelemetryForwarder(
sp.GetRequiredService<IAuditWriter>(),
sp.GetService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.IOperationTrackingStore>(),
sp.GetRequiredService<ILogger<CachedCallTelemetryForwarder>>(),
// AuditLog-007: INodeIdentityProvider is now required across
// every consumer in AddAuditLog. The Host's
// SiteServiceRegistration registers it as a singleton on both
// site and central paths (InboundAPI-022 / Host registration
// sweep), and the AddAuditLogTests fixture provides a
// FakeNodeIdentityProvider; a silent GetService() that
// returned null would mask a future composition root that
// forgot to register the provider.
sp.GetRequiredService<INodeIdentityProvider>()));
// M3 Bundle F: bridge the store-and-forward retry-loop observer hook
// to the cached-call forwarder so per-attempt + terminal telemetry
// emitted from the S&F retry sweep lands on the same SQLite hot-path
// as the script-thread CachedSubmit row. Registered as a singleton
// and also bound to ICachedCallLifecycleObserver so AddStoreAndForward
// can resolve it through DI (Bundle F StoreAndForward wiring change).
// SourceNode-stamping (Task 14): factory-resolved so the
// INodeIdentityProvider singleton can be threaded through — the
// bridge stamps SiteCallOperational.SourceNode from
// INodeIdentityProvider.NodeName on every cached-call lifecycle row.
// AuditLog-007: the provider is resolved with GetRequiredService —
// SiteServiceRegistration.BindSharedOptions registers it on both
// site and central paths, so a missing registration is a
// composition-root bug, not a silent null-SourceNode degradation.
services.AddSingleton<CachedCallLifecycleBridge>(sp => new CachedCallLifecycleBridge(
sp.GetRequiredService<ICachedCallTelemetryForwarder>(),
sp.GetRequiredService<ILogger<CachedCallLifecycleBridge>>(),
// AuditLog-007: required, matches the other consumers in this
// composition root — the provider is always registered by
// SiteServiceRegistration.
sp.GetRequiredService<INodeIdentityProvider>()));
services.AddSingleton<ICachedCallLifecycleObserver>(
sp => sp.GetRequiredService<CachedCallLifecycleBridge>());
// M6 Bundle E (T8): central audit-write failure counter — NoOp default
// for site/test composition roots that don't wire the central health
// snapshot. AddAuditLogCentralMaintenance below replaces this binding
// with the AuditCentralHealthSnapshot implementation so increments
// surface on the central dashboard.
services.TryAddSingleton<ICentralAuditWriteFailureCounter, NoOpCentralAuditWriteFailureCounter>();
// M4 Bundle B: central direct-write audit writer used by
// NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to
// emit AuditLog rows that originate ON central, not via site telemetry.
// Singleton — the writer is stateless; its per-call scope opens a fresh
// IAuditLogRepository (a SCOPED EF Core service registered by
// ZB.MOM.WW.ScadaBridge.ConfigurationDatabase). The interface (ICentralAuditWriter)
// is intentionally distinct from IAuditWriter so site composition roots
// do not accidentally bind it; central composition roots that include
// AddConfigurationDatabase get a working implementation transparently.
// Bundle C (M5-T6): wire the IAuditPayloadFilter into the factory so
// NotificationOutboxActor + Inbound API rows are truncated + redacted
// before they hit MS SQL.
// M6 Bundle E (T8): also wire the ICentralAuditWriteFailureCounter
// so swallowed repo throws bump the central health counter.
services.AddSingleton<ICentralAuditWriter>(sp => new CentralAuditWriter(
sp,
sp.GetRequiredService<ILogger<CentralAuditWriter>>(),
sp.GetRequiredService<IAuditPayloadFilter>(),
sp.GetRequiredService<ICentralAuditWriteFailureCounter>(),
// SourceNode-stamping (Task 12): wire the local node identity so
// central-origin rows (Notification Outbox dispatch, Inbound API)
// carry the writing node's identifier when the caller hasn't
// already supplied one. GetRequiredService — the production
// composition root in SiteServiceRegistration registers the
// provider as a singleton on both site and central paths.
sp.GetRequiredService<INodeIdentityProvider>()));
return services;
}
/// <summary>
/// Audit Log (#23) M2 Bundle G + M5 Bundle C — swap the default
/// <see cref="NoOpAuditWriteFailureCounter"/> and
/// <see cref="NoOpAuditRedactionFailureCounter"/> registrations for the
/// real <see cref="HealthMetricsAuditWriteFailureCounter"/> /
/// <see cref="HealthMetricsAuditRedactionFailureCounter"/> bridges so the
/// FallbackAuditWriter primary-failure counter AND the
/// DefaultAuditPayloadFilter redactor-failure counter both surface in the
/// site health report payload as
/// <c>SiteHealthReport.SiteAuditWriteFailures</c> +
/// <c>SiteHealthReport.AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
/// <para>
/// Must be called AFTER both <see cref="AddAuditLog"/> (registers the
/// NoOp defaults this method replaces) and
/// <c>ZB.MOM.WW.ScadaBridge.HealthMonitoring.ServiceCollectionExtensions.AddHealthMonitoring</c>
/// or <c>AddSiteHealthMonitoring</c> (registers the
/// <see cref="ISiteHealthCollector"/> the bridges depend on). Resolving
/// <see cref="IAuditWriteFailureCounter"/> or
/// <see cref="IAuditRedactionFailureCounter"/> without the latter throws
/// <see cref="InvalidOperationException"/> at <c>GetRequiredService</c>
/// time — by design, since a silent NoOp would mask a misconfiguration.
/// </para>
/// <para>
/// Idempotent — a sentinel check on the
/// <see cref="SiteAuditBacklogReporter"/> hosted-service descriptor
/// short-circuits subsequent calls so the hosted service is not
/// double-registered (AddHostedService has no TryAdd variant).
/// </para>
/// <para>
/// Site-side only for M5: the central composition root keeps the NoOp
/// defaults; the central health-metric surface that would expose
/// <c>AuditRedactionFailure</c> next to the existing central counters
/// ships in M6.
/// </para>
/// </remarks>
/// <param name="services">The service collection to register into.</param>
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
public static IServiceCollection AddAuditLogHealthMetricsBridge(this IServiceCollection services)
{
ArgumentNullException.ThrowIfNull(services);
// AuditLog-011: guard against double-registration. AddHostedService is
// additive (no TryAdd variant) so a second call without this sentinel
// would spin up a second SiteAuditBacklogReporter, doubling the 30 s
// SQL probe rate and racing on the ISiteHealthCollector snapshot. The
// SiteAuditBacklogReporter descriptor is the discriminator: it's only
// registered by this helper, so its presence proves the bridge has
// already been wired.
if (services.Any(d => d.ImplementationType == typeof(SiteAuditBacklogReporter)))
{
return services;
}
services.Replace(
ServiceDescriptor.Singleton<IAuditWriteFailureCounter, HealthMetricsAuditWriteFailureCounter>());
services.Replace(
ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, HealthMetricsAuditRedactionFailureCounter>());
// M6 Bundle E (T6): the site-side backlog reporter polls the
// SqliteAuditWriter every 30 s and pushes the snapshot into the
// collector so the next SiteHealthReport carries a fresh
// SiteAuditBacklog field. Registered alongside the other site-only
// metric bridges so AddAuditLog (which runs on central too) stays
// free of hosted-service registrations that would resolve a missing
// ISiteHealthCollector on central.
services.AddHostedService<SiteAuditBacklogReporter>();
return services;
}
/// <summary>
/// Audit Log (#23) M6-T5 Bundle D — central-only registration for the
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service plus
/// its <see cref="AuditLogPartitionMaintenanceOptions"/> binding. Must be
/// called from the Central role's composition root (not from a site
/// composition root); the underlying <c>IPartitionMaintenance</c>
/// implementation is registered by <c>AddConfigurationDatabase</c> and
/// only exists on the central node.
/// </summary>
/// <remarks>
/// <para>
/// Separated from <see cref="AddAuditLog"/> because <c>AddAuditLog</c> is
/// also invoked from site composition roots — silently starting a
/// hosted service that resolves an unregistered dependency on a site
/// would fail every tick. Keeping the central-only registration in its
/// own helper preserves the "every <c>Add*</c> call is safe to issue
/// from any composition root" invariant.
/// </para>
/// </remarks>
/// <param name="services">The service collection to register into.</param>
/// <param name="config">Application configuration used to bind partition maintenance options.</param>
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
public static IServiceCollection AddAuditLogCentralMaintenance(
this IServiceCollection services,
IConfiguration config)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(config);
services.AddOptions<AuditLogPartitionMaintenanceOptions>()
.Bind(config.GetSection(PartitionMaintenanceSectionName));
services.AddHostedService<AuditLogPartitionMaintenanceService>();
// M6 Bundle E (T8 + T9): central health snapshot — a single object
// that owns the CentralAuditWriteFailures + AuditRedactionFailure
// Interlocked counters AND surfaces them on
// IAuditCentralHealthSnapshot. The same instance is bound to BOTH
// writer-side interfaces (ICentralAuditWriteFailureCounter +
// IAuditRedactionFailureCounter) so every central-side increment
// routes into the shared counters; site nodes keep their existing
// Site bridges (registered by AddAuditLogHealthMetricsBridge) so
// the same counter type does not shadow the site-side metric.
// The snapshot itself has no actor-system dependency — the
// per-site stalled latch is fed by SiteAuditTelemetryStalledTracker
// which the Akka bootstrap wires up after ActorSystem.Create returns
// (the tracker is NOT registered here because its construction
// requires ActorSystem, which is not a DI-resolvable singleton).
services.AddSingleton<AuditCentralHealthSnapshot>();
services.AddSingleton<IAuditCentralHealthSnapshot>(
sp => sp.GetRequiredService<AuditCentralHealthSnapshot>());
services.Replace(ServiceDescriptor.Singleton<ICentralAuditWriteFailureCounter>(
sp => sp.GetRequiredService<AuditCentralHealthSnapshot>()));
// M6 Bundle E (T9): override the NoOp IAuditRedactionFailureCounter
// (registered by AddAuditLog) with the CentralAuditRedactionFailureCounter
// bridge so payload-filter throws on CentralAuditWriter /
// AuditLogIngestActor paths surface on the central dashboard. The
// bridge is a thin wrapper around the AuditCentralHealthSnapshot
// singleton so all central redactor failures route into the same
// counter as CentralAuditWriteFailures. The site composition root
// overrides this binding AGAIN via AddAuditLogHealthMetricsBridge —
// central nodes do not call that bridge, so this is the final
// binding on a central host. Mirrors the M5 Bundle C
// HealthMetricsAuditRedactionFailureCounter shape one-for-one.
services.Replace(ServiceDescriptor.Singleton<IAuditRedactionFailureCounter,
CentralAuditRedactionFailureCounter>());
return services;
}
}
@@ -0,0 +1,165 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Composes the primary <see cref="SqliteAuditWriter"/> with a drop-oldest
/// <see cref="RingBufferFallback"/>. Audit writes are best-effort by contract
/// (see <see cref="IAuditWriter"/>) — a primary failure must NEVER bubble out
/// to the calling script. Failed events are stashed in the ring; on the next
/// successful primary write the ring is drained back through the primary in
/// FIFO order.
/// </summary>
/// <remarks>
/// <para>
/// Each primary failure increments <see cref="IAuditWriteFailureCounter"/> so
/// Site Health Monitoring can surface a sustained outage as
/// <c>SiteAuditWriteFailures</c> (Bundle G).
/// </para>
/// <para>
/// Errors raised by the ring drain on recovery are logged and silently dropped
/// so we don't loop the failure mode — the trigger event itself succeeded, and
/// retrying the drain on the NEXT successful write is the recovery path.
/// </para>
/// </remarks>
public sealed class FallbackAuditWriter : IAuditWriter
{
private readonly IAuditWriter _primary;
private readonly RingBufferFallback _ring;
private readonly IAuditWriteFailureCounter _failureCounter;
private readonly ILogger<FallbackAuditWriter> _logger;
private readonly IAuditPayloadFilter _filter;
private readonly SemaphoreSlim _drainGate = new(1, 1);
/// <summary>
/// Bundle C (M5-T6) wires the singleton <see cref="IAuditPayloadFilter"/>
/// here so every event written via the site hot path is truncated +
/// header/body/SQL-param redacted before it hits both the primary SQLite
/// writer AND the ring fallback. The parameter is optional (defaults to
/// no filtering) so the long tail of test composition roots that don't
/// care about the filter need no change — the production
/// <see cref="ServiceCollectionExtensions.AddAuditLog"/> registration
/// always passes the real filter through.
/// </summary>
/// <param name="primary">The primary audit writer (typically the SQLite writer).</param>
/// <param name="ring">Drop-oldest ring buffer used to stash events when the primary fails.</param>
/// <param name="failureCounter">Counter incremented on each primary failure for health reporting.</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <param name="filter">Optional payload filter applied before writing; null means no filtering.</param>
public FallbackAuditWriter(
IAuditWriter primary,
RingBufferFallback ring,
IAuditWriteFailureCounter failureCounter,
ILogger<FallbackAuditWriter> logger,
IAuditPayloadFilter? filter = null)
{
_primary = primary ?? throw new ArgumentNullException(nameof(primary));
_ring = ring ?? throw new ArgumentNullException(nameof(ring));
_failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
// AuditLog-008: never default to a null filter — over-redact instead.
// SafeDefaultAuditPayloadFilter.Instance performs HTTP header
// redaction with the hard-coded sensitive defaults (Authorization,
// X-Api-Key, Cookie, Set-Cookie) so a test composition root that
// doesn't bind the real options never persists those headers
// verbatim. The real DefaultAuditPayloadFilter (truncation + body /
// SQL-param redaction) is wired by AddAuditLog and takes precedence.
_filter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
}
/// <inheritdoc />
public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(evt);
// Filter once, up-front. The filtered event flows BOTH to the primary
// and (on failure) to the ring buffer — so a primary outage that
// drains later still hands the SqliteAuditWriter a row that has
// already been truncated and redacted. The filter contract is
// "MUST NOT throw". AuditLog-008: _filter is now non-null (defaults
// to SafeDefaultAuditPayloadFilter so header redaction is always
// applied even in composition roots that don't wire the real filter).
var filtered = _filter.Apply(evt);
try
{
await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
}
catch (Exception ex)
{
// Primary down: record the failure, stash in the ring, return
// success to the caller. Audit-write failures NEVER abort the
// user-facing action (alog.md §7). DO NOT attempt the ring drain
// here — primary is throwing, draining would just scramble FIFO
// order across re-enqueues.
_failureCounter.Increment();
_logger.LogWarning(ex,
"Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
filtered.EventId);
// Ring stores the filtered copy so the eventual drain replays a
// payload that has already been capped/redacted — no second
// filter pass needed on recovery, and no risk of the ring
// holding the raw oversized blob in memory.
_ring.TryEnqueue(filtered);
return;
}
// Primary succeeded — opportunistically drain anything that piled up
// in the ring during the outage. Best-effort: a failure during the
// drain re-enqueues the popped event and is logged; the next
// successful write will retry. Drain order in the audit log is
// therefore: <triggering event>, <backlog FIFO>.
if (_ring.Count > 0)
{
await TryDrainRingAsync(ct).ConfigureAwait(false);
}
}
private async Task TryDrainRingAsync(CancellationToken ct)
{
// Serialise drains so two concurrent recoveries don't double-replay.
if (!await _drainGate.WaitAsync(0, ct).ConfigureAwait(false))
{
return;
}
try
{
// Pull only what is currently buffered; do NOT wait for new events.
// We iterate with a snapshot of Count so we never starve under
// concurrent enqueues.
var pending = _ring.Count;
for (var i = 0; i < pending; i++)
{
if (!_ring.TryDequeue(out var queued))
{
break;
}
try
{
await _primary.WriteAsync(queued, ct).ConfigureAwait(false);
}
catch (Exception ex)
{
// Primary fell over again. Put the event back at the head
// of the queue is impossible with Channel<T>; route to the
// tail (drop-oldest preserves the most-recent picture).
_failureCounter.Increment();
_logger.LogWarning(ex,
"Ring drain re-throw on EventId {EventId}; re-enqueuing.",
queued.EventId);
_ring.TryEnqueue(queued);
break;
}
}
}
finally
{
_drainGate.Release();
}
}
}
@@ -0,0 +1,50 @@
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Audit Log (#23) M5 Bundle C — bridges
/// <see cref="IAuditRedactionFailureCounter"/> (incremented by
/// <see cref="DefaultAuditPayloadFilter"/> every time a header / body / SQL
/// parameter redactor stage throws and the filter has to over-redact the
/// offending field) into <see cref="ISiteHealthCollector"/> so the count
/// surfaces in the site health report payload as
/// <c>SiteHealthReport.AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
/// <para>
/// Registered by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>;
/// callers must register <c>AddHealthMonitoring()</c> first so
/// <see cref="ISiteHealthCollector"/> resolves. The default <see cref="ServiceCollectionExtensions.AddAuditLog"/>
/// registration keeps <see cref="NoOpAuditRedactionFailureCounter"/> for nodes
/// where Site Health Monitoring is not wired (the silent-sink contract —
/// redaction failures must NEVER abort the user-facing action, alog.md §7).
/// </para>
/// <para>
/// Mirrors the M2 Bundle G <see cref="HealthMetricsAuditWriteFailureCounter"/>
/// shape one-for-one so the two health-metric bridges age together.
/// </para>
/// <para>
/// Site-side only for M5: the redaction filter also runs on the central
/// writers (CentralAuditWriter + AuditLogIngestActor), but the central
/// health-metric surface that would expose <c>AuditRedactionFailure</c>
/// alongside the existing central counters ships in M6. Until then, the
/// central composition root keeps the NoOp default — the redactions still
/// happen, they just don't get counted into a health report.
/// </para>
/// </remarks>
public sealed class HealthMetricsAuditRedactionFailureCounter : IAuditRedactionFailureCounter
{
private readonly ISiteHealthCollector _collector;
/// <summary>Initializes the counter with the site health collector it bridges into.</summary>
/// <param name="collector">The site health collector that receives the incremented redaction-failure count.</param>
public HealthMetricsAuditRedactionFailureCounter(ISiteHealthCollector collector)
{
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
}
/// <inheritdoc/>
public void Increment() => _collector.IncrementAuditRedactionFailure();
}
@@ -0,0 +1,37 @@
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Audit Log (#23) M2 Bundle G — bridges <see cref="IAuditWriteFailureCounter"/>
/// (incremented by <see cref="FallbackAuditWriter"/> every time the primary
/// SQLite writer throws) into <see cref="ISiteHealthCollector"/> so the count
/// surfaces in the site health report payload as
/// <c>SiteHealthReport.SiteAuditWriteFailures</c>.
/// </summary>
/// <remarks>
/// <para>
/// Registered by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>;
/// callers must register <c>AddHealthMonitoring()</c> first so
/// <see cref="ISiteHealthCollector"/> resolves. The default <see cref="AddAuditLog"/>
/// registration keeps <see cref="NoOpAuditWriteFailureCounter"/> for nodes
/// where Site Health Monitoring is not wired (the silent-sink contract — audit
/// write failures must NEVER abort the user-facing action, alog.md §7).
/// </para>
/// </remarks>
public sealed class HealthMetricsAuditWriteFailureCounter : IAuditWriteFailureCounter
{
private readonly ISiteHealthCollector _collector;
/// <summary>
/// Initializes a new <see cref="HealthMetricsAuditWriteFailureCounter"/> backed by the given health collector.
/// </summary>
/// <param name="collector">The site health collector to increment on each audit write failure.</param>
public HealthMetricsAuditWriteFailureCounter(ISiteHealthCollector collector)
{
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
}
/// <inheritdoc/>
public void Increment() => _collector.IncrementSiteAuditWriteFailures();
}
@@ -0,0 +1,14 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Lightweight counter sink invoked by <see cref="FallbackAuditWriter"/> every
/// time the primary <see cref="SqliteAuditWriter"/> throws on an audit write.
/// Bundle G (M2-T11) implements this as a thread-safe Interlocked counter
/// bridged into the Site Health Monitoring report payload as
/// <c>SiteAuditWriteFailures</c>.
/// </summary>
public interface IAuditWriteFailureCounter
{
/// <summary>Increment the audit-write failure counter by one.</summary>
void Increment();
}
@@ -0,0 +1,25 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Default <see cref="IAuditWriteFailureCounter"/> registered by
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.ServiceCollectionExtensions.AddAuditLog"/> on
/// every node. Bundle G replaces this binding with a real counter that bridges
/// into the Site Health Monitoring report payload as
/// <c>SiteAuditWriteFailures</c> — until then,
/// <see cref="FallbackAuditWriter"/> emits to a silent sink rather than NRE-ing
/// on a null collaborator.
/// </summary>
/// <remarks>
/// Audit-write failures must NEVER abort the user-facing action (alog.md §7),
/// so the counter is best-effort by contract. A NoOp default is the correct
/// safe fallback while the health metric is being wired in.
/// </remarks>
public sealed class NoOpAuditWriteFailureCounter : IAuditWriteFailureCounter
{
/// <inheritdoc/>
public void Increment()
{
// Intentionally empty. Bundle G overrides this binding with the real
// counter once Site Health Monitoring is wired.
}
}
@@ -0,0 +1,122 @@
using System.Runtime.CompilerServices;
using System.Threading.Channels;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Drop-oldest in-memory ring buffer used by <see cref="FallbackAuditWriter"/>
/// when the primary SQLite writer is throwing. Capacity is fixed at construction
/// (default 1024). When full, the oldest event is silently dropped to make room
/// for the newest — preserving the most recent picture of activity in the face
/// of an extended SQLite outage — and <see cref="RingBufferOverflowed"/> is
/// raised so a health counter can record the loss.
/// </summary>
/// <remarks>
/// <para>
/// Backed by a <see cref="Channel{T}"/> with
/// <see cref="BoundedChannelFullMode.DropOldest"/>. The channel doesn't natively
/// notify on drop, so this class compares <c>Reader.Count</c> before and after
/// each enqueue: any time we hit capacity and a subsequent enqueue keeps the
/// count at capacity, exactly one event has been dropped.
/// </para>
/// <para>
/// Per the M2 plan: the ring is the absolute-last-resort buffer for the
/// hot-path; it is NOT a substitute for the bounded
/// <see cref="SqliteAuditWriter"/> write queue.
/// </para>
/// </remarks>
public sealed class RingBufferFallback
{
private readonly Channel<AuditEvent> _channel;
private readonly int _capacity;
/// <summary>
/// Raised once each time a drop-oldest overflow occurs. Hooked by
/// <see cref="FallbackAuditWriter"/>'s health counter wiring.
/// </summary>
public event Action? RingBufferOverflowed;
/// <summary>Initializes the ring buffer with the specified fixed capacity.</summary>
/// <param name="capacity">Maximum number of events to buffer; must be greater than zero. Default is 1024.</param>
public RingBufferFallback(int capacity = 1024)
{
if (capacity <= 0)
{
throw new ArgumentOutOfRangeException(nameof(capacity), "capacity must be > 0.");
}
_capacity = capacity;
_channel = Channel.CreateBounded<AuditEvent>(new BoundedChannelOptions(capacity)
{
FullMode = BoundedChannelFullMode.DropOldest,
SingleReader = true,
SingleWriter = false,
});
}
/// <summary>Current event count in the ring (for diagnostics/tests).</summary>
public int Count => _channel.Reader.Count;
/// <summary>
/// Try to enqueue an event. Returns <see langword="true"/> on success (even
/// when an overflow caused an older event to be dropped); returns
/// <see langword="false"/> only when the ring has been
/// <see cref="Complete"/>-d.
/// </summary>
/// <param name="evt">The audit event to enqueue.</param>
/// <returns><see langword="true"/> if enqueued (or enqueued with overflow); <see langword="false"/> when the channel is completed.</returns>
public bool TryEnqueue(AuditEvent evt)
{
ArgumentNullException.ThrowIfNull(evt);
// DropOldest TryWrite always succeeds unless the channel is completed.
// Detect overflow by comparing the count before vs. after: if we were
// already at capacity and remain at capacity, exactly one event was
// dropped to make room for evt.
var beforeCount = _channel.Reader.Count;
if (!_channel.Writer.TryWrite(evt))
{
return false;
}
if (beforeCount >= _capacity)
{
// The new event displaced an existing one.
RingBufferOverflowed?.Invoke();
}
return true;
}
/// <summary>
/// Drain the ring in FIFO order. Yields available events immediately and
/// then completes when the channel is empty AND <see cref="Complete"/> has
/// been called. Callers that only want to drain what's currently buffered
/// must call <see cref="Complete"/> first.
/// </summary>
/// <param name="cancellationToken">Cancellation token to abort the async enumeration.</param>
public async IAsyncEnumerable<AuditEvent> DrainAsync(
[EnumeratorCancellation] CancellationToken cancellationToken)
{
await foreach (var evt in _channel.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
{
yield return evt;
}
}
/// <summary>
/// Non-blocking single-item dequeue used by the
/// <see cref="FallbackAuditWriter"/> recovery path. Returns
/// <see langword="false"/> when the ring is empty.
/// </summary>
/// <param name="evt">When this returns <see langword="true"/>, contains the dequeued event.</param>
/// <returns><see langword="true"/> if an event was dequeued; <see langword="false"/> if the ring is empty.</returns>
public bool TryDequeue(out AuditEvent evt) => _channel.Reader.TryRead(out evt!);
/// <summary>
/// Mark the ring as no-more-writes. <see cref="DrainAsync"/> will yield the
/// remaining events and then complete.
/// </summary>
public void Complete() => _channel.Writer.TryComplete();
}
@@ -0,0 +1,138 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that
/// periodically pulls a backlog snapshot from <see cref="ISiteAuditQueue"/>
/// and pushes it into <see cref="ISiteHealthCollector"/> so the next
/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
/// <c>SiteAuditBacklog</c> field on the site health report.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not the report sender.</b> Querying SQLite for the
/// backlog requires the queue's write lock; doing it inline in
/// <see cref="ISiteHealthCollector.CollectReport"/> would couple the collector
/// to <see cref="ISiteAuditQueue"/> and turn an in-memory snapshot read into
/// a synchronous I/O call on the report path. The hosted-service pattern keeps
/// the report path pure and the SQL probe off the report timing budget.
/// </para>
/// <para>
/// <b>Cadence.</b> 30 s by default — coarse enough to amortise the SQL probe
/// across many reports, fine enough that the central dashboard never lags by
/// more than one health-report interval. Tunable via
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.SqliteAuditWriterOptions"/> in a follow-up
/// if ops needs a different cadence; for M6 we hard-code the value because the
/// brief calls it out explicitly.
/// </para>
/// <para>
/// <b>Failure containment.</b> The probe call is wrapped in a try/catch so a
/// transient SQLite error never tears down the hosted service — the next tick
/// retries. Mirrors <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPartitionMaintenanceService"/>'s
/// "exception logged, not propagated" contract.
/// </para>
/// </remarks>
public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
{
/// <summary>
/// Default poll cadence. Half a typical 60 s health-report interval keeps
/// the snapshot fresh without spinning the SQL probe more often than
/// necessary.
/// </summary>
internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
private readonly ISiteAuditQueue _queue;
private readonly ISiteHealthCollector _collector;
private readonly ILogger<SiteAuditBacklogReporter> _logger;
private readonly TimeSpan _refreshInterval;
private CancellationTokenSource? _cts;
private Task? _loop;
/// <summary>Initializes a new instance of <see cref="SiteAuditBacklogReporter"/>.</summary>
/// <param name="queue">The site audit queue used to probe the backlog count.</param>
/// <param name="collector">The site health collector that receives the backlog snapshot.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="refreshInterval">Poll interval override; defaults to <see cref="DefaultRefreshInterval"/> (30 s).</param>
public SiteAuditBacklogReporter(
ISiteAuditQueue queue,
ISiteHealthCollector collector,
ILogger<SiteAuditBacklogReporter> logger,
TimeSpan? refreshInterval = null)
{
_queue = queue ?? throw new ArgumentNullException(nameof(queue));
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_refreshInterval = refreshInterval ?? DefaultRefreshInterval;
}
/// <inheritdoc />
public Task StartAsync(CancellationToken ct)
{
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
// token both terminate the loop; either side firing aborts the
// pending Task.Delay.
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
return Task.CompletedTask;
}
private async Task RunLoopAsync(CancellationToken ct)
{
// First tick runs immediately so the very first health report after
// process start carries a real backlog snapshot — without this the
// dashboard would show null for the first 30 s after a deploy.
await SafeProbeAsync(ct).ConfigureAwait(false);
while (!ct.IsCancellationRequested)
{
try
{
await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
}
catch (OperationCanceledException)
{
break;
}
await SafeProbeAsync(ct).ConfigureAwait(false);
}
}
private async Task SafeProbeAsync(CancellationToken ct)
{
try
{
var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false);
_collector.UpdateSiteAuditBacklog(snapshot);
}
catch (OperationCanceledException)
{
// Shutdown — let the outer loop exit cleanly.
throw;
}
catch (Exception ex)
{
// Catch-all is deliberate: the hosted service must survive every
// class of probe failure (transient SQLite lock contention, disk
// I/O hiccup, …) so the next tick gets a chance.
_logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry.");
}
}
/// <inheritdoc />
public Task StopAsync(CancellationToken ct)
{
_cts?.Cancel();
return _loop ?? Task.CompletedTask;
}
/// <inheritdoc />
public void Dispose()
{
_cts?.Dispose();
}
}
@@ -0,0 +1,913 @@
using System.Threading.Channels;
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Site-side SQLite hot-path writer for Audit Log (#23) events. Mirrors the
/// <see cref="ZB.MOM.WW.ScadaBridge.SiteEventLogging.SiteEventLogger"/> design — a single
/// owned <see cref="SqliteConnection"/> serialised behind a write lock, fed by a
/// bounded <see cref="Channel{T}"/> drained on a dedicated background writer
/// task — so script-thread callers never block on disk I/O.
/// </summary>
/// <remarks>
/// <para>
/// The schema is bootstrapped in the constructor (Bundle B-T1). The
/// Channel-based <see cref="WriteAsync"/> hot-path + Bundle D
/// <see cref="ReadPendingAsync"/> / <see cref="MarkForwardedAsync"/> support
/// surface are wired in Bundle B-T2.
/// </para>
/// <para>
/// Site rows always carry <see cref="AuditForwardState.Pending"/> on first
/// insert; the central row-shape's <c>IngestedAtUtc</c> column does NOT live in
/// the site SQLite schema — central stamps it on ingest.
/// </para>
/// </remarks>
public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable, IDisposable
{
// Microsoft.Data.Sqlite reports a generic SQLITE_CONSTRAINT (error code 19)
// on a PRIMARY KEY violation; the extended subcode 1555 (SQLITE_CONSTRAINT_PRIMARYKEY)
// is exposed via SqliteException.SqliteExtendedErrorCode but isn't reliably
// surfaced across all SQLite builds. We treat any constraint error on insert
// as a duplicate-eventid race and swallow it (first-write-wins) — the index
// on EventId is the only constraint on this table, so this scope is precise.
private const int SqliteErrorConstraint = 19;
private readonly SqliteConnection _connection;
// AuditLog-005: dedicated read-only connection used by GetBacklogStatsAsync,
// ReadPendingAsync, ReadPendingSinceAsync, and ReadForwardedAsync so a slow
// backlog scan (COUNT(*) over hundreds of thousands of Pending rows under a
// central outage) never parks the hot-path writer behind _writeLock.
// SQLite-with-WAL allows a second connection on the same file to read
// concurrently with the writer; the writer's WAL pragma is set in
// InitializeSchema before this connection is opened. The reader connection
// has its own _readLock because SqliteConnection itself is not thread-safe
// even in read-only mode — multiple read callers can otherwise interleave
// commands on the shared connection.
private readonly SqliteConnection _readConnection;
private readonly object _readLock = new();
private readonly SqliteAuditWriterOptions _options;
private readonly ILogger<SqliteAuditWriter> _logger;
private readonly INodeIdentityProvider _nodeIdentity;
private readonly object _writeLock = new();
private readonly Channel<PendingAuditEvent> _writeQueue;
private readonly Task _writerLoop;
private bool _disposed;
/// <summary>Initializes a new instance of the SqliteAuditWriter class.</summary>
/// <param name="options">Configuration options for the audit writer.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="nodeIdentity">Node identity provider.</param>
/// <param name="connectionStringOverride">Optional connection string override.</param>
public SqliteAuditWriter(
IOptions<SqliteAuditWriterOptions> options,
ILogger<SqliteAuditWriter> logger,
INodeIdentityProvider nodeIdentity,
string? connectionStringOverride = null)
{
ArgumentNullException.ThrowIfNull(options);
ArgumentNullException.ThrowIfNull(logger);
ArgumentNullException.ThrowIfNull(nodeIdentity);
_options = options.Value;
_logger = logger;
_nodeIdentity = nodeIdentity;
var connectionString = connectionStringOverride
?? $"Data Source={_options.DatabasePath};Cache=Shared";
_connection = new SqliteConnection(connectionString);
_connection.Open();
InitializeSchema();
// AuditLog-005: open a second connection for read-only callers
// (GetBacklogStatsAsync, ReadPendingAsync, ReadPendingSinceAsync,
// ReadForwardedAsync). InitializeSchema set journal_mode=WAL on the
// writer connection, which is a database-level setting that persists
// for the file — subsequent connections to the same file see WAL and
// can read concurrently with the writer without taking _writeLock.
// Reuse the same connection string so the read connection sees the
// same Data Source / Cache settings as the writer.
_readConnection = new SqliteConnection(connectionString);
_readConnection.Open();
_writeQueue = Channel.CreateBounded<PendingAuditEvent>(
new BoundedChannelOptions(_options.ChannelCapacity)
{
// The hot-path enqueue must back-pressure if the background
// writer falls behind; a higher-level fallback (Bundle B-T4)
// handles truly catastrophic primary failure with a drop-oldest
// ring buffer.
FullMode = BoundedChannelFullMode.Wait,
SingleReader = true,
SingleWriter = false,
});
_writerLoop = Task.Run(ProcessWriteQueueAsync);
}
private void InitializeSchema()
{
// auto_vacuum must be set before any table is created for it to take
// effect on a fresh database. INCREMENTAL lets a future
// `PRAGMA incremental_vacuum` shrink the file after the 7-day retention
// purge — see alog.md §10.
using (var pragmaCmd = _connection.CreateCommand())
{
pragmaCmd.CommandText = "PRAGMA auto_vacuum = INCREMENTAL";
pragmaCmd.ExecuteNonQuery();
}
// AuditLog-005: enable WAL so a second connection on the same file can
// serve read-only callers (GetBacklogStatsAsync, ReadPendingAsync,
// ReadPendingSinceAsync, ReadForwardedAsync) concurrently with the
// batched writer, decoupling those reads from _writeLock. WAL is a
// database-level setting persisted in the file header; setting it on
// the writer connection means every connection opened to the file
// afterwards inherits WAL behaviour. PRAGMA journal_mode returns the
// mode actually adopted ("memory" for ":memory:" / shared-cache memory
// mode, "wal" for file-backed) — we don't error if WAL was rejected
// because the read connection's correctness does not depend on WAL
// itself, only its concurrency advantage does.
using (var pragmaCmd = _connection.CreateCommand())
{
pragmaCmd.CommandText = "PRAGMA journal_mode = WAL";
pragmaCmd.ExecuteNonQuery();
}
using var cmd = _connection.CreateCommand();
cmd.CommandText = """
CREATE TABLE IF NOT EXISTS AuditLog (
EventId TEXT NOT NULL,
OccurredAtUtc TEXT NOT NULL,
Channel TEXT NOT NULL,
Kind TEXT NOT NULL,
CorrelationId TEXT NULL,
SourceSiteId TEXT NULL,
SourceNode TEXT NULL,
SourceInstanceId TEXT NULL,
SourceScript TEXT NULL,
Actor TEXT NULL,
Target TEXT NULL,
Status TEXT NOT NULL,
HttpStatus INTEGER NULL,
DurationMs INTEGER NULL,
ErrorMessage TEXT NULL,
ErrorDetail TEXT NULL,
RequestSummary TEXT NULL,
ResponseSummary TEXT NULL,
PayloadTruncated INTEGER NOT NULL,
Extra TEXT NULL,
ForwardState TEXT NOT NULL,
ExecutionId TEXT NULL,
ParentExecutionId TEXT NULL,
PRIMARY KEY (EventId)
);
CREATE INDEX IF NOT EXISTS IX_SiteAuditLog_ForwardState_Occurred
ON AuditLog (ForwardState, OccurredAtUtc);
""";
cmd.ExecuteNonQuery();
// Audit Log #23 (ExecutionId): additively add the ExecutionId column.
// CREATE TABLE IF NOT EXISTS above does NOT add columns to an AuditLog
// table that already exists from a pre-ExecutionId build, so an
// auditlog.db created by an older build needs the column ALTER-ed in.
// The file is durable across restart/failover by design (7-day
// retention), so without this step every WriteAsync on an upgraded
// deployment would bind $ExecutionId against a missing column and the
// best-effort write path would silently drop every site audit row.
// SQLite has no "ADD COLUMN IF NOT EXISTS"; the column presence is
// probed first and the ALTER skipped when already there. The column is
// nullable with no default, so any row written before this migration
// reads back ExecutionId = null (back-compat).
AddColumnIfMissing("ExecutionId", "TEXT NULL");
// Audit Log #23 (ParentExecutionId): same idempotent upgrade path as
// ExecutionId above. A deployment that already ran the ExecutionId
// branch has an auditlog.db with the 21-column schema and no
// ParentExecutionId column; CREATE TABLE IF NOT EXISTS cannot add it,
// so it is ALTER-ed in here. Nullable with no default — rows written
// before this migration read back ParentExecutionId = null.
AddColumnIfMissing("ParentExecutionId", "TEXT NULL");
// SourceNode stamping: same idempotent upgrade path as ExecutionId /
// ParentExecutionId above. A deployment that already ran the
// ParentExecutionId branch has an auditlog.db with the 22-column
// schema and no SourceNode column; CREATE TABLE IF NOT EXISTS cannot
// add it, so it is ALTER-ed in here. Nullable with no default — rows
// written before this migration read back SourceNode = null.
AddColumnIfMissing("SourceNode", "TEXT NULL");
}
/// <summary>
/// Audit Log #23: additively adds a column to <c>AuditLog</c> only when
/// it is not already present (used for <c>ExecutionId</c> and
/// <c>ParentExecutionId</c>). SQLite lacks <c>ADD COLUMN IF NOT EXISTS</c>,
/// so the schema is probed via <c>PRAGMA table_info</c> first. Idempotent —
/// safe to run on every <see cref="InitializeSchema"/>. Mirrors
/// <c>StoreAndForwardStorage.AddColumnIfMissingAsync</c>; kept synchronous
/// here to match the rest of this writer's bootstrap DDL.
/// </summary>
private void AddColumnIfMissing(string columnName, string columnDefinition)
{
using var probe = _connection.CreateCommand();
probe.CommandText = "SELECT COUNT(*) FROM pragma_table_info('AuditLog') WHERE name = $name";
probe.Parameters.AddWithValue("$name", columnName);
var exists = Convert.ToInt32(probe.ExecuteScalar()) > 0;
if (exists)
{
return;
}
using var alter = _connection.CreateCommand();
// Column name + definition are caller-controlled constants, never user
// input — safe to interpolate (parameters are not permitted in DDL).
alter.CommandText = $"ALTER TABLE AuditLog ADD COLUMN {columnName} {columnDefinition}";
alter.ExecuteNonQuery();
}
/// <inheritdoc />
public Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(evt);
// Site rows always carry a non-null ForwardState; central rows leave it
// null. Force Pending on enqueue so callers can pass a bare AuditEvent
// without thinking about site-vs-central provenance.
var siteEvt = evt.ForwardState is null
? evt with { ForwardState = AuditForwardState.Pending }
: evt;
var pending = new PendingAuditEvent(siteEvt);
// CreateBounded(FullMode=Wait) means WriteAsync will await room rather
// than throw when full — exactly the hot-path back-pressure semantics
// we want.
if (!_writeQueue.Writer.TryWrite(pending))
{
// The writer is either completed (logger disposed) or the channel
// is at capacity. Fall back to the async path which honours the
// FullMode=Wait policy.
return WriteSlowPathAsync(pending, ct);
}
return pending.Completion.Task;
}
private async Task WriteSlowPathAsync(PendingAuditEvent pending, CancellationToken ct)
{
try
{
await _writeQueue.Writer.WriteAsync(pending, ct).ConfigureAwait(false);
}
catch (ChannelClosedException)
{
pending.Completion.TrySetException(
new ObjectDisposedException(nameof(SqliteAuditWriter),
"Event could not be recorded: the audit writer has been disposed."));
}
await pending.Completion.Task.ConfigureAwait(false);
}
private async Task ProcessWriteQueueAsync()
{
var batch = new List<PendingAuditEvent>(_options.BatchSize);
// ReadAllAsync completes when the channel is marked complete (Dispose).
await foreach (var first in _writeQueue.Reader.ReadAllAsync().ConfigureAwait(false))
{
batch.Clear();
batch.Add(first);
// Pull additional ready events up to BatchSize. TryRead is non-
// blocking and lets us amortise the transaction overhead across a
// burst of concurrent enqueues.
while (batch.Count < _options.BatchSize &&
_writeQueue.Reader.TryRead(out var next))
{
batch.Add(next);
}
FlushBatch(batch);
}
}
private void FlushBatch(IReadOnlyList<PendingAuditEvent> batch)
{
lock (_writeLock)
{
if (_disposed)
{
foreach (var pending in batch)
{
pending.Completion.TrySetException(
new ObjectDisposedException(nameof(SqliteAuditWriter),
"Event could not be recorded: the audit writer was disposed before the write completed."));
}
return;
}
using var transaction = _connection.BeginTransaction();
try
{
using var cmd = _connection.CreateCommand();
cmd.Transaction = transaction;
cmd.CommandText = """
INSERT INTO AuditLog (
EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
) VALUES (
$EventId, $OccurredAtUtc, $Channel, $Kind, $CorrelationId,
$SourceSiteId, $SourceNode, $SourceInstanceId, $SourceScript, $Actor, $Target,
$Status, $HttpStatus, $DurationMs, $ErrorMessage, $ErrorDetail,
$RequestSummary, $ResponseSummary, $PayloadTruncated, $Extra, $ForwardState,
$ExecutionId, $ParentExecutionId
);
""";
var pEventId = cmd.Parameters.Add("$EventId", SqliteType.Text);
var pOccurredAt = cmd.Parameters.Add("$OccurredAtUtc", SqliteType.Text);
var pChannel = cmd.Parameters.Add("$Channel", SqliteType.Text);
var pKind = cmd.Parameters.Add("$Kind", SqliteType.Text);
var pCorrelationId = cmd.Parameters.Add("$CorrelationId", SqliteType.Text);
var pSourceSiteId = cmd.Parameters.Add("$SourceSiteId", SqliteType.Text);
var pSourceNode = cmd.Parameters.Add("$SourceNode", SqliteType.Text);
var pSourceInstanceId = cmd.Parameters.Add("$SourceInstanceId", SqliteType.Text);
var pSourceScript = cmd.Parameters.Add("$SourceScript", SqliteType.Text);
var pActor = cmd.Parameters.Add("$Actor", SqliteType.Text);
var pTarget = cmd.Parameters.Add("$Target", SqliteType.Text);
var pStatus = cmd.Parameters.Add("$Status", SqliteType.Text);
var pHttpStatus = cmd.Parameters.Add("$HttpStatus", SqliteType.Integer);
var pDurationMs = cmd.Parameters.Add("$DurationMs", SqliteType.Integer);
var pErrorMessage = cmd.Parameters.Add("$ErrorMessage", SqliteType.Text);
var pErrorDetail = cmd.Parameters.Add("$ErrorDetail", SqliteType.Text);
var pRequestSummary = cmd.Parameters.Add("$RequestSummary", SqliteType.Text);
var pResponseSummary = cmd.Parameters.Add("$ResponseSummary", SqliteType.Text);
var pPayloadTruncated = cmd.Parameters.Add("$PayloadTruncated", SqliteType.Integer);
var pExtra = cmd.Parameters.Add("$Extra", SqliteType.Text);
var pForwardState = cmd.Parameters.Add("$ForwardState", SqliteType.Text);
var pExecutionId = cmd.Parameters.Add("$ExecutionId", SqliteType.Text);
var pParentExecutionId = cmd.Parameters.Add("$ParentExecutionId", SqliteType.Text);
foreach (var pending in batch)
{
var e = pending.Event;
pEventId.Value = e.EventId.ToString();
pOccurredAt.Value = e.OccurredAtUtc.ToString("o");
pChannel.Value = e.Channel.ToString();
pKind.Value = e.Kind.ToString();
pCorrelationId.Value = (object?)e.CorrelationId?.ToString() ?? DBNull.Value;
pSourceSiteId.Value = (object?)e.SourceSiteId ?? DBNull.Value;
// SourceNode-stamping: caller-provided value wins (preserves
// rows reconciled in from other nodes via the same writer);
// otherwise stamp from the local INodeIdentityProvider. The
// event record itself is NOT mutated — stamping is at write
// time only. If the provider also returns null (unconfigured
// node), the row's SourceNode stays NULL — operators see
// "needs config" via the schema, not a magic fallback string.
var sourceNode = e.SourceNode ?? _nodeIdentity.NodeName;
pSourceNode.Value = (object?)sourceNode ?? DBNull.Value;
pSourceInstanceId.Value = (object?)e.SourceInstanceId ?? DBNull.Value;
pSourceScript.Value = (object?)e.SourceScript ?? DBNull.Value;
pActor.Value = (object?)e.Actor ?? DBNull.Value;
pTarget.Value = (object?)e.Target ?? DBNull.Value;
pStatus.Value = e.Status.ToString();
pHttpStatus.Value = (object?)e.HttpStatus ?? DBNull.Value;
pDurationMs.Value = (object?)e.DurationMs ?? DBNull.Value;
pErrorMessage.Value = (object?)e.ErrorMessage ?? DBNull.Value;
pErrorDetail.Value = (object?)e.ErrorDetail ?? DBNull.Value;
pRequestSummary.Value = (object?)e.RequestSummary ?? DBNull.Value;
pResponseSummary.Value = (object?)e.ResponseSummary ?? DBNull.Value;
pPayloadTruncated.Value = e.PayloadTruncated ? 1 : 0;
pExtra.Value = (object?)e.Extra ?? DBNull.Value;
pForwardState.Value = (e.ForwardState ?? AuditForwardState.Pending).ToString();
pExecutionId.Value = (object?)e.ExecutionId?.ToString() ?? DBNull.Value;
pParentExecutionId.Value = (object?)e.ParentExecutionId?.ToString() ?? DBNull.Value;
try
{
cmd.ExecuteNonQuery();
pending.Completion.TrySetResult();
}
catch (SqliteException ex) when (ex.SqliteErrorCode == SqliteErrorConstraint)
{
// Duplicate EventId — first-write-wins (alog.md §11).
// Treat as success: the lifecycle event is durably
// recorded under the first writer's payload.
_logger.LogDebug(ex,
"Duplicate EventId {EventId} swallowed by SqliteAuditWriter",
e.EventId);
pending.Completion.TrySetResult();
}
}
transaction.Commit();
}
catch (Exception ex)
{
transaction.Rollback();
_logger.LogError(ex, "SqliteAuditWriter batch insert failed; faulting {Count} pending events", batch.Count);
foreach (var pending in batch)
{
pending.Completion.TrySetException(ex);
}
}
}
}
// AuditLog-001: cached-lifecycle audit kinds that ride the combined-telemetry
// drain (joined with the operational tracking row + pushed via
// IngestCachedTelemetryAsync into the central dual-write transaction).
// ReadPendingAsync EXCLUDES these so the audit-only drain doesn't double-emit
// them; ReadPendingCachedTelemetryAsync below is the dedicated read surface
// the new SiteAuditTelemetryActor cached-drain uses.
private static readonly string[] CachedTelemetryKindNames =
{
nameof(AuditKind.CachedSubmit),
nameof(AuditKind.ApiCallCached),
nameof(AuditKind.DbWriteCached),
nameof(AuditKind.CachedResolve),
};
/// <inheritdoc />
public Task<IReadOnlyList<AuditEvent>> ReadPendingAsync(int limit, CancellationToken ct = default)
{
if (limit <= 0)
{
throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
}
// AuditLog-005: read via the dedicated _readConnection so this scan
// (which can be expensive when the backlog grows under a central
// outage) does not block the batched writer on _writeLock. WAL mode
// gives us a stable snapshot of the table while writes proceed on the
// writer connection. _readLock serialises this connection across
// multiple concurrent read callers since SqliteConnection itself is
// not thread-safe.
// AuditLog-001: NOT IN ($cached1,$cached2,$cached3,$cached4) excludes the
// cached-lifecycle kinds — they flow through ReadPendingCachedTelemetryAsync
// + the combined-telemetry drain. Kind is stored as the enum's name (see
// FlushBatch's pKind.Value), so a string-IN against the constant kind
// names matches the on-disk shape exactly.
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState = $pending
AND Kind NOT IN ($k0, $k1, $k2, $k3)
ORDER BY OccurredAtUtc ASC, EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
cmd.Parameters.AddWithValue("$k0", CachedTelemetryKindNames[0]);
cmd.Parameters.AddWithValue("$k1", CachedTelemetryKindNames[1]);
cmd.Parameters.AddWithValue("$k2", CachedTelemetryKindNames[2]);
cmd.Parameters.AddWithValue("$k3", CachedTelemetryKindNames[3]);
cmd.Parameters.AddWithValue("$limit", limit);
var rows = new List<AuditEvent>(Math.Min(limit, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
}
}
/// <inheritdoc />
public Task<IReadOnlyList<AuditEvent>> ReadPendingCachedTelemetryAsync(
int limit, CancellationToken ct = default)
{
if (limit <= 0)
{
throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
}
// AuditLog-001: dedicated read surface for the cached-call lifecycle
// drain — symmetric to ReadPendingAsync but filtered to the four
// cached AuditKinds. Same _readConnection + _readLock pattern so the
// hot-path writer is not contended.
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState = $pending
AND Kind IN ($k0, $k1, $k2, $k3)
ORDER BY OccurredAtUtc ASC, EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
cmd.Parameters.AddWithValue("$k0", CachedTelemetryKindNames[0]);
cmd.Parameters.AddWithValue("$k1", CachedTelemetryKindNames[1]);
cmd.Parameters.AddWithValue("$k2", CachedTelemetryKindNames[2]);
cmd.Parameters.AddWithValue("$k3", CachedTelemetryKindNames[3]);
cmd.Parameters.AddWithValue("$limit", limit);
var rows = new List<AuditEvent>(Math.Min(limit, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
}
}
/// <summary>
/// Returns up to <paramref name="limit"/> rows in
/// <see cref="AuditForwardState.Forwarded"/>, oldest
/// <see cref="AuditEvent.OccurredAtUtc"/> first, with
/// <see cref="AuditEvent.EventId"/> as the deterministic tiebreaker. The
/// <see cref="AuditForwardState.Forwarded"/>-specific counterpart of
/// <see cref="ReadPendingAsync"/>; used by tests to assert a row reached the
/// <see cref="AuditForwardState.Forwarded"/> state specifically (unlike
/// <see cref="ReadPendingSinceAsync"/>, which also returns
/// <see cref="AuditForwardState.Pending"/> rows).
/// </summary>
/// <param name="limit">Maximum number of rows to return.</param>
/// <param name="ct">Cancellation token.</param>
public Task<IReadOnlyList<AuditEvent>> ReadForwardedAsync(int limit, CancellationToken ct = default)
{
if (limit <= 0)
{
throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
}
// AuditLog-005: mirror ReadPendingAsync — read via _readConnection /
// _readLock so this query never contends with the batched writer on
// _writeLock.
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState = $forwarded
ORDER BY OccurredAtUtc ASC, EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
cmd.Parameters.AddWithValue("$limit", limit);
var rows = new List<AuditEvent>(Math.Min(limit, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
}
}
/// <inheritdoc />
public Task MarkForwardedAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(eventIds);
if (eventIds.Count == 0)
{
return Task.CompletedTask;
}
lock (_writeLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _connection.CreateCommand();
// Build a single IN (...) parameter list so we issue one UPDATE per
// batch regardless of size. Each id is bound as its own parameter,
// so no string concatenation of user data ever enters the SQL.
var sb = new System.Text.StringBuilder();
sb.Append("UPDATE AuditLog SET ForwardState = $forwarded WHERE EventId IN (");
for (int i = 0; i < eventIds.Count; i++)
{
if (i > 0) sb.Append(',');
var p = $"$id{i}";
sb.Append(p);
cmd.Parameters.AddWithValue(p, eventIds[i].ToString());
}
sb.Append(");");
cmd.CommandText = sb.ToString();
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
cmd.ExecuteNonQuery();
return Task.CompletedTask;
}
}
/// <inheritdoc />
public Task<IReadOnlyList<AuditEvent>> ReadPendingSinceAsync(
DateTime sinceUtc, int batchSize, CancellationToken ct = default)
{
if (batchSize <= 0)
{
throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be > 0.");
}
// AuditLog-005: read via _readConnection / _readLock — same lock-
// decoupling as ReadPendingAsync.
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState IN ($pending, $forwarded)
AND OccurredAtUtc >= $since
ORDER BY OccurredAtUtc ASC, EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
// Normalise to UTC ISO-8601 round-trip format to match how OccurredAtUtc
// is stored on insert ("o" format) — string comparison is monotonic for
// that encoding so we can index-scan against it.
cmd.Parameters.AddWithValue("$since", EnsureUtc(sinceUtc).ToString(
"o", System.Globalization.CultureInfo.InvariantCulture));
cmd.Parameters.AddWithValue("$limit", batchSize);
var rows = new List<AuditEvent>(Math.Min(batchSize, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
}
}
/// <inheritdoc />
public Task MarkReconciledAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(eventIds);
if (eventIds.Count == 0)
{
return Task.CompletedTask;
}
lock (_writeLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _connection.CreateCommand();
var sb = new System.Text.StringBuilder();
sb.Append("UPDATE AuditLog SET ForwardState = $reconciled ")
.Append("WHERE ForwardState IN ($pending, $forwarded) AND EventId IN (");
for (int i = 0; i < eventIds.Count; i++)
{
if (i > 0) sb.Append(',');
var p = $"$id{i}";
sb.Append(p);
cmd.Parameters.AddWithValue(p, eventIds[i].ToString());
}
sb.Append(");");
cmd.CommandText = sb.ToString();
cmd.Parameters.AddWithValue("$reconciled", AuditForwardState.Reconciled.ToString());
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
cmd.ExecuteNonQuery();
return Task.CompletedTask;
}
}
/// <inheritdoc />
public Task<SiteAuditBacklogSnapshot> GetBacklogStatsAsync(CancellationToken ct = default)
{
int pendingCount;
DateTime? oldestPending;
// AuditLog-005: read via the dedicated _readConnection (under
// _readLock) so this probe — polled every 30 s by SiteAuditBacklogReporter
// — never blocks the batched hot-path writer on _writeLock. Under a
// central outage the Pending backlog can grow to hundreds of thousands
// of rows and the COUNT(*) scan correspondingly stretches; that no
// longer adds tail latency to user-facing audit writes.
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
// Single round-trip — COUNT(*) + MIN(OccurredAtUtc) over the same
// index range avoids a second scan. The IX_SiteAuditLog_ForwardState_Occurred
// index makes both aggregates cheap (count is a covering scan, min
// is the first key).
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT COUNT(*), MIN(OccurredAtUtc)
FROM AuditLog
WHERE ForwardState = $pending;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
using var reader = cmd.ExecuteReader();
reader.Read();
pendingCount = reader.GetInt32(0);
oldestPending = reader.IsDBNull(1)
? null
: DateTime.Parse(reader.GetString(1),
System.Globalization.CultureInfo.InvariantCulture,
System.Globalization.DateTimeStyles.RoundtripKind);
}
// File-size lookup outside the lock — the DatabasePath option is the
// canonical source. The connection-string-override branch (used by
// some tests) keeps the same DatabasePath value, so this works
// uniformly. In-memory / mode=memory paths return 0 because the file
// doesn't exist on disk.
long onDiskBytes = 0;
try
{
if (!string.IsNullOrEmpty(_options.DatabasePath) &&
!_options.DatabasePath.StartsWith(":memory:", StringComparison.Ordinal) &&
!_options.DatabasePath.Contains("mode=memory", StringComparison.OrdinalIgnoreCase) &&
File.Exists(_options.DatabasePath))
{
onDiskBytes = new FileInfo(_options.DatabasePath).Length;
}
}
catch (Exception ex)
{
// File system probe is a best-effort health-metric — never abort
// a backlog snapshot because stat() failed. Log and report 0.
_logger.LogDebug(ex,
"SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.",
_options.DatabasePath);
}
return Task.FromResult(new SiteAuditBacklogSnapshot(
PendingCount: pendingCount,
OldestPendingUtc: oldestPending,
OnDiskBytes: onDiskBytes));
}
private static DateTime EnsureUtc(DateTime value) =>
value.Kind == DateTimeKind.Utc
? value
: DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc);
private static AuditEvent MapRow(SqliteDataReader reader)
{
return new AuditEvent
{
EventId = Guid.Parse(reader.GetString(0)),
OccurredAtUtc = DateTime.Parse(reader.GetString(1),
System.Globalization.CultureInfo.InvariantCulture,
System.Globalization.DateTimeStyles.RoundtripKind),
Channel = Enum.Parse<AuditChannel>(reader.GetString(2)),
Kind = Enum.Parse<AuditKind>(reader.GetString(3)),
CorrelationId = reader.IsDBNull(4) ? null : Guid.Parse(reader.GetString(4)),
SourceSiteId = reader.IsDBNull(5) ? null : reader.GetString(5),
SourceNode = reader.IsDBNull(6) ? null : reader.GetString(6),
SourceInstanceId = reader.IsDBNull(7) ? null : reader.GetString(7),
SourceScript = reader.IsDBNull(8) ? null : reader.GetString(8),
Actor = reader.IsDBNull(9) ? null : reader.GetString(9),
Target = reader.IsDBNull(10) ? null : reader.GetString(10),
Status = Enum.Parse<AuditStatus>(reader.GetString(11)),
HttpStatus = reader.IsDBNull(12) ? null : reader.GetInt32(12),
DurationMs = reader.IsDBNull(13) ? null : reader.GetInt32(13),
ErrorMessage = reader.IsDBNull(14) ? null : reader.GetString(14),
ErrorDetail = reader.IsDBNull(15) ? null : reader.GetString(15),
RequestSummary = reader.IsDBNull(16) ? null : reader.GetString(16),
ResponseSummary = reader.IsDBNull(17) ? null : reader.GetString(17),
PayloadTruncated = reader.GetInt32(18) != 0,
Extra = reader.IsDBNull(19) ? null : reader.GetString(19),
ForwardState = Enum.Parse<AuditForwardState>(reader.GetString(20)),
ExecutionId = reader.IsDBNull(21) ? null : Guid.Parse(reader.GetString(21)),
ParentExecutionId = reader.IsDBNull(22) ? null : Guid.Parse(reader.GetString(22)),
};
}
/// <summary>
/// Disposes the audit writer and releases resources.
/// </summary>
/// <remarks>
/// AuditLog-006: prefer <see cref="DisposeAsync"/> when possible (DI honours
/// <see cref="IAsyncDisposable"/> on singletons). The sync path remains for
/// callers that only know about <see cref="IDisposable"/> (e.g. legacy
/// composition roots, <c>using</c> statements without <c>await</c>). To
/// avoid the classic sync-over-async deadlock on a captured
/// <see cref="SynchronizationContext"/> (ASP.NET request thread, Akka
/// dispatcher under some configurations), we hop to the thread pool via
/// <see cref="Task.Run(Func{Task})"/> before blocking on the result — the
/// async continuation inside <see cref="DisposeAsync"/> then resumes on a
/// pool thread with no captured context, so <c>GetResult()</c> never waits
/// on the very thread the continuation needs.
/// </remarks>
public void Dispose()
{
Task.Run(async () => await DisposeAsync().ConfigureAwait(false))
.GetAwaiter().GetResult();
}
/// <summary>Asynchronously disposes the audit writer and releases resources.</summary>
public async ValueTask DisposeAsync()
{
Task? writerLoop;
lock (_writeLock)
{
if (_disposed) return;
// Stop accepting new events. Completing the channel writer is the
// shutdown signal: WriteAsync calls observe the completion and
// fault, and the writer loop drains any already-buffered items
// before exiting. _disposed is intentionally NOT set here — it
// flips only after the loop has fully drained (second lock block
// below), so FlushBatch's existing _disposed check guards the
// post-drain window when the connection is about to close.
_writeQueue.Writer.TryComplete();
writerLoop = _writerLoop;
}
// Wait outside the lock — the loop reacquires it for each batch.
try
{
if (writerLoop is not null)
{
await writerLoop.WaitAsync(TimeSpan.FromSeconds(5)).ConfigureAwait(false);
}
}
catch (TimeoutException)
{
_logger.LogWarning("SqliteAuditWriter writer loop did not drain within 5s of dispose.");
}
catch (Exception ex)
{
// The loop's per-batch try/catch already routed individual failures
// to pending TCSes; a top-level fault here is unexpected.
_logger.LogError(ex, "SqliteAuditWriter writer loop faulted during dispose.");
}
lock (_writeLock)
{
if (_disposed) return;
_disposed = true;
_connection.Dispose();
}
// AuditLog-005: dispose the dedicated read connection after the writer
// is fully drained and closed. _readLock is taken to fence out any
// in-flight read caller that grabbed the lock before _disposed flipped
// — they observe ObjectDisposedException on the next attempt.
lock (_readLock)
{
_readConnection.Dispose();
}
}
/// <summary>An audit event awaiting persistence by the background writer.</summary>
private sealed class PendingAuditEvent
{
/// <summary>Initializes a new instance of the PendingAuditEvent class.</summary>
/// <param name="evt">The audit event to persist.</param>
public PendingAuditEvent(AuditEvent evt)
{
Event = evt;
Completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
}
/// <summary>The audit event to persist.</summary>
public AuditEvent Event { get; }
/// <summary>Task completion source for write completion signaling.</summary>
public TaskCompletionSource Completion { get; }
}
}
@@ -0,0 +1,27 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Options for the site-side SQLite hot-path audit writer.
/// Mirrors the ZB.MOM.WW.ScadaBridge.SiteEventLogging pattern: a single SQLite connection
/// fed by a background writer task draining a bounded
/// <see cref="System.Threading.Channels.Channel{T}"/> so script-thread enqueues
/// never block on disk I/O.
/// </summary>
public sealed class SqliteAuditWriterOptions
{
/// <summary>SQLite database path (or in-memory URI for tests).</summary>
public string DatabasePath { get; set; } = "auditlog.db";
/// <summary>
/// Capacity of the bounded write queue. Set high enough that ordinary
/// script bursts never fill it; <see cref="System.Threading.Channels.BoundedChannelFullMode.Wait"/>
/// applies when the writer falls behind.
/// </summary>
public int ChannelCapacity { get; set; } = 4096;
/// <summary>Max number of pending events the writer drains in one transaction.</summary>
public int BatchSize { get; set; } = 256;
/// <summary>Soft flush interval the writer enforces when fewer than BatchSize events are queued.</summary>
public int FlushIntervalMs { get; set; } = 50;
}
@@ -0,0 +1,236 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
/// <summary>
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): translates per-attempt
/// notifications from the store-and-forward retry loop into one (or two)
/// <see cref="CachedCallTelemetry"/> packets and pushes them through
/// <see cref="ICachedCallTelemetryForwarder"/>.
/// </summary>
/// <remarks>
/// <para>
/// The S&amp;F loop's <see cref="ICachedCallLifecycleObserver"/> reports a
/// single coarse outcome per attempt; the audit pipeline however models the
/// lifecycle as TWO rows on terminal outcomes — an <c>Attempted</c>
/// (<see cref="AuditKind.ApiCallCached"/> / <see cref="AuditKind.DbWriteCached"/>)
/// row capturing the per-attempt mechanics, plus a <see cref="AuditKind.CachedResolve"/>
/// row marking the terminal state for downstream consumers. The bridge fans
/// out per outcome:
/// </para>
/// <list type="bullet">
/// <item><description><c>TransientFailure</c> -> one Attempted(Failed) row.</description></item>
/// <item><description><c>Delivered</c> -> Attempted(Delivered) + CachedResolve(Delivered).</description></item>
/// <item><description><c>PermanentFailure</c> -> Attempted(Failed) + CachedResolve(Parked).</description></item>
/// <item><description><c>ParkedMaxRetries</c> -> Attempted(Failed) + CachedResolve(Parked).</description></item>
/// </list>
/// <para>
/// <b>Best-effort emission (alog.md §7):</b> the bridge itself never throws;
/// the underlying forwarder swallows + logs its own failures.
/// </para>
/// </remarks>
public sealed class CachedCallLifecycleBridge : ICachedCallLifecycleObserver
{
private readonly ICachedCallTelemetryForwarder _forwarder;
private readonly ILogger<CachedCallLifecycleBridge> _logger;
/// <summary>
/// SourceNode-stamping (Task 14): the local node identity provider used to
/// stamp <c>SiteCallOperational.SourceNode</c> on every cached-call
/// lifecycle row this bridge emits. Optional — when null (legacy hosts /
/// tests that don't register the provider) SourceNode stays null and
/// central persists the <c>SiteCalls</c> row with SourceNode NULL.
/// </summary>
private readonly INodeIdentityProvider? _nodeIdentity;
/// <summary>Initializes a new <see cref="CachedCallLifecycleBridge"/> with the given telemetry forwarder, logger, and optional node identity provider.</summary>
/// <param name="forwarder">The telemetry forwarder used to ship cached-call lifecycle events to central.</param>
/// <param name="logger">Logger for bridge diagnostics.</param>
/// <param name="nodeIdentity">Optional node identity provider used to stamp <c>SourceNode</c> on emitted telemetry rows.</param>
public CachedCallLifecycleBridge(
ICachedCallTelemetryForwarder forwarder,
ILogger<CachedCallLifecycleBridge> logger,
INodeIdentityProvider? nodeIdentity = null)
{
_forwarder = forwarder ?? throw new ArgumentNullException(nameof(forwarder));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_nodeIdentity = nodeIdentity;
}
/// <inheritdoc/>
public async Task OnAttemptCompletedAsync(
CachedCallAttemptContext context, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(context);
try
{
await EmitAttemptedAsync(context, ct).ConfigureAwait(false);
if (IsTerminal(context.Outcome))
{
await EmitResolveAsync(context, ct).ConfigureAwait(false);
}
}
catch (Exception ex)
{
// Defensive — both EmitX paths call the forwarder which is itself
// best-effort. A throw here is unexpected, but the alog.md §7
// contract requires we never propagate.
_logger.LogWarning(ex,
"CachedCallLifecycleBridge: unexpected throw for {TrackedOperationId} (Outcome {Outcome})",
context.TrackedOperationId, context.Outcome);
}
}
private async Task EmitAttemptedAsync(CachedCallAttemptContext context, CancellationToken ct)
{
// Per-attempt row: kind discriminates channel; status is always
// Attempted regardless of outcome (success vs. failure is captured
// by the companion HttpStatus / ErrorMessage fields, NOT by flipping
// the status — CachedResolve carries the terminal Status). Per the
// M3 brief and alog.md §4.
var kind = ChannelToAttemptKind(context.Channel);
var status = AuditStatus.Attempted;
var packet = BuildPacket(
context,
kind: kind,
status: status,
// Operational status mirror — for the per-attempt row the
// operational state is the running status; the bridge always
// writes "Attempted" so reconciliation can't roll back.
operationalStatus: "Attempted",
terminalAtUtc: null,
lastError: context.LastError,
httpStatus: context.HttpStatus);
await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
}
private async Task EmitResolveAsync(CachedCallAttemptContext context, CancellationToken ct)
{
var (auditStatus, operationalStatus) = TerminalOutcomeToStatuses(context.Outcome);
var packet = BuildPacket(
context,
kind: AuditKind.CachedResolve,
status: auditStatus,
operationalStatus: operationalStatus,
terminalAtUtc: context.OccurredAtUtc,
lastError: context.LastError,
httpStatus: context.HttpStatus);
await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
}
private CachedCallTelemetry BuildPacket(
CachedCallAttemptContext context,
AuditKind kind,
AuditStatus status,
string operationalStatus,
DateTime? terminalAtUtc,
string? lastError,
int? httpStatus)
{
var channel = ChannelStringToEnum(context.Channel);
return new CachedCallTelemetry(
Audit: new AuditEvent
{
EventId = Guid.NewGuid(),
OccurredAtUtc = DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
Channel = channel,
Kind = kind,
CorrelationId = context.TrackedOperationId.Value,
// Audit Log #23 (ExecutionId Task 4): the originating script
// execution's per-run correlation id, threaded through the S&F
// buffer; null on rows buffered before Task 4 (back-compat).
ExecutionId = context.ExecutionId,
// Audit Log #23 (ParentExecutionId Task 6): the spawning
// inbound-API request's ExecutionId, threaded through the S&F
// buffer alongside ExecutionId so the retry-loop cached rows
// correlate back to the cross-execution chain. Null for a
// non-routed run and on rows buffered before Task 6.
ParentExecutionId = context.ParentExecutionId,
SourceSiteId = string.IsNullOrEmpty(context.SourceSite) ? null : context.SourceSite,
SourceInstanceId = context.SourceInstanceId,
// Audit Log #23 (ExecutionId Task 4): SourceScript is now
// threaded through the S&F buffer alongside ExecutionId — the
// retry-loop cached rows carry the same provenance the
// script-side cached rows do. Null on pre-Task-4 buffered rows.
SourceScript = context.SourceScript,
Target = context.Target,
Status = status,
HttpStatus = httpStatus,
DurationMs = context.DurationMs,
ErrorMessage = lastError,
ForwardState = AuditForwardState.Pending,
},
Operational: new SiteCallOperational(
TrackedOperationId: context.TrackedOperationId,
Channel: context.Channel,
Target: context.Target,
SourceSite: context.SourceSite,
// SourceNode-stamping (Task 14): the local cluster node name
// (node-a/node-b on a site). Stamped from the injected
// INodeIdentityProvider; null when no provider was wired so
// central persists SiteCalls.SourceNode as NULL.
SourceNode: _nodeIdentity?.NodeName,
Status: operationalStatus,
RetryCount: context.RetryCount,
LastError: lastError,
HttpStatus: httpStatus,
CreatedAtUtc: DateTime.SpecifyKind(context.CreatedAtUtc, DateTimeKind.Utc),
UpdatedAtUtc: DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
TerminalAtUtc: terminalAtUtc is null
? null
: DateTime.SpecifyKind(terminalAtUtc.Value, DateTimeKind.Utc)));
}
private static AuditKind ChannelToAttemptKind(string channel) => channel switch
{
"ApiOutbound" => AuditKind.ApiCallCached,
"DbOutbound" => AuditKind.DbWriteCached,
// Defensive default — the S&F observer is filtered to cached-call
// categories so this branch shouldn't fire in practice.
_ => AuditKind.ApiCallCached,
};
private static AuditChannel ChannelStringToEnum(string channel) => channel switch
{
"ApiOutbound" => AuditChannel.ApiOutbound,
"DbOutbound" => AuditChannel.DbOutbound,
_ => AuditChannel.ApiOutbound,
};
private static (AuditStatus auditStatus, string operationalStatus) TerminalOutcomeToStatuses(
CachedCallAttemptOutcome outcome) => outcome switch
{
CachedCallAttemptOutcome.Delivered =>
(AuditStatus.Delivered, "Delivered"),
CachedCallAttemptOutcome.PermanentFailure =>
(AuditStatus.Parked, "Parked"),
CachedCallAttemptOutcome.ParkedMaxRetries =>
(AuditStatus.Parked, "Parked"),
// TransientFailure isn't terminal — see IsTerminal — but the switch
// is exhaustive so we route it through Failed for safety.
CachedCallAttemptOutcome.TransientFailure =>
(AuditStatus.Failed, "Failed"),
_ => (AuditStatus.Failed, "Failed"),
};
private static bool IsTerminal(CachedCallAttemptOutcome outcome) => outcome switch
{
CachedCallAttemptOutcome.Delivered => true,
CachedCallAttemptOutcome.PermanentFailure => true,
CachedCallAttemptOutcome.ParkedMaxRetries => true,
CachedCallAttemptOutcome.TransientFailure => false,
_ => false,
};
}
@@ -0,0 +1,194 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
/// <summary>
/// Site-side dual emitter for cached-call lifecycle telemetry (Audit Log #23 /
/// M3). Sister to <see cref="SiteAuditTelemetryActor"/>: where the M2 actor
/// drains audit-only events, this forwarder takes a combined
/// <see cref="CachedCallTelemetry"/> packet and fans it out to the two
/// site-local stores in a single call:
/// <list type="bullet">
/// <item><description>The <see cref="AuditEvent"/> row is written via
/// <see cref="IAuditWriter"/> (the site <c>FallbackAuditWriter</c> +
/// <c>SqliteAuditWriter</c> chain established in M2).</description></item>
/// <item><description>The operational <see cref="SiteCallOperational"/> half
/// updates the site-local <c>OperationTracking</c> SQLite store via
/// <see cref="IOperationTrackingStore"/>, with the per-lifecycle method
/// (<c>Enqueue</c> / <c>Attempt</c> / <c>Terminal</c>) selected from the
/// audit row's <see cref="AuditKind"/>.</description></item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort contract (alog.md §7):</b> a thrown writer OR a thrown
/// tracking store must never propagate to the calling script. Both emission
/// halves are wrapped in independent try/catch blocks so a SQLite outage on
/// one side cannot starve the other — the failure is logged and the call
/// returns normally.
/// </para>
/// <para>
/// <b>Local-write only — the wire push is the drain actor's job.</b> This
/// forwarder is deliberately synchronous against the two site-local SQLite
/// stores and never pushes to central itself. The site→central transport is
/// now live: <c>ClusterClientSiteAuditClient</c> is the production binding of
/// <see cref="ISiteStreamAuditClient"/> on site roles (with
/// <c>NoOpSiteStreamAuditClient</c> retained only for central/test composition
/// roots). The push happens out-of-band: <see cref="SiteAuditTelemetryActor"/>
/// sweeps the <c>AuditEvent</c> rows this forwarder wrote — they live in SQLite
/// tagged <see cref="AuditForwardState.Pending"/> — and drains them to central
/// via that client. A single drain loop therefore covers both the audit-only
/// emissions and the cached-call emissions this forwarder produces.
/// </para>
/// </remarks>
public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
{
private readonly IAuditWriter _auditWriter;
private readonly IOperationTrackingStore? _trackingStore;
private readonly ILogger<CachedCallTelemetryForwarder> _logger;
/// <summary>
/// SourceNode-stamping (Task 14): local node identity provider used to
/// stamp the tracking-store row's <c>SourceNode</c> column on
/// <c>RecordEnqueueAsync</c>. Optional — when null (legacy / test hosts)
/// the column stays NULL on the tracking row.
/// </summary>
private readonly INodeIdentityProvider? _nodeIdentity;
/// <summary>
/// Construct the forwarder. <paramref name="trackingStore"/> is optional —
/// when null only the audit half of the packet is emitted, which matches
/// the M3 Bundle F composition-root contract on Central nodes: the
/// AuditLog DI surface registers the forwarder unconditionally (mirroring
/// the IAuditWriter chain) but the site-only tracking store has no central
/// registration. Production site nodes wire both — the central lazy
/// resolution is a no-op path kept symmetric with the M2 writer chain.
/// </summary>
/// <param name="auditWriter">Writer used to persist audit events from the telemetry packet.</param>
/// <param name="trackingStore">Optional store for updating operation tracking state; null on central nodes.</param>
/// <param name="logger">Logger for this forwarder.</param>
/// <param name="nodeIdentity">Optional provider of the current node name stamped on emitted rows.</param>
public CachedCallTelemetryForwarder(
IAuditWriter auditWriter,
IOperationTrackingStore? trackingStore,
ILogger<CachedCallTelemetryForwarder> logger,
INodeIdentityProvider? nodeIdentity = null)
{
_auditWriter = auditWriter ?? throw new ArgumentNullException(nameof(auditWriter));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_trackingStore = trackingStore;
_nodeIdentity = nodeIdentity;
}
/// <inheritdoc />
public async Task ForwardAsync(CachedCallTelemetry telemetry, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(telemetry);
// Independent try/catch — a thrown audit writer must not prevent the
// tracking-store update from running (and vice-versa). Both halves
// are best-effort.
await TryEmitAuditAsync(telemetry, ct).ConfigureAwait(false);
await TryEmitTrackingAsync(telemetry, ct).ConfigureAwait(false);
}
private async Task TryEmitAuditAsync(CachedCallTelemetry telemetry, CancellationToken ct)
{
try
{
await _auditWriter.WriteAsync(telemetry.Audit, ct).ConfigureAwait(false);
}
catch (Exception ex)
{
// alog.md §7 best-effort contract — log and swallow. The audit
// pipeline's own retry/recovery (RingBufferFallback in the
// FallbackAuditWriter) handles transient writer failures upstream;
// a throw bubbling up here means the writer's own swallow contract
// failed, which is itself best-effort-handled.
_logger.LogWarning(ex,
"CachedCallTelemetryForwarder: audit emission threw for EventId {EventId} (Kind {Kind}, Status {Status})",
telemetry.Audit.EventId, telemetry.Audit.Kind, telemetry.Audit.Status);
}
}
private async Task TryEmitTrackingAsync(CachedCallTelemetry telemetry, CancellationToken ct)
{
if (_trackingStore is null)
{
// No site-local tracking store wired — Central composition root or
// an integration-test host that skipped AddSiteRuntime. Emitting
// through the audit half is still meaningful; the tracking half
// is a no-op rather than an error.
return;
}
try
{
switch (telemetry.Audit.Kind)
{
case AuditKind.CachedSubmit:
// Enqueue — insert-if-not-exists with the operational
// channel as the kind discriminator. RetryCount is fixed
// at 0 by the tracking store's INSERT contract.
// SourceNode-stamping (Task 14): stamp the local node
// name (node-a/node-b) from the injected
// INodeIdentityProvider; null when no provider was wired
// so the tracking row's SourceNode column stays NULL.
await _trackingStore.RecordEnqueueAsync(
telemetry.Operational.TrackedOperationId,
telemetry.Operational.Channel,
telemetry.Operational.Target,
telemetry.Audit.SourceInstanceId,
telemetry.Audit.SourceScript,
sourceNode: _nodeIdentity?.NodeName,
ct).ConfigureAwait(false);
break;
case AuditKind.ApiCallCached:
case AuditKind.DbWriteCached:
// Attempt — advance retry counter + last-error/HTTP-status.
// Terminal rows are guarded by the store's WHERE clause.
await _trackingStore.RecordAttemptAsync(
telemetry.Operational.TrackedOperationId,
telemetry.Operational.Status,
telemetry.Operational.RetryCount,
telemetry.Operational.LastError,
telemetry.Operational.HttpStatus,
ct).ConfigureAwait(false);
break;
case AuditKind.CachedResolve:
// Terminal — first-write-wins on the resolve flip.
await _trackingStore.RecordTerminalAsync(
telemetry.Operational.TrackedOperationId,
telemetry.Operational.Status,
telemetry.Operational.LastError,
telemetry.Operational.HttpStatus,
ct).ConfigureAwait(false);
break;
default:
// Defensive — only the four cached-lifecycle kinds are
// expected on this path. Anything else is logged so a
// mis-routed packet is visible but never crashes the
// forwarder.
_logger.LogWarning(
"CachedCallTelemetryForwarder: unexpected audit kind {Kind} on tracking emission for EventId {EventId}",
telemetry.Audit.Kind, telemetry.Audit.EventId);
break;
}
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"CachedCallTelemetryForwarder: tracking-store emission threw for TrackedOperationId {Id} (Status {Status})",
telemetry.Operational.TrackedOperationId, telemetry.Operational.Status);
}
}
}
@@ -0,0 +1,117 @@
using Akka.Actor;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
/// <summary>
/// Production <see cref="ISiteStreamAuditClient"/> binding for site composition
/// roots: pushes audit telemetry to central over Akka <c>ClusterClient</c> via
/// the site's <c>SiteCommunicationActor</c>. The actor forwards the command to
/// <c>/user/central-communication</c> and the central
/// <c>CentralCommunicationActor</c> Asks the <c>AuditLogIngestActor</c> proxy —
/// the same command/control transport notifications already use. Wired by the
/// Host for site roles; central and test composition roots keep the
/// <see cref="NoOpSiteStreamAuditClient"/> DI default (they have no
/// <c>SiteCommunicationActor</c>).
/// </summary>
/// <remarks>
/// <para>
/// <b>Throw-on-failure contract.</b> An Ask timeout or a faulted reply
/// (<see cref="Status.Failure"/>) propagates as a thrown exception out of the
/// <c>Ingest*Async</c> methods — it is NOT caught and turned into an empty ack.
/// The <see cref="SiteAuditTelemetryActor"/> drain loop treats a thrown
/// exception as transient and leaves the rows <c>Pending</c> for the next tick.
/// Swallowing the fault into an empty ack would be indistinguishable from "zero
/// rows accepted" and would silently lose the retry signal. Task 1 confirmed
/// the central receiving end does not collapse an ingest fault into an empty
/// ack either, so a site-side Ask through the whole path faults cleanly on a
/// central-side timeout.
/// </para>
/// <para>
/// The batches arrive as proto DTOs (<see cref="AuditEventBatch"/> /
/// <see cref="CachedTelemetryBatch"/>) because the
/// <see cref="SiteAuditTelemetryActor"/> builds them with
/// <see cref="AuditEventDtoMapper.ToDto"/>. This client converts them back into
/// the <see cref="AuditEvent"/> / <see cref="SiteCall"/> entities the Akka
/// command messages carry — the same DTO→entity translation the
/// <c>SiteStreamGrpcServer</c> performs for the gRPC reconciliation path.
/// </para>
/// </remarks>
public sealed class ClusterClientSiteAuditClient : ISiteStreamAuditClient
{
private readonly IActorRef _siteCommunicationActor;
private readonly TimeSpan _askTimeout;
/// <param name="siteCommunicationActor">
/// The site's <c>SiteCommunicationActor</c> — it forwards the ingest command
/// over the registered central ClusterClient and routes the reply back to
/// this client's Ask.
/// </param>
/// <param name="askTimeout">
/// Ask timeout for the round-trip to central. On expiry the Ask throws
/// <see cref="Akka.Actor.AskTimeoutException"/>, which the drain loop treats
/// as transient (rows stay <c>Pending</c>).
/// </param>
public ClusterClientSiteAuditClient(IActorRef siteCommunicationActor, TimeSpan askTimeout)
{
ArgumentNullException.ThrowIfNull(siteCommunicationActor);
_siteCommunicationActor = siteCommunicationActor;
_askTimeout = askTimeout;
}
/// <inheritdoc/>
public async Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
{
ArgumentNullException.ThrowIfNull(batch);
var events = new List<AuditEvent>(batch.Events.Count);
foreach (var dto in batch.Events)
{
events.Add(AuditEventDtoMapper.FromDto(dto));
}
// Ask<T> throws AskTimeoutException on timeout and rethrows a
// Status.Failure's inner cause — both surface as a thrown exception so
// the drain loop keeps the rows Pending. We deliberately do NOT catch.
var reply = await _siteCommunicationActor
.Ask<IngestAuditEventsReply>(new IngestAuditEventsCommand(events), _askTimeout, ct)
.ConfigureAwait(false);
return ToAck(reply.AcceptedEventIds);
}
/// <inheritdoc/>
public async Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
{
ArgumentNullException.ThrowIfNull(batch);
var entries = new List<CachedTelemetryEntry>(batch.Packets.Count);
foreach (var packet in batch.Packets)
{
var audit = AuditEventDtoMapper.FromDto(packet.AuditEvent);
var siteCall = SiteCallDtoMapper.FromDto(packet.Operational);
entries.Add(new CachedTelemetryEntry(audit, siteCall));
}
// Same throw-on-failure contract as IngestAuditEventsAsync. The reply
// type is IngestCachedTelemetryReply (the central dual-write reply),
// distinct from IngestAuditEventsReply.
var reply = await _siteCommunicationActor
.Ask<IngestCachedTelemetryReply>(new IngestCachedTelemetryCommand(entries), _askTimeout, ct)
.ConfigureAwait(false);
return ToAck(reply.AcceptedEventIds);
}
private static IngestAck ToAck(IReadOnlyList<Guid> acceptedEventIds)
{
var ack = new IngestAck();
foreach (var id in acceptedEventIds)
{
ack.AcceptedEventIds.Add(id.ToString());
}
return ack;
}
}
@@ -0,0 +1,46 @@
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
/// <summary>
/// Mockable abstraction over the central site-audit push surface that
/// <see cref="SiteAuditTelemetryActor"/> uses to forward <see cref="AuditEventBatch"/>
/// payloads. The production implementation is
/// <see cref="ClusterClientSiteAuditClient"/> — a ClusterClient-based client,
/// wired in the Host for site roles, that forwards batches to central via the
/// site's <c>SiteCommunicationActor</c>. Unit tests substitute via NSubstitute
/// against this interface so the actor never needs a live transport.
/// </summary>
public interface ISiteStreamAuditClient
{
/// <summary>
/// Forwards <paramref name="batch"/> to the central audit-ingest path. The
/// returned <see cref="IngestAck"/> carries the <c>accepted_event_ids</c>
/// the actor will flip to
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
/// in the site SQLite queue.
/// </summary>
/// <param name="batch">The batch of audit events to forward.</param>
/// <param name="ct">Cancellation token for the operation.</param>
Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct);
/// <summary>
/// Forwards the combined <see cref="CachedTelemetryBatch"/> (Audit Log #23)
/// to the central cached-telemetry ingest path. Each packet carries both the
/// audit row and the operational <c>SiteCalls</c> upsert; central writes both
/// in a single MS SQL transaction. Returns the same <see cref="IngestAck"/>
/// shape as <see cref="IngestAuditEventsAsync"/> so the site-side forwarder
/// can flip the underlying audit rows to
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
/// once central has acknowledged them.
/// </summary>
/// <remarks>
/// The production <see cref="ClusterClientSiteAuditClient"/> forwards over
/// the ClusterClient transport; the <see cref="NoOpSiteStreamAuditClient"/>
/// DI default (used by central and test composition roots) returns an empty
/// ack so no rows are flipped.
/// </remarks>
/// <param name="batch">The batch of cached-call telemetry packets to forward.</param>
/// <param name="ct">Cancellation token for the operation.</param>
Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct);
}
@@ -0,0 +1,51 @@
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
/// <summary>
/// Default <see cref="ISiteStreamAuditClient"/> registered by
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.ServiceCollectionExtensions.AddAuditLog"/>.
/// It is a no-op binding for composition roots that have no
/// <c>SiteCommunicationActor</c> — central and test roots. Site roles override
/// it in the Host with the ClusterClient-based
/// <see cref="ClusterClientSiteAuditClient"/>, which actually forwards audit
/// telemetry to central.
/// </summary>
/// <remarks>
/// <para>
/// Returns an empty <see cref="IngestAck"/> so the
/// <see cref="SiteAuditTelemetryActor"/> doesn't flip any rows to
/// <c>Forwarded</c> when this NoOp is in effect — rows stay <c>Pending</c>
/// until a real client (or a test stub) takes over.
/// </para>
/// <para>
/// Audit-write paths are best-effort by contract: a NoOp client keeps the
/// host running cleanly and is consistent with "audit-write failures never
/// abort the user-facing action".
/// </para>
/// </remarks>
public sealed class NoOpSiteStreamAuditClient : ISiteStreamAuditClient
{
private static readonly IngestAck EmptyAck = new();
/// <inheritdoc/>
public Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
{
ArgumentNullException.ThrowIfNull(batch);
// Empty ack — no EventIds will be flipped to Forwarded, so rows stay
// Pending until the real ClusterClientSiteAuditClient (or a test stub)
// takes over.
return Task.FromResult(EmptyAck);
}
/// <inheritdoc/>
public Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
{
ArgumentNullException.ThrowIfNull(batch);
// Empty ack — same rationale as IngestAuditEventsAsync. The site still
// writes the audit + tracking rows to its SQLite stores authoritatively;
// central-side state only materialises once the real
// ClusterClientSiteAuditClient (or a test stub) is wired in.
return Task.FromResult(EmptyAck);
}
}
@@ -0,0 +1,464 @@
using Akka.Actor;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
/// <summary>
/// Site-side actor that drains the local SQLite audit queue and pushes Pending
/// rows to central via two parallel transports:
/// <list type="bullet">
/// <item><description><c>IngestAuditEvents</c> for the audit-only path —
/// sync ApiCall/DbWrite, NotifySend, InboundRequest and similar single-row
/// lifecycle events.</description></item>
/// <item><description><c>IngestCachedTelemetry</c> for the combined-telemetry
/// path — cached-call lifecycle rows (<c>CachedSubmit</c>,
/// <c>ApiCallCached</c>/<c>DbWriteCached</c>, <c>CachedResolve</c>) joined
/// with the matching <c>OperationTracking</c> row, written at central as a
/// single dual-write transaction (AuditLog + SiteCalls).</description></item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// The drain self-ticks via two private messages — <c>Drain</c> for the
/// audit-only path and <c>CachedDrain</c> for the combined path — each
/// scheduled independently. Cadence is options-driven:
/// <c>BusyIntervalSeconds</c> when the previous drain found rows (or faulted —
/// we want quick recovery), <c>IdleIntervalSeconds</c> when the queue was empty.
/// The two drains share the same cadence configuration but advance their own
/// timers so a stall on one path does not block the other.
/// </para>
/// <para>
/// Collaborators are injected as interfaces (<see cref="ISiteAuditQueue"/>,
/// <see cref="ISiteStreamAuditClient"/>, optional
/// <see cref="IOperationTrackingStore"/>) so unit tests substitute with
/// NSubstitute and never touch real SQLite or gRPC. The
/// <see cref="IOperationTrackingStore"/> is optional — central composition
/// roots and tests that don't exercise the cached path can leave it null, in
/// which case the cached-drain scheduler is never armed.
/// </para>
/// <para>
/// Per Bundle D's brief, audit-write paths must be fail-safe — a thrown
/// exception inside the actor MUST NOT crash it. Both Drain handlers wrap
/// their pipelines in a top-level try/catch that logs and re-schedules; the
/// actor's <see cref="SupervisorStrategy"/> defaults to
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultStrategy"/>'s Restart for
/// child actors — but this actor has no children, so the catch is what
/// matters.
/// </para>
/// <para>
/// AuditLog-001: wires the previously-unreachable combined-telemetry transport.
/// Prior to this the cached audit rows flowed through the audit-only drain via
/// <c>IngestAuditEventsAsync</c> and the central <c>OnCachedTelemetryAsync</c>
/// dual-write handler was dead production code; the operational <c>SiteCalls</c>
/// half was never sent to central.
/// </para>
/// </remarks>
public class SiteAuditTelemetryActor : ReceiveActor
{
private readonly ISiteAuditQueue _queue;
private readonly ISiteStreamAuditClient _client;
private readonly IOperationTrackingStore? _trackingStore;
private readonly SiteAuditTelemetryOptions _options;
private readonly ILogger<SiteAuditTelemetryActor> _logger;
private ICancelable? _pendingTick;
private ICancelable? _pendingCachedTick;
// AuditLog-010: per-actor lifecycle CTS so an in-flight drain (queue read,
// gRPC push, mark-forwarded write) is actually cancelled when the actor is
// stopped — without it, a stuck IngestAuditEventsAsync would hold the
// continuation through CoordinatedShutdown's actor-system terminate window.
// Cancelled in PostStop; never reset (the actor is single-lifetime).
// The same CTS gates the cached-drain pipeline (queue read + tracking
// lookup + gRPC push) so both paths observe shutdown cooperatively.
private readonly CancellationTokenSource _lifecycleCts = new();
/// <summary>Initializes the actor with its drain queue, gRPC client, options, and logger.</summary>
/// <param name="queue">The site-local SQLite audit queue to drain.</param>
/// <param name="client">The gRPC client used to push audit events to central.</param>
/// <param name="options">Telemetry options controlling drain intervals and batch size.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="trackingStore">
/// Optional site-local operation tracking store. When supplied the actor
/// runs the combined-telemetry cached-drain in parallel with the audit-only
/// drain; when null (central composition roots, tests that don't exercise
/// cached calls) the cached scheduler is never armed and only the
/// audit-only drain runs.
/// </param>
public SiteAuditTelemetryActor(
ISiteAuditQueue queue,
ISiteStreamAuditClient client,
IOptions<SiteAuditTelemetryOptions> options,
ILogger<SiteAuditTelemetryActor> logger,
IOperationTrackingStore? trackingStore = null)
{
ArgumentNullException.ThrowIfNull(queue);
ArgumentNullException.ThrowIfNull(client);
ArgumentNullException.ThrowIfNull(options);
ArgumentNullException.ThrowIfNull(logger);
_queue = queue;
_client = client;
_options = options.Value;
_logger = logger;
_trackingStore = trackingStore;
ReceiveAsync<Drain>(_ => OnDrainAsync());
ReceiveAsync<CachedDrain>(_ => OnCachedDrainAsync());
}
/// <inheritdoc />
protected override void PreStart()
{
base.PreStart();
// Initial ticks fire on the busy interval so both drains start polling
// soon after host startup. A subsequent empty drain will move to the
// idle interval naturally.
ScheduleNext(TimeSpan.FromSeconds(_options.BusyIntervalSeconds));
if (_trackingStore is not null)
{
ScheduleNextCached(TimeSpan.FromSeconds(_options.BusyIntervalSeconds));
}
}
/// <inheritdoc />
protected override void PostStop()
{
_pendingTick?.Cancel();
_pendingCachedTick?.Cancel();
// AuditLog-010: cancel any in-flight drain so a stuck queue read or
// gRPC push does not hold the continuation past actor stop.
try
{
_lifecycleCts.Cancel();
}
catch (ObjectDisposedException)
{
// PostStop may run after a prior Dispose path — benign.
}
_lifecycleCts.Dispose();
base.PostStop();
}
private async Task OnDrainAsync()
{
var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
// AuditLog-010: route every async dependency call through the
// per-actor lifecycle token so PostStop cancellation actually
// propagates into the queue read, the gRPC push, and the
// mark-forwarded write. OperationCanceledException is swallowed by
// the catch-all below.
var ct = _lifecycleCts.Token;
try
{
var pending = await _queue.ReadPendingAsync(_options.BatchSize, ct)
.ConfigureAwait(false);
if (pending.Count == 0)
{
// No rows — settle into the idle cadence until the next write
// bumps us back into the busy cadence.
nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
return;
}
var batch = BuildBatch(pending);
IngestAck ack;
try
{
ack = await _client.IngestAuditEventsAsync(batch, ct)
.ConfigureAwait(false);
}
catch (Exception ex)
{
// gRPC fault — leave the rows in Pending so the next drain
// retries. Bundle D's brief: "On gRPC exception (any), log
// Warning, schedule next Drain in BusyIntervalSeconds."
_logger.LogWarning(ex,
"IngestAuditEvents push failed for {Count} pending events; will retry next drain.",
pending.Count);
return;
}
var acceptedIds = ParseAcceptedIds(ack);
if (acceptedIds.Count > 0)
{
await _queue.MarkForwardedAsync(acceptedIds, ct)
.ConfigureAwait(false);
}
}
catch (Exception ex)
{
// Catch-all so a SQLite hiccup or mapper bug never crashes the
// actor. The next tick is still scheduled in the finally block.
_logger.LogError(ex, "Unexpected error during audit-log telemetry drain.");
}
finally
{
// AuditLog-010: if the actor is already shutting down, do not
// arm another tick — the scheduler would fire after PostStop and
// the message would land in dead letters.
if (!_lifecycleCts.IsCancellationRequested)
{
ScheduleNext(nextDelay);
}
}
}
/// <summary>
/// AuditLog-001: combined-telemetry drain. Reads cached-lifecycle audit
/// rows, joins each with the matching <see cref="IOperationTrackingStore"/>
/// snapshot, builds a <see cref="CachedTelemetryBatch"/>, and pushes via
/// <see cref="ISiteStreamAuditClient.IngestCachedTelemetryAsync"/>. Rows
/// whose tracking snapshot is missing (race with retention purge / late
/// audit row) are logged + skipped — the operational half will be
/// re-emitted on the next lifecycle event, and the audit row stays
/// <see cref="Commons.Types.Enums.AuditForwardState.Pending"/> so a later
/// drain (or reconciliation pull) can revisit it.
/// </summary>
private async Task OnCachedDrainAsync()
{
var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
var ct = _lifecycleCts.Token;
try
{
// _trackingStore is non-null by construction here — the cached
// scheduler is only armed when it was supplied (see PreStart).
// Defensive check kept for clarity and to silence the compiler's
// null-flow analysis.
if (_trackingStore is null)
{
return;
}
var pending = await _queue
.ReadPendingCachedTelemetryAsync(_options.BatchSize, ct)
.ConfigureAwait(false);
if (pending.Count == 0)
{
nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
return;
}
var batch = new CachedTelemetryBatch();
var emittedEventIds = new List<Guid>(pending.Count);
foreach (var auditRow in pending)
{
if (auditRow.CorrelationId is null)
{
// CorrelationId carries the TrackedOperationId for cached
// rows — see CachedCallLifecycleBridge.BuildPacket. Without
// it we can't look up the tracking row; log + skip so the
// bad row doesn't block the rest of the batch. The audit
// row stays Pending (still not in emittedEventIds) and
// central reconciliation will pick it up.
_logger.LogWarning(
"Cached-telemetry drain: audit row {EventId} ({Kind}) has no CorrelationId; skipping.",
auditRow.EventId, auditRow.Kind);
continue;
}
TrackingStatusSnapshot? snapshot;
try
{
snapshot = await _trackingStore
.GetStatusAsync(new TrackedOperationId(auditRow.CorrelationId.Value), ct)
.ConfigureAwait(false);
}
catch (Exception ex)
{
// A tracking-store throw must NOT abort the rest of the
// batch — the audit half is best-effort. Log and skip
// this row; it stays Pending for the next drain.
_logger.LogWarning(ex,
"Cached-telemetry drain: tracking lookup threw for {EventId} (TrackedOperationId {Tid}); skipping.",
auditRow.EventId, auditRow.CorrelationId);
continue;
}
if (snapshot is null)
{
// No tracking row — possible if the audit row is older
// than the tracking retention window, or the tracking
// store was reset. The audit half remains valid and will
// be picked up by central reconciliation; skip the
// combined push for this row.
_logger.LogWarning(
"Cached-telemetry drain: no tracking snapshot for {EventId} (TrackedOperationId {Tid}); skipping.",
auditRow.EventId, auditRow.CorrelationId);
continue;
}
var packet = BuildCachedPacket(auditRow, snapshot);
batch.Packets.Add(packet);
emittedEventIds.Add(auditRow.EventId);
}
if (batch.Packets.Count == 0)
{
// Every row in this read was skipped (no CorrelationId / no
// tracking snapshot). Leave them Pending and try again next
// drain — the underlying race normally resolves on its own.
return;
}
IngestAck ack;
try
{
ack = await _client.IngestCachedTelemetryAsync(batch, ct)
.ConfigureAwait(false);
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"IngestCachedTelemetry push failed for {Count} cached events; will retry next drain.",
batch.Packets.Count);
return;
}
var acceptedIds = ParseAcceptedIds(ack);
if (acceptedIds.Count > 0)
{
await _queue.MarkForwardedAsync(acceptedIds, ct)
.ConfigureAwait(false);
}
}
catch (Exception ex)
{
_logger.LogError(ex, "Unexpected error during cached-telemetry drain.");
}
finally
{
if (!_lifecycleCts.IsCancellationRequested && _trackingStore is not null)
{
ScheduleNextCached(nextDelay);
}
}
}
private static AuditEventBatch BuildBatch(IReadOnlyList<AuditEvent> events)
{
var batch = new AuditEventBatch();
foreach (var e in events)
{
batch.Events.Add(AuditEventDtoMapper.ToDto(e));
}
return batch;
}
/// <summary>
/// AuditLog-001: build the combined wire packet from one cached audit row
/// + its matching operational tracking snapshot. The operational state
/// reflects the latest tracking row at emission time (not the per-event
/// status the audit row implies) because central's <c>SiteCalls</c>
/// upsert is monotonic — it never rolls back. The audit row preserves
/// per-event lifecycle granularity for the audit trail.
/// </summary>
private static CachedTelemetryPacket BuildCachedPacket(
AuditEvent auditRow, TrackingStatusSnapshot snapshot)
{
var sourceSite = auditRow.SourceSiteId ?? string.Empty;
// Channel string form mirrors the AuditChannel-to-string convention used
// by SiteCallOperational + CachedCallLifecycleBridge.BuildPacket.
var channelString = auditRow.Channel.ToString();
var target = auditRow.Target ?? snapshot.TargetSummary ?? string.Empty;
var operationalDto = new SiteCallOperationalDto
{
TrackedOperationId = snapshot.Id.Value.ToString("D"),
Channel = channelString,
Target = target,
SourceSite = sourceSite,
SourceNode = snapshot.SourceNode ?? string.Empty,
Status = snapshot.Status,
RetryCount = snapshot.RetryCount,
LastError = snapshot.LastError ?? string.Empty,
CreatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.CreatedAtUtc)),
UpdatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.UpdatedAtUtc)),
};
if (snapshot.HttpStatus.HasValue)
{
operationalDto.HttpStatus = snapshot.HttpStatus.Value;
}
if (snapshot.TerminalAtUtc.HasValue)
{
operationalDto.TerminalAtUtc =
Timestamp.FromDateTime(EnsureUtc(snapshot.TerminalAtUtc.Value));
}
return new CachedTelemetryPacket
{
AuditEvent = AuditEventDtoMapper.ToDto(auditRow),
Operational = operationalDto,
};
}
private static DateTime EnsureUtc(DateTime value) =>
value.Kind == DateTimeKind.Utc
? value
: DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc);
private static IReadOnlyList<Guid> ParseAcceptedIds(IngestAck ack)
{
if (ack.AcceptedEventIds.Count == 0)
{
return Array.Empty<Guid>();
}
var list = new List<Guid>(ack.AcceptedEventIds.Count);
foreach (var raw in ack.AcceptedEventIds)
{
if (Guid.TryParse(raw, out var id))
{
list.Add(id);
}
// Malformed ids are ignored — central should never emit them, but
// we refuse to crash the actor over a bad string.
}
return list;
}
private void ScheduleNext(TimeSpan delay)
{
_pendingTick?.Cancel();
_pendingTick = Context.System.Scheduler.ScheduleTellOnceCancelable(
delay,
Self,
Drain.Instance,
Self);
}
private void ScheduleNextCached(TimeSpan delay)
{
_pendingCachedTick?.Cancel();
_pendingCachedTick = Context.System.Scheduler.ScheduleTellOnceCancelable(
delay,
Self,
CachedDrain.Instance,
Self);
}
/// <summary>Self-tick message that triggers an audit-only drain cycle.</summary>
private sealed class Drain
{
public static readonly Drain Instance = new();
private Drain() { }
}
/// <summary>
/// Self-tick message that triggers a combined-telemetry drain cycle.
/// AuditLog-001: introduced alongside the cached-drain to keep the two
/// paths' cadences independent — a stall on one does not block the other.
/// </summary>
private sealed class CachedDrain
{
public static readonly CachedDrain Instance = new();
private CachedDrain() { }
}
}
@@ -0,0 +1,28 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
/// <summary>
/// Tuning knobs for the site-side <see cref="SiteAuditTelemetryActor"/> drain
/// loop. Defaults mirror Bundle D's plan: drain every 5 s while rows are
/// flowing (busy), every 30 s when the queue is empty (idle).
/// </summary>
public sealed class SiteAuditTelemetryOptions
{
/// <summary>
/// Maximum number of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
/// rows read from the site SQLite queue and pushed in a single gRPC batch.
/// </summary>
public int BatchSize { get; set; } = 256;
/// <summary>
/// Delay between drains when the previous drain found at least one Pending
/// row OR the previous push faulted. Re-drain quickly to keep telemetry
/// flowing and to retry transient gRPC errors.
/// </summary>
public int BusyIntervalSeconds { get; set; } = 5;
/// <summary>
/// Delay between drains when the previous drain found no Pending rows.
/// Longer interval avoids hammering an idle SQLite + gRPC channel.
/// </summary>
public int IdleIntervalSeconds { get; set; } = 30;
}
@@ -0,0 +1,35 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<ItemGroup>
<!-- Bundle D D1: SiteAuditTelemetryActor + (D2) AuditLogIngestActor live
in this project, so Akka is an explicit dependency. -->
<PackageReference Include="Akka" />
<PackageReference Include="Microsoft.Data.Sqlite" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
<!-- Audit Log (#23) sits alongside Notification Outbox (#21) and Site Call Audit (#22).
IAuditLogRepository is registered by ZB.MOM.WW.ScadaBridge.ConfigurationDatabase; the project
reference is documented here so M2 writers + telemetry actors can depend on it. -->
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.csproj" />
<!-- Communication carries the IngestAuditEvents proto + DTOs (#23 M2 site sync). -->
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Communication/ZB.MOM.WW.ScadaBridge.Communication.csproj" />
</ItemGroup>
<ItemGroup>
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.AuditLog.Tests" />
</ItemGroup>
</Project>