refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj,
namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated.
ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated.
SQL roles/logins, LDAP domains, CLI command name, and CLI config dir
(~/.scadalink → ~/.scadabridge) also renamed.

Build green; 5 Host.Tests fail awaiting SQL login rename in next commit.
Pre-existing StaleTagMonitor timing flakes unchanged.

Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
Joseph Doherty
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,81 @@
using System.Collections.Concurrent;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T8, T9) — central singleton implementation of
/// <see cref="IAuditCentralHealthSnapshot"/>. Owns thread-safe
/// <see cref="System.Threading.Interlocked"/> counters for
/// <c>CentralAuditWriteFailures</c> + <c>AuditRedactionFailure</c> and a
/// per-site latched stalled-state map fed by the
/// <see cref="SiteAuditTelemetryStalledTracker"/>. Also implements the
/// writer surfaces (<see cref="ICentralAuditWriteFailureCounter"/> +
/// <see cref="IAuditRedactionFailureCounter"/>) so a single concrete object
/// is the source of truth — DI binds those two interfaces to this same
/// singleton instance on the central composition root.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why one type for read + write.</b> The writer interfaces are tiny
/// (<c>Increment()</c>) and the read surface needs visibility of those
/// counters anyway — having a single class own both means the
/// <c>Interlocked</c> field IS the snapshot value, no extra plumbing needed.
/// Mirrors the
/// <see cref="ZB.MOM.WW.ScadaBridge.HealthMonitoring.SiteHealthCollector"/> pattern where
/// the collector both receives and exposes the metric.
/// </para>
/// <para>
/// <b>Stalled-state plumbing.</b> The per-site stalled latch lives directly
/// on this snapshot. <see cref="SiteAuditTelemetryStalledTracker"/> is the
/// EventStream subscriber that pushes
/// <see cref="SiteAuditTelemetryStalledChanged"/> publications in via
/// <see cref="ApplyStalled"/>. Keeping the dictionary on this type (rather
/// than reading the tracker on every access) lets the snapshot be constructed
/// without an <see cref="Akka.Actor.ActorSystem"/> dependency — the tracker
/// is wired up later from the Akka bootstrap, once the system is built.
/// </para>
/// </remarks>
public sealed class AuditCentralHealthSnapshot
    : IAuditCentralHealthSnapshot,
      ICentralAuditWriteFailureCounter,
      IAuditRedactionFailureCounter
{
    // Per-site stalled flag, keyed by site id; written only through ApplyStalled.
    private readonly ConcurrentDictionary<string, bool> _stalledBySite = new();

    // Running totals; bumped via Interlocked.Increment, read via Volatile.Read.
    private int _writeFailureCount;
    private int _redactionFailureCount;

    /// <inheritdoc/>
    public int CentralAuditWriteFailures
    {
        get { return Volatile.Read(ref _writeFailureCount); }
    }

    /// <inheritdoc/>
    public int AuditRedactionFailure
    {
        get { return Volatile.Read(ref _redactionFailureCount); }
    }

    /// <inheritdoc/>
    public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled
    {
        get
        {
            // Hand out a detached copy so callers never observe later
            // ApplyStalled writes through the returned dictionary.
            var copy = new Dictionary<string, bool>(_stalledBySite);
            return copy;
        }
    }

    /// <summary>
    /// Stores the stalled flag carried by a
    /// <see cref="SiteAuditTelemetryStalledChanged"/> publication under the
    /// event's site id, overwriting any previous value for that site. A
    /// <c>null</c> event is ignored. Readers consume the result through
    /// <see cref="SiteAuditTelemetryStalled"/>.
    /// </summary>
    /// <param name="evt">The event carrying the site ID and new stalled state.</param>
    public void ApplyStalled(SiteAuditTelemetryStalledChanged evt)
    {
        if (evt is not null)
        {
            _stalledBySite[evt.SiteId] = evt.Stalled;
        }
    }

    /// <inheritdoc/>
    void ICentralAuditWriteFailureCounter.Increment()
    {
        Interlocked.Increment(ref _writeFailureCount);
    }

    /// <inheritdoc/>
    void IAuditRedactionFailureCounter.Increment()
    {
        Interlocked.Increment(ref _redactionFailureCount);
    }
}
@@ -0,0 +1,306 @@
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central-side singleton (per Bundle E wiring) that ingests batches of
/// <see cref="AuditEvent"/> rows pushed from sites via the
/// <c>IngestAuditEvents</c> gRPC RPC. Each row is stamped with the central-side
/// <see cref="AuditEvent.IngestedAtUtc"/> and inserted idempotently via
/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
/// silently swallowed (first-write-wins per Bundle A's hardening).
/// </summary>
/// <remarks>
/// <para>
/// Idempotency is the contract: a row that already exists at central counts
/// as "accepted" for the purposes of the reply, because the storage state is
/// consistent and the site is free to flip its local row to <c>Forwarded</c>.
/// </para>
/// <para>
/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing
/// action. The actor wraps each repository call in its own try/catch so a
/// single bad row cannot cause the rest of the batch to be lost — that
/// per-row catch is what keeps this actor alive across handler throws, not
/// the supervisor strategy. The <see cref="SupervisorStrategy"/> override
/// returns the Akka default decider (Restart for most exceptions) and
/// governs children only; this actor has no children today, so the override
/// is a forward-compat placeholder.
/// </para>
/// <para>
/// Two constructors exist for a deliberate reason: Bundle D's tests inject a
/// concrete <see cref="IAuditLogRepository"/> against a per-test MSSQL fixture
/// (the only way to verify the IngestedAtUtc stamp + duplicate-key idempotency
/// end to end), while Bundle E's host wiring registers the actor as a cluster
/// singleton and must therefore resolve the repository — which is a scoped EF
/// Core service — from a fresh DI scope per message, mirroring the Notification
/// Outbox actor's pattern.
/// </para>
/// </remarks>
public class AuditLogIngestActor : ReceiveActor
{
    private readonly IServiceProvider? _serviceProvider;
    private readonly IAuditLogRepository? _injectedRepository;
    private readonly ILogger<AuditLogIngestActor> _logger;

    /// <summary>
    /// Test-mode constructor — injects a concrete repository instance whose
    /// lifetime exceeds the test, so the actor reuses the same instance across
    /// every message. Used by Bundle D's MSSQL-backed TestKit fixture.
    /// </summary>
    /// <param name="repository">Audit log repository instance shared across all messages.</param>
    /// <param name="logger">Logger for ingest diagnostics.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public AuditLogIngestActor(
        IAuditLogRepository repository,
        ILogger<AuditLogIngestActor> logger)
    {
        ArgumentNullException.ThrowIfNull(repository);
        ArgumentNullException.ThrowIfNull(logger);
        _injectedRepository = repository;
        _logger = logger;

        ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);

        // The single-repository test ctor cannot service the M3 dual-write —
        // it has no SiteCalls repo and no DbContext. The handler still
        // registers (so callers don't dead-letter) but replies empty so the
        // test surface stays explicit about what this ctor supports.
        ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryWithoutDualWriteAsync);
    }

    /// <summary>
    /// Production constructor — resolves <see cref="IAuditLogRepository"/> from
    /// a fresh DI scope per message because the repository is a scoped EF Core
    /// service registered by <c>AddConfigurationDatabase</c>. The actor itself
    /// is a long-lived cluster singleton, so it cannot hold a scope across
    /// messages.
    /// </summary>
    /// <param name="serviceProvider">Root service provider used to open a fresh scope per message.</param>
    /// <param name="logger">Logger for ingest diagnostics.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public AuditLogIngestActor(
        IServiceProvider serviceProvider,
        ILogger<AuditLogIngestActor> logger)
    {
        ArgumentNullException.ThrowIfNull(serviceProvider);
        ArgumentNullException.ThrowIfNull(logger);
        _serviceProvider = serviceProvider;
        _logger = logger;

        ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
        ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryAsync);
    }

    /// <inheritdoc />
    protected override SupervisorStrategy SupervisorStrategy()
    {
        // NOTE(review): with maxNrOfRetries: 0 a failing child would presumably
        // be stopped rather than restarted, even though the default decider
        // answers Restart — confirm before this actor ever gains children.
        return new OneForOneStrategy(maxNrOfRetries: 0, withinTimeRange: TimeSpan.Zero, decider:
            Akka.Actor.SupervisorStrategy.DefaultDecider);
    }

    /// <summary>
    /// Handles a site-pushed batch of audit events: persists each row
    /// idempotently and replies to the original sender with the ids that were
    /// accepted. A reply is always sent, even when the DI scope or repository
    /// cannot be resolved — in that case the accepted list is simply whatever
    /// was persisted before the failure (normally empty).
    /// </summary>
    /// <param name="cmd">The batch of audit events to ingest.</param>
    private async Task OnIngestAsync(IngestAuditEventsCommand cmd)
    {
        // Sender is captured before the first await — Akka resets Sender
        // between message dispatches, so a post-await Tell would go to
        // DeadLetters.
        var replyTo = Sender;
        var nowUtc = DateTime.UtcNow;
        var accepted = new List<Guid>(cmd.Events.Count);

        // Batch-level guard, matching OnCachedTelemetryAsync: a failure while
        // opening the scope or resolving services must not escape the handler
        // (which would fail the actor and leave the sender without a reply).
        try
        {
            if (_injectedRepository is not null)
            {
                // Injected-repository mode (Bundle D tests) skips the scope
                // entirely. No filter is passed; IngestWithRepositoryAsync
                // substitutes the SafeDefault filter for a null filter.
                await IngestWithRepositoryAsync(_injectedRepository, filter: null, failureCounter: null, cmd, nowUtc, accepted)
                    .ConfigureAwait(false);
            }
            else
            {
                // One scope (and therefore one set of scoped services) per
                // message, mirroring NotificationOutboxActor.
                // AuditLog-003: use CreateAsyncScope + await using so scoped
                // EF Core services (IAsyncDisposable DbContexts) dispose
                // asynchronously without blocking on sync Dispose() of
                // pending connection cleanup.
                await using var scope = _serviceProvider!.CreateAsyncScope();
                var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();

                // Bundle C (M5-T6): the IAuditPayloadFilter is resolved from
                // the per-message scope so the row is truncated + redacted
                // before InsertIfNotExistsAsync. Optional registration — a
                // null filter falls back to SafeDefault downstream.
                var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();

                // M6 Bundle E (T8): central health counter is best-effort —
                // unregistered (test composition roots) means the per-row catch
                // simply logs without surfacing on the health dashboard.
                var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();

                await IngestWithRepositoryAsync(repository, filter, failureCounter, cmd, nowUtc, accepted)
                    .ConfigureAwait(false);
            }
        }
        catch (Exception ex)
        {
            // Scope creation, service resolution, or scope disposal threw.
            // Rows not listed in `accepted` stay Pending at the site and are
            // retried on its next drain.
            _logger.LogError(
                ex,
                "Audit event batch ingest failed outside per-row processing; replying with {AcceptedCount} of {TotalCount} events accepted.",
                accepted.Count,
                cmd.Events.Count);
        }

        replyTo.Tell(new IngestAuditEventsReply(accepted));
    }

    /// <summary>
    /// Filters, stamps and inserts every event of <paramref name="cmd"/>
    /// through <paramref name="repository"/>, appending the id of each
    /// successfully processed event to <paramref name="accepted"/>. Each row
    /// has its own try/catch, so one failing row never aborts the rest.
    /// </summary>
    /// <param name="repository">Repository used for the idempotent insert.</param>
    /// <param name="filter">Payload filter; <c>null</c> falls back to <c>SafeDefaultAuditPayloadFilter.Instance</c>.</param>
    /// <param name="failureCounter">Optional health counter bumped once per failed row.</param>
    /// <param name="cmd">The batch being ingested.</param>
    /// <param name="nowUtc">Central-side ingest timestamp applied to every row of the batch.</param>
    /// <param name="accepted">Output list receiving the ids of accepted events.</param>
    private async Task IngestWithRepositoryAsync(
        IAuditLogRepository repository,
        IAuditPayloadFilter? filter,
        ICentralAuditWriteFailureCounter? failureCounter,
        IngestAuditEventsCommand cmd,
        DateTime nowUtc,
        List<Guid> accepted)
    {
        foreach (var evt in cmd.Events)
        {
            try
            {
                // Filter BEFORE the IngestedAtUtc stamp so the redacted
                // copy carries the central-side ingest timestamp. Filter
                // is contract-bound to never throw. AuditLog-008: a null
                // filter (test composition root, no IAuditPayloadFilter
                // registered) now falls back to the SafeDefault rather than
                // pass-through, so HTTP header redaction always runs.
                var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
                var filtered = safeFilter.Apply(evt);

                // Stamp IngestedAtUtc here, not at the site. Bundle A's
                // repository hardening already swallows duplicate-key races,
                // so the same id arriving twice (site retry, reconciliation)
                // is a silent no-op.
                var ingested = filtered with { IngestedAtUtc = nowUtc };
                await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
                accepted.Add(evt.EventId);
            }
            catch (Exception ex)
            {
                // Per-row catch — one bad row never sinks the whole batch.
                // The row stays Pending at the site; the next drain retries.
                // M6 Bundle E (T8): bump the central health counter so a
                // sustained insert-throw failure surfaces on the dashboard.
                try { failureCounter?.Increment(); }
                catch { /* counter must never throw — defence in depth */ }

                _logger.LogError(ex,
                    "Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.",
                    evt.EventId);
            }
        }
    }

    /// <summary>
    /// M3 dual-write handler. For every <see cref="CachedTelemetryEntry"/> the
    /// actor opens a fresh MS SQL transaction, inserts the AuditLog row
    /// idempotently AND upserts the SiteCalls row monotonically. Both succeed
    /// or both roll back, so the audit and operational mirrors never drift
    /// mid-row. The IngestedAtUtc stamp is unified between the two rows so a
    /// downstream join lines up cleanly.
    /// </summary>
    /// <remarks>
    /// Per-entry isolation — one entry's failed transaction does NOT abort
    /// other entries in the batch (each gets its own
    /// <see cref="Microsoft.EntityFrameworkCore.RelationalDatabaseFacadeExtensions.BeginTransactionAsync"/>
    /// scope and a try/catch around it). Audit-write failure NEVER aborts the
    /// user-facing action — the site keeps the row Pending and retries on the
    /// next drain.
    /// </remarks>
    /// <param name="cmd">The batch of combined audit + site-call entries.</param>
    private async Task OnCachedTelemetryAsync(IngestCachedTelemetryCommand cmd)
    {
        // Captured before the first await for the same reason as OnIngestAsync.
        var replyTo = Sender;
        var accepted = new List<Guid>(cmd.Entries.Count);

        try
        {
            await using var scope = _serviceProvider!.CreateAsyncScope();
            var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
            var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
            var dbContext = scope.ServiceProvider.GetRequiredService<ScadaBridgeDbContext>();

            // Bundle C (M5-T6): resolve the filter for the whole batch from
            // the scope. The registration is optional; a null result is
            // replaced by the SafeDefault filter inside the per-entry block
            // (AuditLog-008), so redaction always runs. The filter is
            // contract-bound to never throw, so applying it inside the
            // per-entry try does not widen the blast radius.
            var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();

            // M6 Bundle E (T8): same best-effort central health counter as
            // the OnIngestAsync path — null on test composition roots that
            // skip the registration.
            var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();

            foreach (var entry in cmd.Entries)
            {
                try
                {
                    await using var tx = await dbContext.Database
                        .BeginTransactionAsync()
                        .ConfigureAwait(false);

                    // Stamp IngestedAtUtc on both rows from a single
                    // central-side instant so a join on the two tables sees
                    // matching timestamps (debugging convenience, not a
                    // correctness invariant).
                    var ingestedAt = DateTime.UtcNow;

                    // Filter the audit half BEFORE the dual-write — only the
                    // AuditLog row's payload columns are filterable; SiteCalls
                    // carries operational state only (status, retry count) and
                    // is left untouched. AuditLog-008: null filter falls back
                    // to SafeDefault so header redaction always runs.
                    var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
                    var filteredAudit = safeFilter.Apply(entry.Audit);
                    var auditStamped = filteredAudit with { IngestedAtUtc = ingestedAt };
                    var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };

                    await auditRepo.InsertIfNotExistsAsync(auditStamped)
                        .ConfigureAwait(false);
                    await siteCallRepo.UpsertAsync(siteCallStamped)
                        .ConfigureAwait(false);
                    await tx.CommitAsync().ConfigureAwait(false);

                    accepted.Add(entry.Audit.EventId);
                }
                catch (Exception ex)
                {
                    // Both rows rolled back via the disposing transaction. The
                    // EventId is NOT added to `accepted` so the site keeps its
                    // row Pending and retries on the next drain. Other entries
                    // in the batch continue with their own transactions.
                    // NOTE(review): the scoped DbContext is shared by every
                    // entry of the batch; if the repositories stage entities
                    // through the change tracker, a failed entry could
                    // presumably leave state behind that affects later
                    // entries — confirm against the repository implementations.
                    // M6 Bundle E (T8): bump the central health counter so a
                    // sustained dual-write failure surfaces on the dashboard.
                    try { failureCounter?.Increment(); }
                    catch { /* counter must never throw — defence in depth */ }

                    _logger.LogError(
                        ex,
                        "Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.",
                        entry.Audit.EventId,
                        entry.SiteCall.TrackedOperationId);
                }
            }
        }
        catch (Exception ex)
        {
            // Resolving the scope itself threw (e.g. DI mis-wiring). Log and
            // reply with whatever we managed to accept (likely empty) — the
            // central singleton MUST stay alive.
            _logger.LogError(
                ex,
                "Combined telemetry batch ingest failed before per-entry processing.");
        }

        replyTo.Tell(new IngestCachedTelemetryReply(accepted));
    }

    /// <summary>
    /// Fallback handler installed on the single-repository test ctor — that
    /// ctor has no DbContext and no <see cref="ISiteCallAuditRepository"/>, so
    /// it cannot service the dual-write. Logs a warning and replies with an
    /// empty ack so callers fall through to their retry path.
    /// </summary>
    /// <param name="cmd">The batch that cannot be serviced; only its entry count is logged.</param>
    private Task OnCachedTelemetryWithoutDualWriteAsync(IngestCachedTelemetryCommand cmd)
    {
        _logger.LogWarning(
            "AuditLogIngestActor received IngestCachedTelemetryCommand on the single-repository ctor; dual-write requires the IServiceProvider ctor. Replying with empty ack ({Count} entries).",
            cmd.Entries.Count);

        // No await precedes this line, so reading Sender directly is safe.
        Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
        return Task.CompletedTask;
    }
}
@@ -0,0 +1,37 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Tuning knobs for the central
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service (M6-T5).
/// Defaults: once every 24 hours, keep at least one future monthly
/// boundary ahead of <see cref="DateTime.UtcNow"/>.
/// </summary>
/// <remarks>
/// <para>
/// The hosted service drives a daily roll-forward of
/// <c>pf_AuditLog_Month</c>: each tick reads the current max boundary and
/// SPLITs new monthly boundaries until at least
/// <see cref="LookaheadMonths"/> future months are covered. The 1-month
/// default is intentionally conservative — anything less risks an
/// end-of-month race where inserts land in the unbounded tail partition;
/// anything more wastes nothing but represents premature commitment.
/// </para>
/// <para>
/// The 24-hour cadence is the cheapest interval that still guarantees
/// at-most-one missed boundary in steady state (even after a hard failover,
/// the hosted service can recover on its very next tick). Lowering this below
/// an hour would generate more metadata churn than it saves.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceOptions
{
    // 24 h expressed in seconds (86 400).
    private const int DefaultIntervalSeconds = 24 * 60 * 60;

    // One future monthly boundary is kept ahead by default.
    private const int DefaultLookaheadMonths = 1;

    /// <summary>
    /// Number of seconds between two maintenance ticks. Defaults to
    /// 86 400, i.e. one tick every 24 hours.
    /// </summary>
    public int IntervalSeconds { get; set; } = DefaultIntervalSeconds;

    /// <summary>
    /// How many future months <c>pf_AuditLog_Month</c> has to cover once a
    /// tick has completed. Defaults to 1 — for example, in mid-May the
    /// partition for June (the next full month) must already exist.
    /// </summary>
    public int LookaheadMonths { get; set; } = DefaultLookaheadMonths;
}
@@ -0,0 +1,151 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
/// <c>pf_AuditLog_Month</c> forward once a day. Each tick opens a fresh DI
/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> to SPLIT any
/// missing future boundaries — the partition function must always cover at
/// least <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>
/// future months, otherwise inserts past the highest boundary accumulate in
/// a single unbounded tail partition that <c>SwitchOutPartitionAsync</c>
/// cannot purge cleanly.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not an actor.</b> Bundle C's
/// <see cref="AuditLogPurgeActor"/> sits inside the central singleton
/// because it needs supervised lifecycle alongside the rest of the
/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day
/// chore with no cross-actor coordination, so we use the much simpler
/// hosted-service pattern: <c>Task.Run</c> on start, <c>Task.Delay</c>
/// between ticks, cancellation on stop. Reusing
/// <see cref="IPartitionMaintenance"/> from the central node-only DI graph
/// keeps the contract testable without any actor framework involvement.
/// </para>
/// <para>
/// <b>Failure containment.</b> The tick body wraps the maintenance call in
/// a try/catch so a transient SQL Server error never tears down the hosted
/// service — the next tick simply retries. The exception is logged with
/// the original stack trace at <c>Error</c> level; ops surfaces (M6 Bundle
/// E's central health collector) can subscribe to the logger to alert on
/// repeated failures.
/// </para>
/// <para>
/// <b>Startup ordering.</b> A first tick fires immediately at
/// <see cref="StartAsync"/> so a fresh deployment doesn't need to wait
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> for
/// the partition function to come up to spec. This is also what the brief
/// asks for ("Run once on startup").
/// </para>
/// <para>
/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is scoped
/// (alongside the rest of the EF repositories) because the implementation
/// reuses the per-scope <c>ScadaBridgeDbContext</c>. A hosted service is a
/// singleton, so it must open and dispose a scope around each tick — the
/// same pattern <see cref="AuditLogPurgeActor"/> uses.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
{
    // Used when the configured IntervalSeconds is zero or negative; matches
    // the documented 24-hour default cadence.
    private static readonly TimeSpan FallbackInterval = TimeSpan.FromHours(24);

    private readonly IServiceScopeFactory _scopeFactory;
    private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
    private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;
    private CancellationTokenSource? _cts;
    private Task? _loop;

    /// <summary>
    /// Initializes the maintenance service with its required dependencies.
    /// </summary>
    /// <param name="scopeFactory">Scope factory used to open DI scopes for each maintenance run.</param>
    /// <param name="options">Partition maintenance options (tick interval and lookahead months).</param>
    /// <param name="logger">Logger for this service.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public AuditLogPartitionMaintenanceService(
        IServiceScopeFactory scopeFactory,
        IOptions<AuditLogPartitionMaintenanceOptions> options,
        ILogger<AuditLogPartitionMaintenanceService> logger)
    {
        _scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public Task StartAsync(CancellationToken ct)
    {
        // Linked CTS lets StopAsync's cancellation AND the start token both
        // terminate the loop; either side firing aborts the pending
        // Task.Delay.
        _cts = CancellationTokenSource.CreateLinkedTokenSource(ct);

        // Read the token once, up front: the loop must not touch the _cts
        // field later, when it may already have been disposed.
        var loopToken = _cts.Token;
        _loop = Task.Run(() => RunLoopAsync(loopToken));
        return Task.CompletedTask;
    }

    /// <summary>
    /// Background loop: one maintenance pass immediately, then one pass after
    /// every interval until <paramref name="ct"/> is cancelled.
    /// </summary>
    /// <param name="ct">Token that ends the loop.</param>
    private async Task RunLoopAsync(CancellationToken ct)
    {
        // Run once on startup so a fresh deployment isn't gated on the
        // IntervalSeconds initial wait — the brief calls this out explicitly.
        await SafeMaintainAsync(ct).ConfigureAwait(false);

        while (!ct.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(ResolveInterval(), ct).ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                break;
            }

            await SafeMaintainAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Returns the delay between ticks. A non-positive
    /// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> would
    /// either spin the loop without pause (zero) or make <c>Task.Delay</c>
    /// throw and end the loop (negative), so such values are replaced by the
    /// 24-hour fallback and a warning is logged.
    /// </summary>
    private TimeSpan ResolveInterval()
    {
        var seconds = _options.Value.IntervalSeconds;
        if (seconds > 0)
        {
            return TimeSpan.FromSeconds(seconds);
        }

        _logger.LogWarning(
            "AuditLogPartitionMaintenance IntervalSeconds {IntervalSeconds} is not positive; using fallback interval {FallbackInterval}.",
            seconds,
            FallbackInterval);
        return FallbackInterval;
    }

    /// <summary>
    /// Runs one maintenance pass inside its own DI scope and never lets an
    /// exception escape: cancellation caused by shutdown is logged at Debug
    /// level, everything else at Error level.
    /// </summary>
    /// <param name="ct">Token forwarded to the maintenance call.</param>
    private async Task SafeMaintainAsync(CancellationToken ct)
    {
        try
        {
            await using var scope = _scopeFactory.CreateAsyncScope();
            var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
            var added = await maintenance
                .EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
                .ConfigureAwait(false);

            if (added.Count > 0)
            {
                // Invariant culture keeps the logged dates stable regardless
                // of the host's regional settings / calendar.
                _logger.LogInformation(
                    "AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
                    added.Count,
                    string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture))));
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The tick was interrupted by shutdown — expected, not a failure,
            // so it must not show up as an Error in the logs.
            _logger.LogDebug("AuditLogPartitionMaintenance tick cancelled by shutdown.");
        }
        catch (Exception ex)
        {
            // Catch-all is deliberate: the hosted service must survive every
            // class of tick failure (transient SQL, DI resolution, etc.) so
            // the next tick gets a chance. The brief's contract is
            // "exception logged, not propagated".
            _logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
        }
    }

    /// <inheritdoc />
    public async Task StopAsync(CancellationToken ct)
    {
        var loop = _loop;
        if (loop is null)
        {
            // StartAsync was never called; nothing to stop.
            return;
        }

        try
        {
            _cts?.Cancel();
        }
        finally
        {
            // Wait for the loop to finish, but give up when the host's
            // shutdown token fires so a tick stuck in SQL cannot block
            // shutdown past its timeout. WhenAny never throws here.
            await Task.WhenAny(loop, Task.Delay(Timeout.Infinite, ct)).ConfigureAwait(false);
        }
    }

    /// <inheritdoc />
    public void Dispose()
    {
        // Swap the field out first so a second Dispose (or a late StopAsync)
        // never touches an already-disposed source.
        var cts = Interlocked.Exchange(ref _cts, null);
        if (cts is null)
        {
            return;
        }

        try
        {
            // Make sure the loop ends even if StopAsync was never called.
            cts.Cancel();
        }
        finally
        {
            cts.Dispose();
        }
    }
}
@@ -0,0 +1,213 @@
using System.Diagnostics;
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central singleton (M6 Bundle C) that drives the daily AuditLog partition
/// purge. On a configurable timer (default 24 hours) the actor:
/// <list type="number">
/// <item>Queries <see cref="IAuditLogRepository.GetPartitionBoundariesOlderThanAsync"/>
/// for monthly boundaries whose latest <c>OccurredAtUtc</c> is older
/// than <c>DateTime.UtcNow - RetentionDays</c>.</item>
/// <item>For each eligible boundary, calls
/// <see cref="IAuditLogRepository.SwitchOutPartitionAsync"/> which runs
/// the drop-and-rebuild dance around <c>UX_AuditLog_EventId</c>.</item>
/// <item>Publishes <see cref="AuditLogPurgedEvent"/> on the actor-system
/// EventStream so the Bundle E central health collector + ops surfaces
/// can subscribe without coupling to this actor.</item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// <b>Daily cadence.</b> Partition switch is metadata-only but the
/// drop-and-rebuild dance briefly removes <c>UX_AuditLog_EventId</c>; running
/// more often than necessary trades unique-index rebuild outages for
/// negligible freshness wins. The default 24-hour interval matches
/// alog.md §10's retention policy.
/// </para>
/// <para>
/// <b>Continue-on-error.</b> A single boundary that throws (transient SQL
/// failure, contention with backup, missing object) must NOT prevent the
/// other eligible boundaries from being purged on the same tick. Per-boundary
/// work runs inside its own try/catch — that per-boundary catch is what
/// keeps the singleton alive across handler throws. The
/// <see cref="SupervisorStrategy"/> override returns the Akka default
/// decider (Restart) and governs children only; this actor has no children
/// today, so the override is a forward-compat placeholder.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
/// service registered by <c>AddConfigurationDatabase</c>. The singleton
/// opens one DI scope per tick and reuses the same repository across every
/// boundary in that tick — mirrors the
/// <see cref="SiteAuditReconciliationActor"/> pattern.
/// </para>
/// <para>
/// <b>EventStream.</b> Publishing <see cref="AuditLogPurgedEvent"/> through
/// the EventStream rather than direct messaging avoids coupling this actor
/// to its consumers; M6 Bundle E will subscribe a central health-counter
/// bridge that surfaces purge progress on the central health report.
/// </para>
/// </remarks>
public class AuditLogPurgeActor : ReceiveActor
{
    private readonly IServiceProvider _services;
    private readonly AuditLogPurgeOptions _purgeOptions;
    private readonly AuditLogOptions _auditOptions;
    private readonly ILogger<AuditLogPurgeActor> _logger;
    private ICancelable? _timer;

    /// <summary>
    /// Creates the purge actor, captures the current option values and wires
    /// the <see cref="PurgeTick"/> handler.
    /// </summary>
    /// <param name="services">DI service provider used to create scoped repository instances per tick.</param>
    /// <param name="purgeOptions">Options controlling the purge interval.</param>
    /// <param name="auditOptions">Options controlling retention policy (RetentionDays).</param>
    /// <param name="logger">Logger instance.</param>
    public AuditLogPurgeActor(
        IServiceProvider services,
        IOptions<AuditLogPurgeOptions> purgeOptions,
        IOptions<AuditLogOptions> auditOptions,
        ILogger<AuditLogPurgeActor> logger)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(purgeOptions);
        ArgumentNullException.ThrowIfNull(auditOptions);
        ArgumentNullException.ThrowIfNull(logger);

        // Option values are read once, here; the actor keeps these instances
        // for its whole lifetime.
        (_services, _logger) = (services, logger);
        _purgeOptions = purgeOptions.Value;
        _auditOptions = auditOptions.Value;

        ReceiveAsync<PurgeTick>(_ => OnTickAsync());
    }

    /// <inheritdoc />
    protected override void PreStart()
    {
        base.PreStart();

        // The first tick fires one full period after start, then repeats at
        // the same period; the actor is both receiver and sender.
        var period = _purgeOptions.Interval;
        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
            period,
            period,
            Self,
            PurgeTick.Instance,
            Self);
    }

    /// <inheritdoc />
    protected override void PostStop()
    {
        _timer?.Cancel();
        base.PostStop();
    }

    /// <inheritdoc />
    protected override SupervisorStrategy SupervisorStrategy() =>
        new OneForOneStrategy(
            maxNrOfRetries: 0,
            withinTimeRange: TimeSpan.Zero,
            decider: Akka.Actor.SupervisorStrategy.DefaultDecider);

    /// <summary>
    /// One purge pass: lists the partition boundaries older than the retention
    /// threshold and switches each of them out in turn. Failing to resolve the
    /// repository or to list the boundaries ends the pass (logged); failing on
    /// one boundary only skips that boundary.
    /// </summary>
    private async Task OnTickAsync()
    {
        // The EventStream is fetched BEFORE the first await. Context (and so
        // Context.System) must not be touched afterwards: once a continuation
        // runs on a thread that is not dispatching this actor,
        // ActorBase.Context throws "no active ActorContext". Same idea as the
        // Sender capture in AuditLogIngestActor.OnIngestAsync.
        var stream = Context.System.EventStream;

        // The cutoff is recomputed on every tick from the current clock and
        // the RetentionDays of the options instance captured in the
        // constructor.
        var cutoffUtc = DateTime.UtcNow - TimeSpan.FromDays(_auditOptions.RetentionDays);

        // AuditLog-003: CreateAsyncScope + await using lets scoped EF Core
        // services (IAsyncDisposable DbContexts) dispose asynchronously
        // instead of blocking in a synchronous Dispose().
        await using var tickScope = _services.CreateAsyncScope();

        IAuditLogRepository repo;
        try
        {
            repo = tickScope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick.");
            return;
        }

        IReadOnlyList<DateTime> eligible;
        try
        {
            eligible = await repo
                .GetPartitionBoundariesOlderThanAsync(cutoffUtc)
                .ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogError(
                ex,
                "Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.",
                cutoffUtc);
            return;
        }

        if (eligible.Count > 0)
        {
            foreach (var monthBoundary in eligible)
            {
                await PurgeBoundaryAsync(repo, stream, monthBoundary).ConfigureAwait(false);
            }
        }
    }

    /// <summary>
    /// Switches out a single partition, publishes
    /// <see cref="AuditLogPurgedEvent"/> and logs the outcome. Every exception
    /// is caught and logged here so that one bad partition (transient SQL
    /// failure, missing object, contention with backup) does not abandon the
    /// remaining boundaries of the tick.
    /// </summary>
    /// <param name="repo">Repository resolved for the current tick.</param>
    /// <param name="stream">EventStream captured before the first await.</param>
    /// <param name="monthBoundary">Partition boundary to purge.</param>
    private async Task PurgeBoundaryAsync(
        IAuditLogRepository repo,
        Akka.Event.EventStream stream,
        DateTime monthBoundary)
    {
        var timer = Stopwatch.StartNew();
        try
        {
            var purgedRows = await repo
                .SwitchOutPartitionAsync(monthBoundary)
                .ConfigureAwait(false);
            timer.Stop();

            // The stopwatch is stopped, so the event and the log line report
            // the same duration.
            var elapsedMs = timer.ElapsedMilliseconds;
            stream.Publish(new AuditLogPurgedEvent(monthBoundary, purgedRows, elapsedMs));

            _logger.LogInformation(
                "Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.",
                monthBoundary,
                purgedRows,
                elapsedMs);
        }
        catch (Exception ex)
        {
            timer.Stop();
            _logger.LogError(
                ex,
                "Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.",
                monthBoundary,
                timer.ElapsedMilliseconds);
        }
    }

    /// <summary>Message the actor sends to itself to start a purge pass over all eligible partitions.</summary>
    internal sealed class PurgeTick
    {
        private PurgeTick() { }

        public static readonly PurgeTick Instance = new();
    }
}
@@ -0,0 +1,43 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Scheduling options for the central <see cref="AuditLogPurgeActor"/>
/// singleton. Only the tick cadence lives here (24 hours by default, per the
/// M6 plan); the retention window itself is read from
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.RetentionDays"/>
/// (default 365) so operators tune retention from a single config section.
/// </summary>
/// <remarks>
/// <para>
/// The purge runs as a daily-cadence singleton rather than a hot loop. The
/// partition switch is metadata-only I/O, but the drop-and-rebuild dance
/// briefly removes the <c>UX_AuditLog_EventId</c> unique index, so ticking
/// more often than necessary trades index-rebuild outages for marginal
/// freshness gains. Shorten the interval only when an operator can prove a
/// need for sub-daily purge granularity.
/// </para>
/// <para>
/// <see cref="IntervalOverride"/> lets tests shrink the cadence to
/// milliseconds without polluting the production config surface; production
/// binds <see cref="IntervalHours"/> only.
/// </para>
/// </remarks>
public sealed class AuditLogPurgeOptions
{
    /// <summary>Purge tick period in whole hours. Defaults to 24.</summary>
    public int IntervalHours { get; set; } = 24;

    /// <summary>
    /// Test-only escape hatch for cadences finer than whole hours. A non-null
    /// value wins over <see cref="IntervalHours"/>. It is not bound from
    /// config; production config exposes <see cref="IntervalHours"/> only.
    /// </summary>
    public TimeSpan? IntervalOverride { get; set; }

    /// <summary>
    /// The effective tick interval: <see cref="IntervalOverride"/> when it is
    /// set, otherwise <see cref="IntervalHours"/> converted to a
    /// <see cref="TimeSpan"/>.
    /// </summary>
    public TimeSpan Interval
    {
        get
        {
            if (IntervalOverride is { } overridden)
            {
                return overridden;
            }

            return TimeSpan.FromHours(IntervalHours);
        }
    }
}
@@ -0,0 +1,29 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Published on the actor-system EventStream by <see cref="AuditLogPurgeActor"/>
/// after each successful partition switch-out. Downstream consumers (Bundle E
/// central health collector, ops dashboards, audit trails) subscribe so a
/// purge action is observable without the actor needing to know about any
/// specific subscriber.
/// </summary>
/// <remarks>
/// One event is published per purged boundary. When the switch-out for a
/// boundary throws, the purge actor logs the failure and continues with the
/// next boundary WITHOUT publishing an event, so subscribers only ever
/// observe successful purges; failed partitions are visible in the log only.
/// </remarks>
/// <param name="MonthBoundary">
/// The pf_AuditLog_Month lower-bound boundary that was switched out — i.e.
/// the first instant of the purged month in UTC.
/// </param>
/// <param name="RowsDeleted">
/// Approximate row count purged from the partition, sampled BEFORE the
/// switch. Exact accounting would require a post-switch scan of the staging
/// table, which the dance drops immediately, so this is the closest
/// observable proxy. Zero is a valid value when the actor's enumerator
/// included a partition the operator subsequently emptied by hand. The purge
/// actor passes through the value returned by <c>SwitchOutPartitionAsync</c>
/// unchanged.
/// </param>
/// <param name="DurationMs">
/// Wall-clock time spent inside <c>SwitchOutPartitionAsync</c> for this
/// boundary, in milliseconds. Useful for spotting the rare slow purge
/// without spinning up dedicated telemetry. Measured with a stopwatch that
/// is started immediately before the call and stopped as soon as it returns.
/// </param>
public sealed record AuditLogPurgedEvent(
DateTime MonthBoundary,
long RowsDeleted,
long DurationMs);
@@ -0,0 +1,61 @@
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T9) — central-side adapter that forwards
/// <see cref="IAuditRedactionFailureCounter"/> increments into
/// <see cref="AuditCentralHealthSnapshot"/>. The counter is bumped by
/// <see cref="DefaultAuditPayloadFilter"/> whenever a header / body / SQL
/// parameter redactor stage throws and the filter has to over-redact the
/// offending field; routing it here makes the failure visible on the central
/// health surface as <c>AuditCentralHealthSnapshot.AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Site vs central.</b> M5 Bundle C wired the SITE-side bridge
/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>),
/// which feeds the site health report payload's <c>AuditRedactionFailure</c>
/// field and so covers redactor failures on the site SQLite hot-path
/// (FallbackAuditWriter). This class is the MIRROR of that bridge: when the
/// same payload filter runs on the central <see cref="CentralAuditWriter"/> /
/// <see cref="AuditLogIngestActor"/> paths, its failures land on the central
/// dashboard instead of disappearing into a NoOp.
/// </para>
/// <para>
/// <b>Registration shape.</b> Site composition roots call
/// <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>,
/// which overrides the binding with the site bridge. Central composition
/// roots call <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>,
/// which overrides it with this central bridge. Site and central are distinct
/// host roles — a node never wears both hats — so the two bridges never
/// compete for the same binding at runtime.
/// </para>
/// <para>
/// <b>Why a dedicated class instead of binding the snapshot directly?</b> The
/// snapshot already implements <see cref="IAuditRedactionFailureCounter"/>
/// and <i>could</i> be the bound implementation, but a separate type makes
/// the central-vs-site asymmetry explicit at the DI boundary: readers of
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>
/// see "site → site bridge, central → central bridge", matching the
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>
/// shape one-for-one.
/// </para>
/// </remarks>
public sealed class CentralAuditRedactionFailureCounter : IAuditRedactionFailureCounter
{
    private readonly AuditCentralHealthSnapshot _snapshot;

    /// <summary>
    /// Creates a counter that forwards every increment to <paramref name="snapshot"/>.
    /// </summary>
    /// <param name="snapshot">The central health snapshot that accumulates the redaction failure count.</param>
    /// <exception cref="ArgumentNullException"><paramref name="snapshot"/> is <c>null</c>.</exception>
    public CentralAuditRedactionFailureCounter(AuditCentralHealthSnapshot snapshot)
    {
        ArgumentNullException.ThrowIfNull(snapshot);
        _snapshot = snapshot;
    }

    /// <inheritdoc/>
    public void Increment()
    {
        // Go through the interface view of the snapshot: that is the member
        // that feeds the redaction-failure counter.
        var sink = (IAuditRedactionFailureCounter)_snapshot;
        sink.Increment();
    }
}
@@ -0,0 +1,159 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central-only direct-write implementation of <see cref="ICentralAuditWriter"/>.
/// It is a best-effort wrapper over
/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> for components that
/// originate audit events ON the central node (Notification Outbox dispatch,
/// Inbound API). It is NOT the site telemetry ingest path — that is the
/// SiteAudit → AuditLogIngestActor batched flow.
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort contract.</b> Audit-write failures NEVER abort the
/// user-facing action (alog.md §13). Every exception raised while filtering,
/// resolving the repository, or inserting is caught, counted, logged at
/// warning, and swallowed, so <see cref="WriteAsync"/> always completes
/// successfully. Callers may still wrap the call defensively.
/// </para>
/// <para>
/// <b>Scope-per-call resolution.</b> <see cref="IAuditLogRepository"/> is a
/// SCOPED EF Core service (registered by
/// <c>ZB.MOM.WW.ScadaBridge.ConfigurationDatabase</c>) while this writer is a
/// singleton shared by all callers, so it cannot hold a scope across calls.
/// Each <see cref="WriteAsync"/> invocation opens and disposes its own
/// <see cref="IServiceScope"/>, mirroring the per-message scope pattern used by
/// <c>AuditLogIngestActor</c> and <c>NotificationOutboxActor</c>.
/// </para>
/// <para>
/// <b>Idempotency.</b> Persistence goes through <c>InsertIfNotExistsAsync</c>,
/// so a double-emitted event (same <see cref="AuditEvent.EventId"/>) is a
/// silent no-op — the writer is safe to call from any number of dispatch
/// paths.
/// </para>
/// </remarks>
public sealed class CentralAuditWriter : ICentralAuditWriter
{
    private readonly IServiceProvider _services;
    private readonly ILogger<CentralAuditWriter> _logger;
    private readonly IAuditPayloadFilter _filter;
    private readonly ICentralAuditWriteFailureCounter _failureCounter;
    private readonly INodeIdentityProvider? _nodeIdentity;

    /// <summary>
    /// Creates the writer. Only <paramref name="services"/> and
    /// <paramref name="logger"/> are mandatory; the remaining collaborators are
    /// optional so that older (M4) test composition roots which do not supply
    /// them keep working.
    /// <list type="bullet">
    /// <item><description>
    /// Bundle C (M5-T6): the payload filter truncates + redacts before the row
    /// hits MS SQL. Production DI registers the real filter via
    /// <see cref="ServiceCollectionExtensions.AddAuditLog"/>.
    /// </description></item>
    /// <item><description>
    /// M6 Bundle E (T8): the failure counter surfaces swallowed write failures
    /// on the central health surface as <c>CentralAuditWriteFailures</c>.
    /// </description></item>
    /// <item><description>
    /// SourceNode-stamping (Task 12): the node identity provider lets
    /// central-origin rows carry the writing central node's identifier when
    /// the caller has not already supplied one. The caller's value always
    /// wins; with no provider, SourceNode stays at whatever the caller set
    /// (often null, which is the legacy behaviour).
    /// </description></item>
    /// </list>
    /// </summary>
    /// <param name="services">Service provider used to open a per-call scope for the scoped repository.</param>
    /// <param name="logger">Logger for swallowed write-failure diagnostics.</param>
    /// <param name="filter">
    /// Optional payload filter for truncation and redaction. When omitted the
    /// writer falls back to <c>SafeDefaultAuditPayloadFilter.Instance</c>, which
    /// still applies HTTP header redaction — it is not a pass-through.
    /// </param>
    /// <param name="failureCounter">Optional counter incremented on swallowed write failures; defaults to a no-op.</param>
    /// <param name="nodeIdentity">Optional node identity provider for stamping <c>SourceNode</c> on central-origin rows.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    public CentralAuditWriter(
        IServiceProvider services,
        ILogger<CentralAuditWriter> logger,
        IAuditPayloadFilter? filter = null,
        ICentralAuditWriteFailureCounter? failureCounter = null,
        INodeIdentityProvider? nodeIdentity = null)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(logger);

        _services = services;
        _logger = logger;
        _nodeIdentity = nodeIdentity;

        // AuditLog-008: a missing filter must over-redact, never under-redact.
        // SafeDefaultAuditPayloadFilter redacts the hard-coded sensitive HTTP
        // headers (Authorization / X-Api-Key / Cookie / Set-Cookie) so even a
        // composition root that forgets the real filter scrubs them before
        // persistence.
        filter ??= Payload.SafeDefaultAuditPayloadFilter.Instance;
        _filter = filter;

        failureCounter ??= new NoOpCentralAuditWriteFailureCounter();
        _failureCounter = failureCounter;
    }

    /// <inheritdoc />
    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
    {
        // A null event is a caller bug and yields no meaningful audit row;
        // log it and bail out rather than throwing.
        if (evt is null)
        {
            _logger.LogWarning("CentralAuditWriter.WriteAsync received null event; ignoring.");
            return;
        }

        try
        {
            // Filter first, before any stamping or persistence. AuditLog-008
            // guarantees _filter is non-null, so header redaction always runs.
            var candidate = _filter.Apply(evt);

            // SourceNode-stamping (Task 12): a caller-supplied SourceNode
            // wins. Only when it is absent AND a provider with a non-null
            // node name is wired do we stamp the local node's name.
            if (candidate.SourceNode is null && _nodeIdentity is { NodeName: { } localNode })
            {
                candidate = candidate with { SourceNode = localNode };
            }

            // The repository is scoped, so resolve it from a scope that lives
            // exactly as long as this one write.
            await using var scope = _services.CreateAsyncScope();
            var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();

            await repository
                .InsertIfNotExistsAsync(candidate with { IngestedAtUtc = DateTime.UtcNow }, ct)
                .ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Audit failure NEVER aborts the user-facing action. M6 Bundle E
            // (T8): count it so a sustained outage shows on the health
            // dashboard instead of only in the log file.
            RecordFailure();

            // EventId, Kind and Status are untouched by the `with` clones
            // above (those only set SourceNode and IngestedAtUtc), so logging
            // them from the input `evt` is intentional. If a field that the
            // stamp chain DOES mutate (e.g. SourceNode) is ever logged here,
            // read it from the latest post-stamp record instead of `evt`.
            _logger.LogWarning(
                ex,
                "CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})",
                evt.EventId, evt.Kind, evt.Status);
        }
    }

    /// <summary>
    /// Bumps the central write-failure counter, shielding the caller from a
    /// misbehaving counter implementation.
    /// </summary>
    private void RecordFailure()
    {
        try
        {
            _failureCounter.Increment();
        }
        catch
        {
            // Defence in depth: a counter must never throw, but if a custom
            // one does, swallowing here keeps the best-effort contract intact.
        }
    }
}
@@ -0,0 +1,62 @@
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E read-side surface exposing the central-side
/// audit-health counters: <see cref="CentralAuditWriteFailures"/> (every
/// swallowed audit-write failure from <see cref="CentralAuditWriter"/> /
/// <see cref="AuditLogIngestActor"/>), <see cref="AuditRedactionFailure"/>
/// (every payload-filter redactor throw on the central path), and
/// <see cref="SiteAuditTelemetryStalled"/> (per-site latched state from the
/// <see cref="SiteAuditTelemetryStalledTracker"/>).
/// </summary>
/// <remarks>
/// <para>
/// <b>Read-only contract.</b> Implementations expose a point-in-time snapshot
/// — increments and tracker updates happen through the dedicated counter /
/// tracker interfaces, not through this surface. Consumers (M7+ central
/// health pages) read these properties; they never mutate.
/// </para>
/// <para>
/// <b>Why a parallel surface from <see cref="ICentralHealthAggregator"/>.</b>
/// <see cref="ICentralHealthAggregator"/> aggregates per-site
/// <c>SiteHealthState</c> reports the SITE emits. The central audit-write
/// failure / redaction-failure counters originate ON central (no site report
/// carries them), so they live on a dedicated snapshot rather than being
/// retro-fitted into a per-site state. The two surfaces will be composed at
/// the M7 dashboard layer.
/// </para>
/// </remarks>
public interface IAuditCentralHealthSnapshot
{
/// <summary>
/// Count of central-side audit-write failures since process start.
/// <see cref="CentralAuditWriter"/> increments the backing counter once for
/// every exception it swallows in its write path — payload filtering,
/// scope / repository resolution, or the repository insert itself — so the
/// value is a count of lost audit writes, not strictly of insert throws.
/// NOTE(review): confirm <see cref="AuditLogIngestActor"/> increments under
/// the same conditions.
/// </summary>
int CentralAuditWriteFailures { get; }
/// <summary>
/// Count of central-side payload-filter redactor over-redactions since
/// process start. Incremented by every header / body / SQL-parameter
/// redactor stage that throws (the filter falls back to the
/// <c>&lt;redacted: redactor error&gt;</c> marker and never aborts the
/// user-facing action). Sites have their own counter
/// (<see cref="IAuditRedactionFailureCounter"/>-backed
/// <c>SiteHealthReport.AuditRedactionFailure</c>) and the central
/// composition root's binding routes ALL central redactor throws
/// (CentralAuditWriter + AuditLogIngestActor paths) into this counter.
/// </summary>
int AuditRedactionFailure { get; }
/// <summary>
/// Per-site latched stalled state, keyed by site identifier: <c>true</c>
/// when the <see cref="SiteAuditReconciliationActor"/> has observed two
/// consecutive non-draining cycles for that site, <c>false</c> after the
/// first draining cycle. Sites absent from the map are interpreted as
/// healthy (<c>Stalled=false</c> default). Snapshot is a defensive
/// copy — readers must not mutate.
/// </summary>
IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled { get; }
}
@@ -0,0 +1,23 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T8) counter sink invoked by central-side audit
/// writers (<see cref="CentralAuditWriter"/>, <see cref="AuditLogIngestActor"/>)
/// every time an audit write fails and is swallowed. In
/// <see cref="CentralAuditWriter"/> that covers any exception raised in the
/// write path — payload filtering, scope / repository resolution, or the
/// repository <c>InsertIfNotExistsAsync</c> call itself — not only an insert
/// throw. Mirrors the
/// site-side <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.IAuditWriteFailureCounter"/>
/// shape one-for-one — same one-method contract, same NoOp default, same
/// must-never-abort-the-user-facing-action invariant.
/// </summary>
/// <remarks>
/// Audit-write failures NEVER abort the user-facing action (alog.md §13) —
/// the writer swallows the exception and surfaces the failure via this counter
/// instead. A NoOp default is the correct safe fallback while the central
/// health surface is being wired in; <see cref="AuditCentralHealthSnapshot"/>
/// is the production binding that routes increments into the aggregated
/// central health snapshot consumed by future M7+ pages.
/// NOTE(review): confirm <see cref="AuditLogIngestActor"/> increments under
/// the same conditions as <see cref="CentralAuditWriter"/>.
/// </remarks>
public interface ICentralAuditWriteFailureCounter
{
/// <summary>
/// Increment the central audit-write failure counter by one.
/// Implementations should not throw; <see cref="CentralAuditWriter"/>
/// nevertheless wraps the call in a catch-all so a misbehaving counter
/// cannot break the best-effort write contract.
/// </summary>
void Increment();
}
@@ -0,0 +1,49 @@
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Mockable abstraction over the central-side <c>PullAuditEvents</c> gRPC
/// client surface that <see cref="SiteAuditReconciliationActor"/> uses to
/// fetch the next reconciliation batch from a specific site. Extracted so the
/// actor can be unit-tested against an in-memory stub without standing up a
/// real <c>GrpcChannel</c> per site.
/// </summary>
/// <remarks>
/// <para>
/// The production implementation (host wiring task) wraps the auto-generated
/// <c>SiteStreamService.SiteStreamServiceClient</c>, multiplexing one
/// <c>GrpcChannel</c> per site keyed on
/// <see cref="SiteEntry.GrpcEndpoint"/>. Until that wiring lands the DI
/// composition root binds a NoOp default that returns an empty response — the
/// reconciliation tick is still scheduled and the cursor logic still runs, so
/// regressions in the actor itself are caught even before the real client
/// arrives.
/// </para>
/// <para>
/// Implementations MUST NOT throw on transport faults that the actor can
/// tolerate (connection refused, deadline exceeded). The actor's contract is
/// "one site's failure doesn't sink the rest of the tick"; an exception still
/// won't crash the actor (the per-site try/catch catches it and logs a
/// warning), but returning an empty response on a known-recoverable error
/// keeps the logs cleaner.
/// </para>
/// </remarks>
public interface IPullAuditEventsClient
{
/// <summary>
/// Issues a <c>PullAuditEvents</c> RPC against the site whose endpoint
/// is registered against <paramref name="siteId"/>. Returns the next
/// batch of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
/// rows ordered oldest-first AND a <c>MoreAvailable</c> flag the actor
/// uses to decide whether to fire another pull immediately.
/// </summary>
/// <param name="siteId">The identifier of the site to pull audit events from.</param>
/// <param name="sinceUtc">
/// Only events with an <c>OccurredAtUtc</c> at or after this cursor time are returned.
/// The reconciliation actor passes <see cref="DateTime.MinValue"/> for a site it has
/// no cursor for yet, so implementations must accept that value.
/// </param>
/// <param name="batchSize">
/// Maximum number of events to return per call. The reconciliation actor
/// supplies its configured <c>BatchSize</c> option.
/// </param>
/// <param name="ct">
/// Cancellation token. The reconciliation actor currently passes
/// <see cref="CancellationToken.None"/>, so implementations should not rely
/// on caller cancellation to bound a slow RPC.
/// </param>
/// <returns>
/// The pull response; the actor iterates its <c>Events</c> collection and
/// writes each row to the central repository.
/// </returns>
Task<PullAuditEventsResponse> PullAsync(
string siteId,
DateTime sinceUtc,
int batchSize,
CancellationToken ct);
}
@@ -0,0 +1,35 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Enumeration surface consumed by <see cref="SiteAuditReconciliationActor"/> to
/// discover which sites to poll on each reconciliation tick. Extracted so the
/// actor can be unit-tested against a static list without depending on the
/// production <c>ISiteRepository</c> + EF Core DbContext.
/// </summary>
/// <remarks>
/// The production implementation wraps <c>ISiteRepository.GetAllSitesAsync</c>
/// and projects each <c>Site</c> to a <see cref="SiteEntry"/> using the
/// site's configured <c>GrpcNodeAAddress</c> (falling back to
/// <c>GrpcNodeBAddress</c> when NodeA is unset). Sites with NO gRPC address
/// configured are silently skipped — the reconciliation pull cannot reach
/// them, but absence of an address is a configuration decision, not a runtime
/// error.
/// </remarks>
public interface ISiteEnumerator
{
/// <summary>
/// Returns the current set of sites the reconciliation puller should visit
/// on the next tick. Implementations should reflect adds/removes promptly
/// — the actor calls this once per tick.
/// </summary>
/// <param name="ct">
/// Cancellation token for the async enumeration. The reconciliation actor
/// calls this method without a token, so the default applies there.
/// </param>
/// <returns>
/// The sites to reconcile. An empty list makes the actor skip the tick
/// without opening a DI scope. If the call throws, the actor logs the error
/// and skips the whole tick, so no site is polled until the next one.
/// </returns>
Task<IReadOnlyList<SiteEntry>> EnumerateAsync(CancellationToken ct = default);
}
/// <summary>
/// One reconciliation target: the site identifier the actor uses as the
/// cursor key and the gRPC endpoint <see cref="IPullAuditEventsClient"/> dials
/// to issue the pull. Endpoint is the base address — scheme plus authority,
/// with no path (e.g. <c>http://siteA:8083</c>); transport selection (TLS,
/// keepalive, etc.) is the client's concern.
/// </summary>
/// <remarks>
/// NOTE(review): the reconciliation actor passes only <see cref="SiteId"/> to
/// <see cref="IPullAuditEventsClient.PullAsync"/>, so the client presumably
/// resolves <see cref="GrpcEndpoint"/> for that site on its own — confirm
/// against the production client wiring.
/// </remarks>
/// <param name="SiteId">Site identifier; keys the actor's per-site reconciliation cursor.</param>
/// <param name="GrpcEndpoint">Base address of the site's gRPC endpoint.</param>
public sealed record SiteEntry(string SiteId, string GrpcEndpoint);
@@ -0,0 +1,17 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Fallback <see cref="ICentralAuditWriteFailureCounter"/> used when the
/// central health surface (<see cref="AuditCentralHealthSnapshot"/>) has not
/// been wired — for example test composition roots, or site-only hosts that
/// incidentally resolve a <see cref="CentralAuditWriter"/>. Every increment is
/// discarded. Mirrors
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.NoOpAuditWriteFailureCounter"/>.
/// </summary>
public sealed class NoOpCentralAuditWriteFailureCounter : ICentralAuditWriteFailureCounter
{
    /// <inheritdoc/>
    public void Increment() { /* deliberately does nothing */ }
}
@@ -0,0 +1,387 @@
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation
/// pull loop. On a configurable timer (default 5 minutes) the actor walks every
/// known site, asks the site for any <see cref="AuditEvent"/> rows with
/// <see cref="AuditEvent.OccurredAtUtc"/> &gt;= the site's last reconciled
/// cursor, ingests them idempotently into the central
/// <see cref="IAuditLogRepository"/>, and advances the cursor.
/// </summary>
/// <remarks>
/// <para>
/// <b>Self-healing telemetry, not a dispatcher.</b> The push path
/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor"/> +
/// <c>IngestAuditEvents</c>) is the primary mechanism. This actor exists so a
/// missed push (gRPC blip, central restart, site offline) is eventually
/// repaired by central re-pulling whatever the site still has in
/// <c>Pending</c>/<c>Forwarded</c> state. Idempotency on
/// <see cref="AuditEvent.EventId"/> (M2 Bundle A's race-fix) makes duplicate
/// arrivals from both paths a silent no-op.
/// </para>
/// <para>
/// <b>Cursor lifetime.</b> The per-site <c>LastReconciledAt</c> watermark is
/// kept in-memory for the actor's lifetime. The cluster singleton normally
/// lives as long as the host process; on a deliberate failover OR a singleton
/// restart the cursors reset to <see cref="DateTime.MinValue"/>. That is conservative
/// but correct — the next tick simply asks for everything the site still has,
/// and idempotent ingest swallows the dupes. Persisting cursors to MS SQL was
/// considered and rejected for M6: the cost of a write per tick outweighs the
/// rare benefit of avoiding one over-broad pull after a restart.
/// </para>
/// <para>
/// <b>Stalled detection.</b> The brief calls a site "stalled" when two
/// consecutive pull cycles BOTH return non-empty AND <c>MoreAvailable=true</c>
/// — i.e. the backlog isn't draining. The actor publishes
/// <see cref="SiteAuditTelemetryStalledChanged"/> on the actor system's
/// EventStream so a future <c>ICentralHealthCollector</c> bridge (M6 Bundle E)
/// can flip the health metric without coupling this actor to the health
/// collection surface today.
/// </para>
/// <para>
/// <b>Failure isolation.</b> A single site that throws (DNS, transport,
/// repository write) must NOT prevent other sites from being polled on the
/// same tick. The per-site work runs inside its own try/catch — that
/// per-site catch is what keeps the actor running across handler throws.
/// The <see cref="SupervisorStrategy"/> override returns
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart
/// semantics) and governs children only; this actor has no children today,
/// so the override is a forward-compat placeholder. If it ever did fire,
/// restart would reset the in-memory cursors — but as noted above that's
/// a safe (over-pull, idempotent) recovery.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
/// service registered by <c>AddConfigurationDatabase</c>. The singleton actor
/// opens one DI scope per tick and reuses the same repository across all
/// sites in that tick — one DbContext per tick mirrors the
/// <c>AuditLogIngestActor</c> + <c>NotificationOutboxActor</c> pattern.
/// </para>
/// </remarks>
public class SiteAuditReconciliationActor : ReceiveActor
{
private readonly ISiteEnumerator _sites;
private readonly IPullAuditEventsClient _client;
private readonly IServiceProvider _services;
private readonly SiteAuditReconciliationOptions _options;
private readonly ILogger<SiteAuditReconciliationActor> _logger;
/// <summary>
/// Per-site reconciliation watermark — the highest
/// <see cref="AuditEvent.OccurredAtUtc"/> seen for that site on a previous
/// tick. Asking for <c>OccurredAtUtc &gt;= cursor</c> rather than &gt;
/// is the site contract (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ISiteAuditQueue.ReadPendingSinceAsync"/>);
/// duplicate-with-same-timestamp rows are filtered out by the idempotent
/// repository write.
/// </summary>
private readonly Dictionary<string, DateTime> _cursors = new();
/// <summary>
/// Per-site count of consecutive non-draining cycles. Resets to zero on the
/// first draining (or empty) cycle.
/// </summary>
private readonly Dictionary<string, int> _nonDrainingCycles = new();
/// <summary>
/// Per-site latched stalled state — used so the actor only publishes a
/// <see cref="SiteAuditTelemetryStalledChanged"/> transition when the
/// stalled flag actually changes, not on every tick while stalled.
/// </summary>
private readonly Dictionary<string, bool> _stalled = new();
/// <summary>
/// AuditLog-004: per-EventId retry counter for rows whose central insert
/// threw. While a row keeps failing AND is below
/// <see cref="MaxPermanentInsertAttempts"/>, the cursor is held back so the
/// next reconciliation tick re-pulls and retries the row. Crossing the
/// threshold logs Critical and permanently abandons the row (cursor
/// advances past it) so a truly broken row cannot block all subsequent
/// progress for a site. The counter is in-memory only — singleton restart
/// resets it, which is safe because the cursor also resets on restart and
/// the next tick re-pulls everything.
/// </summary>
private readonly Dictionary<Guid, int> _failedInsertAttempts = new();
/// <summary>
/// AuditLog-004: number of consecutive central-insert failures before a row
/// is permanently abandoned with a Critical log entry and the cursor is
/// allowed to advance past it. Five attempts at the 5-minute default tick
/// is ~25 min of retry budget before a stuck row stops blocking progress.
/// </summary>
private const int MaxPermanentInsertAttempts = 5;
private ICancelable? _timer;
/// <summary>
/// Creates the reconciliation actor, validates its collaborators, and wires
/// the <see cref="ReconciliationTick"/> handler.
/// </summary>
/// <param name="sites">Enumerates the known sites to reconcile.</param>
/// <param name="client">Client used to pull audit events from individual sites.</param>
/// <param name="services">Root service provider for opening a per-tick DI scope.</param>
/// <param name="options">Reconciliation configuration (interval, batch size).</param>
/// <param name="logger">Logger for reconciliation diagnostics.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public SiteAuditReconciliationActor(
    ISiteEnumerator sites,
    IPullAuditEventsClient client,
    IServiceProvider services,
    IOptions<SiteAuditReconciliationOptions> options,
    ILogger<SiteAuditReconciliationActor> logger)
{
    // Validate everything up front so no field is touched when an argument
    // is missing.
    ArgumentNullException.ThrowIfNull(sites);
    ArgumentNullException.ThrowIfNull(client);
    ArgumentNullException.ThrowIfNull(services);
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(logger);

    (_sites, _client, _services) = (sites, client, services);
    _options = options.Value;
    _logger = logger;

    // Each tick runs one full reconciliation pass.
    ReceiveAsync<ReconciliationTick>(tick => OnTickAsync());
}
/// <inheritdoc />
protected override void PreStart()
{
    base.PreStart();

    // The first tick fires one full interval after start-up and then repeats
    // at the same period; the handle is kept so PostStop can cancel it.
    var period = _options.ReconciliationInterval;
    var scheduler = Context.System.Scheduler;
    _timer = scheduler.ScheduleTellRepeatedlyCancelable(
        initialDelay: period,
        interval: period,
        receiver: Self,
        message: ReconciliationTick.Instance,
        sender: Self);
}
/// <inheritdoc />
protected override void PostStop()
{
    // Cancel the repeating tick (if one was scheduled) before the base
    // class runs its own stop logic.
    if (_timer is not null)
    {
        _timer.Cancel();
    }

    base.PostStop();
}
/// <summary>
/// Runs one reconciliation pass: enumerates the sites, opens a single DI
/// scope for the tick, and pulls each site in turn. A failure to enumerate
/// sites or to resolve the repository skips the whole tick; a failure while
/// pulling one site is logged and does not stop the remaining sites.
/// </summary>
private async Task OnTickAsync()
{
    // MUST stay the first statement: Context (and therefore Context.System)
    // is only valid before the first await. Once a continuation runs on a
    // thread that is not dispatching this actor, ActorBase.Context throws
    // "no active ActorContext" — same fix as AuditLogPurgeActor.OnTickAsync
    // and the Sender capture in AuditLogIngestActor.OnIngestAsync.
    var stream = Context.System.EventStream;

    IReadOnlyList<SiteEntry> targets;
    try
    {
        targets = await _sites.EnumerateAsync().ConfigureAwait(false);
    }
    catch (Exception enumerationError)
    {
        _logger.LogError(enumerationError, "Site enumeration failed; skipping reconciliation tick.");
        return;
    }

    if (targets.Count == 0)
    {
        return;
    }

    // AuditLog-003: CreateAsyncScope + await using lets scoped EF Core
    // services (IAsyncDisposable DbContexts) dispose asynchronously instead
    // of blocking in a synchronous Dispose() on pending connection cleanup.
    await using var tickScope = _services.CreateAsyncScope();

    var repository = ResolveRepository(tickScope.ServiceProvider);
    if (repository is null)
    {
        return;
    }

    foreach (var entry in targets)
    {
        await PullIsolatedAsync(entry).ConfigureAwait(false);
    }

    // Resolves the scoped repository; returns null (after logging) when
    // resolution fails. GetRequiredService itself never returns null.
    IAuditLogRepository? ResolveRepository(IServiceProvider provider)
    {
        try
        {
            return provider.GetRequiredService<IAuditLogRepository>();
        }
        catch (Exception resolutionError)
        {
            _logger.LogError(resolutionError, "Failed to resolve IAuditLogRepository for reconciliation tick.");
            return null;
        }
    }

    // Pulls one site inside a catch-all, per the failure-isolation
    // invariant: one site's fault must not sink the rest of the tick.
    async Task PullIsolatedAsync(SiteEntry site)
    {
        try
        {
            await PullSiteAsync(site, repository, stream).ConfigureAwait(false);
        }
        catch (Exception pullError)
        {
            _logger.LogWarning(
                pullError,
                "Reconciliation pull failed for site {SiteId}; other sites continue.",
                site.SiteId);
        }
    }
}
/// <summary>
/// Performs a single <c>PullAuditEvents</c> RPC for <paramref name="site"/>,
/// writes every returned row into the central repository through
/// <c>InsertIfNotExistsAsync</c>, and then stores the site's cursor: either
/// the highest <see cref="AuditEvent.OccurredAtUtc"/> among rows that were
/// inserted or permanently abandoned, or the unchanged starting cursor when
/// at least one row is still awaiting a retry. Only one pull is issued per
/// call, so a backed-up site drains across consecutive ticks; the stalled
/// signal maintained by <c>UpdateStalledState</c> surfaces when that drain
/// is not keeping up.
/// </summary>
/// <param name="site">The site to pull from; its <c>SiteId</c> keys the cursor and stalled maps.</param>
/// <param name="repository">Central repository that receives the pulled rows.</param>
/// <param name="eventStream">EventStream captured by the caller, used to publish stalled-state transitions.</param>
private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository, Akka.Event.EventStream eventStream)
{
    // Resume from the stored cursor; a site without one starts at DateTime.MinValue.
    var since = _cursors.TryGetValue(site.SiteId, out var storedCursor) ? storedCursor : DateTime.MinValue;

    var batch = await _client
        .PullAsync(site.SiteId, since, _options.BatchSize, CancellationToken.None)
        .ConfigureAwait(false);

    var highWaterMark = since;
    var retryPending = false;

    // One ingest timestamp is shared by every row of this batch.
    var ingestStamp = DateTime.UtcNow;

    foreach (var auditEvent in batch.Events)
    {
        // True when the cursor may move past this row: it was written, or it
        // has just been given up on for good.
        bool rowSettled;
        try
        {
            // The write is expected to be idempotent: per the original
            // author's note, duplicate EventIds (a concurrent push, or a
            // retry of this same pull) collapse to a no-op thanks to the
            // M2 Bundle A race-fix on InsertIfNotExistsAsync.
            await repository
                .InsertIfNotExistsAsync(auditEvent with { IngestedAtUtc = ingestStamp })
                .ConfigureAwait(false);
            _failedInsertAttempts.Remove(auditEvent.EventId);
            rowSettled = true;
        }
        catch (Exception ex)
        {
            // AuditLog-004: failures are handled per row so that one bad
            // event does not discard the rest of the batch. The failure
            // count is tracked per EventId. Below
            // MaxPermanentInsertAttempts the cursor is held back so the next
            // tick pulls the row again; at the threshold the row is given up
            // on (LogCritical) and the cursor may pass it, so a permanently
            // broken row cannot block the site forever.
            var failureCount = _failedInsertAttempts.GetValueOrDefault(auditEvent.EventId) + 1;
            _failedInsertAttempts[auditEvent.EventId] = failureCount;

            rowSettled = failureCount >= MaxPermanentInsertAttempts;
            if (rowSettled)
            {
                _logger.LogCritical(
                    ex,
                    "Permanently abandoning AuditEvent {EventId} from site {SiteId} after {Attempts} consecutive insert failures; cursor will advance past it.",
                    auditEvent.EventId,
                    site.SiteId,
                    failureCount);
                _failedInsertAttempts.Remove(auditEvent.EventId);
            }
            else
            {
                _logger.LogError(
                    ex,
                    "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId} (attempt {Attempts}/{Max}); cursor held back for retry.",
                    auditEvent.EventId,
                    site.SiteId,
                    failureCount,
                    MaxPermanentInsertAttempts);
                retryPending = true;
            }
        }

        if (rowSettled && auditEvent.OccurredAtUtc > highWaterMark)
        {
            highWaterMark = auditEvent.OccurredAtUtc;
        }
    }

    // AuditLog-004: the stored cursor only moves when no row of this batch
    // is still awaiting a retry. Keeping it at `since` makes the next tick
    // pull the whole batch again: rows that already landed are expected to
    // be no-ops (idempotent insert) and the failing row gets another
    // attempt. Once it succeeds or reaches the abandon threshold the cursor
    // moves forward again.
    _cursors[site.SiteId] = retryPending ? since : highWaterMark;

    // A cycle counts as non-draining when rows came back AND the site
    // reports that more are still waiting.
    var backlogRemains = batch.MoreAvailable && batch.Events.Count > 0;
    UpdateStalledState(site.SiteId, draining: !backlogRemains, eventStream);
}
/// <summary>
/// Updates the per-site non-draining counter and the latched stalled flag
/// after a pull, publishing a <c>SiteAuditTelemetryStalledChanged</c> only
/// when the flag actually changes.
/// </summary>
/// <remarks>
/// A non-draining tick increments the site's consecutive counter. When the
/// counter reaches
/// <see cref="SiteAuditReconciliationOptions.StalledAfterNonDrainingCycles"/>
/// and the site is not already latched, the flag is set and
/// <c>Stalled=true</c> is published. A draining tick resets the counter to
/// zero and, if the flag was set, clears it and publishes
/// <c>Stalled=false</c>. Ticks that leave the flag unchanged publish
/// nothing, so subscribers only ever see transitions.
/// </remarks>
/// <param name="siteId">Site whose state is being updated.</param>
/// <param name="draining">
/// <c>true</c> when the tick drained the queue (the caller passes the
/// negation of "events returned AND <c>MoreAvailable=true</c>").
/// </param>
/// <param name="eventStream">Stream on which transitions are published.</param>
private void UpdateStalledState(string siteId, bool draining, Akka.Event.EventStream eventStream)
{
    // A site with no entry in the map counts as "not stalled".
    var currentlyLatched = _stalled.TryGetValue(siteId, out var latchedFlag) && latchedFlag;

    if (!draining)
    {
        var streak = _nonDrainingCycles.GetValueOrDefault(siteId) + 1;
        _nonDrainingCycles[siteId] = streak;

        // Latch once the streak reaches the threshold; stay silent if the
        // site is already latched.
        if (streak >= _options.StalledAfterNonDrainingCycles && !currentlyLatched)
        {
            _stalled[siteId] = true;
            eventStream.Publish(
                new SiteAuditTelemetryStalledChanged(siteId, Stalled: true));
        }

        return;
    }

    // Draining tick: the streak is broken, and a latched site recovers.
    _nonDrainingCycles[siteId] = 0;
    if (currentlyLatched)
    {
        _stalled[siteId] = false;
        eventStream.Publish(
            new SiteAuditTelemetryStalledChanged(siteId, Stalled: false));
    }
}
/// <inheritdoc />
protected override SupervisorStrategy SupervisorStrategy() =>
    // One-for-one strategy built with zero retries, a zero-length time
    // window and Akka's default decider.
    // NOTE(review): presumably zero retries means a failing child is stopped
    // rather than restarted — confirm against the Akka.NET supervision docs.
    new OneForOneStrategy(
        maxNrOfRetries: 0,
        withinTimeRange: TimeSpan.Zero,
        decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
/// <summary>
/// Self-tick message that triggers a reconciliation pass across all sites.
/// The constructor is private, so <see cref="Instance"/> is the only
/// instance that can exist.
/// </summary>
internal sealed class ReconciliationTick
{
    // Private on purpose: callers must use the shared Instance.
    private ReconciliationTick()
    {
    }

    /// <summary>The single shared tick message.</summary>
    public static readonly ReconciliationTick Instance = new ReconciliationTick();
}
}
/// <summary>
/// Published on the actor system EventStream when a site's reconciliation
/// puller transitions into or out of the "stalled" state (backlog not
/// draining across multiple cycles). The M6 Bundle E central health collector
/// will subscribe to this and surface
/// <c>SiteAuditTelemetryStalled</c> on the health-report payload.
/// </summary>
/// <param name="SiteId">Identifier of the site whose stalled state changed.</param>
/// <param name="Stalled">
/// <c>true</c> when the site has just entered the stalled state;
/// <c>false</c> when it has just left it.
/// </param>
public sealed record SiteAuditTelemetryStalledChanged(string SiteId, bool Stalled);
@@ -0,0 +1,60 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Settings for the central <see cref="SiteAuditReconciliationActor"/> singleton.
/// The defaults follow the M6 Bundle B brief: one reconciliation pass every
/// 300 seconds (5 minutes), up to 256 rows per pull, and a site is reported as
/// "stalled" after 2 consecutive pulls that returned rows AND reported
/// <c>MoreAvailable=true</c> (the backlog is not draining).
/// </summary>
/// <remarks>
/// <para>
/// Reconciliation is the fallback for lost push telemetry and is deliberately
/// low-frequency. Reducing <see cref="ReconciliationIntervalSeconds"/> in
/// production buys faster self-healing at the cost of extra MS SQL load — keep
/// the default unless a deployment can show the additional load is acceptable.
/// </para>
/// <para>
/// <see cref="StalledAfterNonDrainingCycles"/> defaults to 2 because a single
/// non-draining cycle can be caused by a surge (for example a backed-up site
/// replaying its hot queue); the stalled signal is meant to fire only when the
/// backlog persists across cycles.
/// </para>
/// </remarks>
public sealed class SiteAuditReconciliationOptions
{
    /// <summary>
    /// Seconds between reconciliation ticks; every tick visits each known site once.
    /// Defaults to 300.
    /// </summary>
    public int ReconciliationIntervalSeconds { get; set; } = 300;

    /// <summary>
    /// Optional interval with finer than whole-second resolution, intended for
    /// tests. When it has a value it wins over
    /// <see cref="ReconciliationIntervalSeconds"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the original author documents this as not bound from
    /// configuration (production config exposes
    /// <see cref="ReconciliationIntervalSeconds"/> only) — confirm against the
    /// options binding.
    /// </remarks>
    public TimeSpan? ReconciliationIntervalOverride { get; set; }

    /// <summary>
    /// The interval actually in effect: <see cref="ReconciliationIntervalOverride"/>
    /// when set, otherwise <see cref="ReconciliationIntervalSeconds"/> converted
    /// to a <see cref="TimeSpan"/>.
    /// </summary>
    public TimeSpan ReconciliationInterval
    {
        get
        {
            if (ReconciliationIntervalOverride is { } explicitInterval)
            {
                return explicitInterval;
            }

            return TimeSpan.FromSeconds(ReconciliationIntervalSeconds);
        }
    }

    /// <summary>
    /// Upper bound on the number of
    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/> rows
    /// requested by one <c>PullAuditEvents</c> RPC call. Defaults to 256.
    /// </summary>
    public int BatchSize { get; set; } = 256;

    /// <summary>
    /// How many consecutive non-draining cycles (rows returned AND
    /// <c>MoreAvailable=true</c>) a site must accumulate before the actor
    /// publishes <c>SiteAuditTelemetryStalledChanged(Stalled: true)</c> on the
    /// EventStream. Defaults to 2.
    /// </summary>
    public int StalledAfterNonDrainingCycles { get; set; } = 2;
}
@@ -0,0 +1,203 @@
using System.Collections.Concurrent;
using Akka.Actor;
using Akka.Event;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T7) — central singleton that listens on the
/// actor system's EventStream for <see cref="SiteAuditTelemetryStalledChanged"/>
/// and keeps the most recently published stalled flag for each site, readable
/// through <see cref="Snapshot"/>. The M6 Bundle E
/// <see cref="AuditCentralHealthSnapshot"/> aggregator consumes it so the
/// central health surface can report per-site "reconciliation isn't draining"
/// without coupling the publisher (<see cref="SiteAuditReconciliationActor"/>)
/// to the health collection plumbing.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why an internal actor.</b> Akka.NET's <see cref="EventStream"/> accepts
/// only <see cref="IActorRef"/> subscribers — there is no callback or
/// channel-based overload. The tracker therefore creates a small subscriber
/// actor that forwards every event into a shared
/// <see cref="ConcurrentDictionary{TKey,TValue}"/>, and readers take a copy of
/// that dictionary via <see cref="Snapshot"/> from any thread. The
/// subscription itself is registered by the tracker's constructor; the actor
/// unsubscribes in <see cref="ActorBase.PostStop"/>, which
/// <see cref="Dispose"/> triggers by sending the actor a
/// <see cref="PoisonPill"/>.
/// </para>
/// <para>
/// <b>Per-site latching.</b> The publisher
/// (<see cref="SiteAuditReconciliationActor"/>) only publishes on stalled-state
/// transitions, so the dictionary holds the authoritative latched state. A
/// site that has never published is absent from the snapshot; consumers treat
/// absence as <c>Stalled=false</c> (healthy by default), the same default the
/// reconciliation actor's own latch uses.
/// </para>
/// <para>
/// <b>Singleton lifecycle.</b> Registered as a singleton via
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>;
/// <see cref="Dispose"/> tears the internal subscriber down at host shutdown.
/// </para>
/// </remarks>
public sealed class SiteAuditTelemetryStalledTracker : IDisposable
{
    // Latest published stalled flag per site id.
    private readonly ConcurrentDictionary<string, bool> _state = new();

    private readonly EventStream _eventStream;

    // Optional sink that receives a copy of every applied change.
    private readonly AuditCentralHealthSnapshot? _snapshot;

    // Null when the tracker was built from a bare EventStream (no actor system).
    private readonly IActorRef? _subscriber;

    private bool _disposed;

    /// <summary>
    /// Builds a tracker around a bare <see cref="EventStream"/> without a
    /// snapshot sink. No subscriber actor is created in this mode, so the
    /// tracker exposes <see cref="Snapshot"/> but does not receive published
    /// events by itself; production callers use
    /// <see cref="SiteAuditTelemetryStalledTracker(ActorSystem)"/>.
    /// </summary>
    /// <remarks>
    /// Subscribing to an <see cref="EventStream"/> requires an
    /// <see cref="IActorRef"/>, which can only be created from an
    /// <see cref="ActorSystem"/>. Tests that want event-driven updates must
    /// therefore use an ActorSystem constructor, or push state directly via
    /// <see cref="Apply"/>. The tests in
    /// <c>SiteAuditTelemetryStalledTrackerTests</c> use the ActorSystem
    /// constructor through Akka.TestKit so they exercise the production
    /// subscribe path.
    /// </remarks>
    /// <param name="eventStream">The actor system event stream to observe.</param>
    public SiteAuditTelemetryStalledTracker(EventStream eventStream)
        : this(eventStream, snapshot: null)
    {
    }

    /// <summary>
    /// Bare-stream constructor with an optional snapshot sink. It still cannot
    /// subscribe (there is no actor system), but callers that drive the
    /// tracker through <see cref="Apply"/> get every change mirrored into the
    /// supplied <see cref="AuditCentralHealthSnapshot"/>.
    /// </summary>
    /// <param name="eventStream">The actor system event stream to observe.</param>
    /// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
    /// <exception cref="ArgumentNullException"><paramref name="eventStream"/> is <c>null</c>.</exception>
    public SiteAuditTelemetryStalledTracker(EventStream eventStream, AuditCentralHealthSnapshot? snapshot)
    {
        ArgumentNullException.ThrowIfNull(eventStream);

        _eventStream = eventStream;
        _snapshot = snapshot;

        // No subscriber actor in bare-stream mode: creating one requires an
        // ActorSystem, which this overload does not receive.
        _subscriber = null;
    }

    /// <summary>
    /// Production constructor without a snapshot sink: subscribes a small
    /// internal actor to the system's EventStream so that every published
    /// <see cref="SiteAuditTelemetryStalledChanged"/> updates the per-site
    /// map. <see cref="Dispose"/> tears the subscriber down.
    /// </summary>
    /// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
    public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem)
        : this(actorSystem, snapshot: null)
    {
    }

    /// <summary>
    /// Production constructor with a snapshot sink: every observed
    /// <see cref="SiteAuditTelemetryStalledChanged"/> is also mirrored onto
    /// the shared <see cref="AuditCentralHealthSnapshot"/>, so the central
    /// health surface sees per-site stalled state without re-reading the
    /// tracker.
    /// </summary>
    /// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
    /// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
    /// <exception cref="ArgumentNullException"><paramref name="actorSystem"/> is <c>null</c>.</exception>
    public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem, AuditCentralHealthSnapshot? snapshot)
    {
        ArgumentNullException.ThrowIfNull(actorSystem);

        _snapshot = snapshot;
        _eventStream = actorSystem.EventStream;

        // The subscriber is a uniquely named actor whose receive handler
        // calls back into THIS tracker's Apply method; an actor processes
        // one message at a time, which serialises the event-driven writes.
        var subscriberProps = Props.Create(() => new StalledChangedSubscriber(this));
        _subscriber = actorSystem.ActorOf(
            subscriberProps,
            name: $"site-audit-stalled-tracker-{Guid.NewGuid():N}");

        // Subscribe here, synchronously, rather than in the actor's PreStart:
        // PreStart runs asynchronously and could lose a race against the
        // first publish. The original author notes that
        // EventStream.Subscribe is thread-safe.
        _eventStream.Subscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged));
    }

    /// <summary>
    /// Returns a copy of the per-site stalled state; later changes to the
    /// tracker do not affect a dictionary that was already returned. Sites
    /// missing from the result are interpreted as <c>Stalled=false</c> by
    /// consumers.
    /// </summary>
    /// <returns>A new dictionary mapping site id to its latest stalled flag.</returns>
    public IReadOnlyDictionary<string, bool> Snapshot()
    {
        var copy = new Dictionary<string, bool>(_state);
        return copy;
    }

    /// <summary>
    /// Records a stalled-state change. Called by the internal subscriber
    /// actor for every <see cref="SiteAuditTelemetryStalledChanged"/> it
    /// receives; internal so that tests using a bare-stream constructor can
    /// drive the tracker directly. A <c>null</c> event is ignored.
    /// </summary>
    /// <param name="evt">The stalled-state change event to apply.</param>
    internal void Apply(SiteAuditTelemetryStalledChanged evt)
    {
        if (evt is not null)
        {
            _state[evt.SiteId] = evt.Stalled;

            // Mirror the change into the central health snapshot when one is
            // wired. The sink is optional (test composition roots may omit
            // it), hence the null-conditional call.
            _snapshot?.ApplyStalled(evt);
        }
    }

    /// <summary>
    /// Disposes the tracker and asks the internal subscriber actor (if any)
    /// to stop. Calls after the first one do nothing.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        // Fire-and-forget: the PoisonPill stops the subscriber actor, and the
        // actor removes its EventStream subscription in its PostStop hook.
        // In bare-stream mode there is no subscriber and nothing to stop.
        _subscriber?.Tell(PoisonPill.Instance);
    }

    /// <summary>
    /// Internal subscriber actor: receives every
    /// <see cref="SiteAuditTelemetryStalledChanged"/> from the EventStream and
    /// hands it to the owning <see cref="SiteAuditTelemetryStalledTracker"/>.
    /// Unlike <c>DeadLetterMonitorActor</c>, the subscription is registered by
    /// the tracker's constructor rather than by this actor, so publishes that
    /// arrive between actor creation and PreStart cannot be missed.
    /// Unsubscribing still happens in <see cref="PostStop"/>.
    /// </summary>
    private sealed class StalledChangedSubscriber : ReceiveActor
    {
        private readonly SiteAuditTelemetryStalledTracker _parent;

        /// <summary>
        /// Creates a subscriber actor that forwards events to the given tracker.
        /// </summary>
        /// <param name="parent">The tracker whose <see cref="Apply"/> method is called for each event.</param>
        public StalledChangedSubscriber(SiteAuditTelemetryStalledTracker parent)
        {
            _parent = parent;
            Receive<SiteAuditTelemetryStalledChanged>(change => _parent.Apply(change));
        }

        /// <inheritdoc />
        protected override void PostStop()
        {
            // Remove this actor's subscription before the base teardown runs.
            var stream = Context.System.EventStream;
            stream.Unsubscribe(Self, typeof(SiteAuditTelemetryStalledChanged));
            base.PostStop();
        }
    }
}