refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
using System.Collections.Concurrent;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T8, T9) — central singleton implementation of
|
||||
/// <see cref="IAuditCentralHealthSnapshot"/>. Owns thread-safe
|
||||
/// <see cref="System.Threading.Interlocked"/> counters for
|
||||
/// <c>CentralAuditWriteFailures</c> + <c>AuditRedactionFailure</c> and a
|
||||
/// per-site latched stalled-state map fed by the
|
||||
/// <see cref="SiteAuditTelemetryStalledTracker"/>. Also implements the
|
||||
/// writer surfaces (<see cref="ICentralAuditWriteFailureCounter"/> +
|
||||
/// <see cref="IAuditRedactionFailureCounter"/>) so a single concrete object
|
||||
/// is the source of truth — DI binds those two interfaces to this same
|
||||
/// singleton instance on the central composition root.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Why one type for read + write.</b> The writer interfaces are tiny
|
||||
/// (<c>Increment()</c>) and the read surface needs visibility of those
|
||||
/// counters anyway — having a single class own both means the
|
||||
/// <c>Interlocked</c> field IS the snapshot value, no extra plumbing needed.
|
||||
/// Mirrors the
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.HealthMonitoring.SiteHealthCollector"/> pattern where
|
||||
/// the collector both receives and exposes the metric.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Stalled-state plumbing.</b> The per-site stalled latch lives directly
|
||||
/// on this snapshot. <see cref="SiteAuditTelemetryStalledTracker"/> is the
|
||||
/// EventStream subscriber that pushes
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> publications in via
|
||||
/// <see cref="ApplyStalled"/>. Keeping the dictionary on this type (rather
|
||||
/// than reading the tracker on every access) lets the snapshot be constructed
|
||||
/// without an <see cref="Akka.Actor.ActorSystem"/> dependency — the tracker
|
||||
/// is wired up later from the Akka bootstrap, once the system is built.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class AuditCentralHealthSnapshot
|
||||
: IAuditCentralHealthSnapshot,
|
||||
ICentralAuditWriteFailureCounter,
|
||||
IAuditRedactionFailureCounter
|
||||
{
|
||||
private int _centralAuditWriteFailures;
|
||||
private int _auditRedactionFailure;
|
||||
private readonly ConcurrentDictionary<string, bool> _stalled = new();
|
||||
|
||||
/// <inheritdoc/>
|
||||
public int CentralAuditWriteFailures =>
|
||||
Interlocked.CompareExchange(ref _centralAuditWriteFailures, 0, 0);
|
||||
|
||||
/// <inheritdoc/>
|
||||
public int AuditRedactionFailure =>
|
||||
Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0);
|
||||
|
||||
/// <inheritdoc/>
|
||||
public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled =>
|
||||
new Dictionary<string, bool>(_stalled);
|
||||
|
||||
/// <summary>
|
||||
/// Apply a <see cref="SiteAuditTelemetryStalledChanged"/> publication
|
||||
/// observed by <see cref="SiteAuditTelemetryStalledTracker"/>. Public
|
||||
/// so the tracker (which lives in the same assembly but is constructed
|
||||
/// later from the Akka host) can push without a friend reference;
|
||||
/// readers should call <see cref="SiteAuditTelemetryStalled"/>.
|
||||
/// </summary>
|
||||
/// <param name="evt">The event carrying the site ID and new stalled state.</param>
|
||||
public void ApplyStalled(SiteAuditTelemetryStalledChanged evt)
|
||||
{
|
||||
if (evt is null) return;
|
||||
_stalled[evt.SiteId] = evt.Stalled;
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
void ICentralAuditWriteFailureCounter.Increment() =>
|
||||
Interlocked.Increment(ref _centralAuditWriteFailures);
|
||||
|
||||
/// <inheritdoc/>
|
||||
void IAuditRedactionFailureCounter.Increment() =>
|
||||
Interlocked.Increment(ref _auditRedactionFailure);
|
||||
}
|
||||
@@ -0,0 +1,306 @@
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central-side singleton (per Bundle E wiring) that ingests batches of
|
||||
/// <see cref="AuditEvent"/> rows pushed from sites via the
|
||||
/// <c>IngestAuditEvents</c> gRPC RPC. Each row is stamped with the central-side
|
||||
/// <see cref="AuditEvent.IngestedAtUtc"/> and inserted idempotently via
|
||||
/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
|
||||
/// silently swallowed (first-write-wins per Bundle A's hardening).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Idempotency is the contract: a row that already exists at central counts
|
||||
/// as "accepted" for the purposes of the reply, because the storage state is
|
||||
/// consistent and the site is free to flip its local row to <c>Forwarded</c>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing
|
||||
/// action. The actor wraps each repository call in its own try/catch so a
|
||||
/// single bad row cannot cause the rest of the batch to be lost — that
|
||||
/// per-row catch is what keeps this actor alive across handler throws, not
|
||||
/// the supervisor strategy. The <see cref="SupervisorStrategy"/> override
|
||||
/// returns the Akka default decider (Restart for most exceptions) and
|
||||
/// governs children only; this actor has no children today, so the override
|
||||
/// is a forward-compat placeholder.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Two constructors exist for a deliberate reason: Bundle D's tests inject a
|
||||
/// concrete <see cref="IAuditLogRepository"/> against a per-test MSSQL fixture
|
||||
/// (the only way to verify the IngestedAtUtc stamp + duplicate-key idempotency
|
||||
/// end to end), while Bundle E's host wiring registers the actor as a cluster
|
||||
/// singleton and must therefore resolve the repository — which is a scoped EF
|
||||
/// Core service — from a fresh DI scope per message. Mirroring the Notification
|
||||
/// Outbox actor's pattern.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class AuditLogIngestActor : ReceiveActor
|
||||
{
|
||||
private readonly IServiceProvider? _serviceProvider;
|
||||
private readonly IAuditLogRepository? _injectedRepository;
|
||||
private readonly ILogger<AuditLogIngestActor> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Test-mode constructor — injects a concrete repository instance whose
|
||||
/// lifetime exceeds the test, so the actor reuses the same instance across
|
||||
/// every message. Used by Bundle D's MSSQL-backed TestKit fixture.
|
||||
/// </summary>
|
||||
/// <param name="repository">Audit log repository instance shared across all messages.</param>
|
||||
/// <param name="logger">Logger for ingest diagnostics.</param>
|
||||
public AuditLogIngestActor(
|
||||
IAuditLogRepository repository,
|
||||
ILogger<AuditLogIngestActor> logger)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(repository);
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
|
||||
_injectedRepository = repository;
|
||||
_logger = logger;
|
||||
|
||||
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
|
||||
// The single-repository test ctor cannot service the M3 dual-write —
|
||||
// it has no SiteCalls repo and no DbContext. The handler still
|
||||
// registers (so callers don't dead-letter) but replies empty so the
|
||||
// test surface stays explicit about what this ctor supports.
|
||||
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryWithoutDualWriteAsync);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Production constructor — resolves <see cref="IAuditLogRepository"/> from
|
||||
/// a fresh DI scope per message because the repository is a scoped EF Core
|
||||
/// service registered by <c>AddConfigurationDatabase</c>. The actor itself
|
||||
/// is a long-lived cluster singleton, so it cannot hold a scope across
|
||||
/// messages.
|
||||
/// </summary>
|
||||
/// <param name="serviceProvider">Root service provider used to open a fresh scope per message.</param>
|
||||
/// <param name="logger">Logger for ingest diagnostics.</param>
|
||||
public AuditLogIngestActor(
|
||||
IServiceProvider serviceProvider,
|
||||
ILogger<AuditLogIngestActor> logger)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(serviceProvider);
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
|
||||
_serviceProvider = serviceProvider;
|
||||
_logger = logger;
|
||||
|
||||
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
|
||||
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryAsync);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
{
|
||||
return new OneForOneStrategy(maxNrOfRetries: 0, withinTimeRange: TimeSpan.Zero, decider:
|
||||
Akka.Actor.SupervisorStrategy.DefaultDecider);
|
||||
}
|
||||
|
||||
private async Task OnIngestAsync(IngestAuditEventsCommand cmd)
|
||||
{
|
||||
// Sender is captured before the first await — Akka resets Sender
|
||||
// between message dispatches, so a post-await Tell would go to
|
||||
// DeadLetters.
|
||||
var replyTo = Sender;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
var accepted = new List<Guid>(cmd.Events.Count);
|
||||
|
||||
// Resolve the repository for the whole batch — one DbContext per
|
||||
// message, mirroring NotificationOutboxActor. The injected-repository
|
||||
// mode (Bundle D tests) skips the scope entirely.
|
||||
// Bundle C (M5-T6): the IAuditPayloadFilter is also resolved from the
|
||||
// per-message scope when one is available so the row is truncated +
|
||||
// redacted before InsertIfNotExistsAsync. The single-repository test
|
||||
// ctor has no service provider — it falls through with no filter,
|
||||
// which preserves the small-payload assumptions baked into the
|
||||
// existing D2 fixtures.
|
||||
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
|
||||
// services (IAsyncDisposable DbContexts) dispose asynchronously
|
||||
// without blocking on sync Dispose() of pending connection cleanup.
|
||||
if (_injectedRepository is not null)
|
||||
{
|
||||
await IngestWithRepositoryAsync(_injectedRepository, filter: null, failureCounter: null, cmd, nowUtc, accepted)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
else
|
||||
{
|
||||
await using var scope = _serviceProvider!.CreateAsyncScope();
|
||||
var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
|
||||
// M6 Bundle E (T8): central health counter is best-effort —
|
||||
// unregistered (test composition roots) means the per-row catch
|
||||
// simply logs without surfacing on the health dashboard.
|
||||
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
|
||||
await IngestWithRepositoryAsync(repository, filter, failureCounter, cmd, nowUtc, accepted)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
|
||||
replyTo.Tell(new IngestAuditEventsReply(accepted));
|
||||
}
|
||||
|
||||
private async Task IngestWithRepositoryAsync(
|
||||
IAuditLogRepository repository,
|
||||
IAuditPayloadFilter? filter,
|
||||
ICentralAuditWriteFailureCounter? failureCounter,
|
||||
IngestAuditEventsCommand cmd,
|
||||
DateTime nowUtc,
|
||||
List<Guid> accepted)
|
||||
{
|
||||
foreach (var evt in cmd.Events)
|
||||
{
|
||||
try
|
||||
{
|
||||
// Stamp IngestedAtUtc here, not at the site. Bundle A's
|
||||
// repository hardening already swallows duplicate-key races,
|
||||
// so the same id arriving twice (site retry, reconciliation)
|
||||
// is a silent no-op.
|
||||
// Filter BEFORE the IngestedAtUtc stamp so the redacted
|
||||
// copy carries the central-side ingest timestamp. Filter
|
||||
// is contract-bound to never throw. AuditLog-008: a null
|
||||
// filter (test composition root, no IAuditPayloadFilter
|
||||
// registered) now falls back to the SafeDefault rather than
|
||||
// pass-through, so HTTP header redaction always runs.
|
||||
var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
|
||||
var filtered = safeFilter.Apply(evt);
|
||||
var ingested = filtered with { IngestedAtUtc = nowUtc };
|
||||
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
|
||||
accepted.Add(evt.EventId);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Per-row catch — one bad row never sinks the whole batch.
|
||||
// The row stays Pending at the site; the next drain retries.
|
||||
// M6 Bundle E (T8): bump the central health counter so a
|
||||
// sustained insert-throw failure surfaces on the dashboard.
|
||||
try { failureCounter?.Increment(); }
|
||||
catch { /* counter must never throw — defence in depth */ }
|
||||
_logger.LogError(ex,
|
||||
"Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.",
|
||||
evt.EventId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// M3 dual-write handler. For every <see cref="CachedTelemetryEntry"/> the
|
||||
/// actor opens a fresh MS SQL transaction, inserts the AuditLog row
|
||||
/// idempotently AND upserts the SiteCalls row monotonically. Both succeed
|
||||
/// or both roll back, so the audit and operational mirrors never drift
|
||||
/// mid-row. The IngestedAtUtc stamp is unified between the two rows so a
|
||||
/// downstream join lines up cleanly.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per-entry isolation — one entry's failed transaction does NOT abort
|
||||
/// other entries in the batch (each gets its own
|
||||
/// <see cref="Microsoft.EntityFrameworkCore.RelationalDatabaseFacadeExtensions.BeginTransactionAsync"/>
|
||||
/// scope and a try/catch around it). Audit-write failure NEVER aborts the
|
||||
/// user-facing action — the site keeps the row Pending and retries on the
|
||||
/// next drain.
|
||||
/// </remarks>
|
||||
private async Task OnCachedTelemetryAsync(IngestCachedTelemetryCommand cmd)
|
||||
{
|
||||
var replyTo = Sender;
|
||||
var accepted = new List<Guid>(cmd.Entries.Count);
|
||||
|
||||
try
|
||||
{
|
||||
await using var scope = _serviceProvider!.CreateAsyncScope();
|
||||
var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
|
||||
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaBridgeDbContext>();
|
||||
// Bundle C (M5-T6): resolve the filter for the whole batch from
|
||||
// the scope; null = pass-through for test composition roots that
|
||||
// skip the filter registration. The filter is contract-bound to
|
||||
// never throw, so we can apply it inside the per-entry try
|
||||
// without risking an unbounded blast radius.
|
||||
var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
|
||||
// M6 Bundle E (T8): same best-effort central health counter as
|
||||
// the OnIngestAsync path — null on test composition roots that
|
||||
// skip the registration.
|
||||
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
|
||||
|
||||
foreach (var entry in cmd.Entries)
|
||||
{
|
||||
try
|
||||
{
|
||||
await using var tx = await dbContext.Database
|
||||
.BeginTransactionAsync()
|
||||
.ConfigureAwait(false);
|
||||
|
||||
// Stamp IngestedAtUtc on both rows from a single
|
||||
// central-side instant so a join on the two tables sees
|
||||
// matching timestamps (debugging convenience, not a
|
||||
// correctness invariant).
|
||||
var ingestedAt = DateTime.UtcNow;
|
||||
// Filter the audit half BEFORE the dual-write — only the
|
||||
// AuditLog row's payload columns are filterable; SiteCalls
|
||||
// carries operational state only (status, retry count) and
|
||||
// is left untouched. AuditLog-008: null filter falls back
|
||||
// to SafeDefault so header redaction always runs.
|
||||
var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
|
||||
var filteredAudit = safeFilter.Apply(entry.Audit);
|
||||
var auditStamped = filteredAudit with { IngestedAtUtc = ingestedAt };
|
||||
var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };
|
||||
|
||||
await auditRepo.InsertIfNotExistsAsync(auditStamped)
|
||||
.ConfigureAwait(false);
|
||||
await siteCallRepo.UpsertAsync(siteCallStamped)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
await tx.CommitAsync().ConfigureAwait(false);
|
||||
accepted.Add(entry.Audit.EventId);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Both rows rolled back via the disposing transaction. The
|
||||
// EventId is NOT added to `accepted` so the site keeps its
|
||||
// row Pending and retries on the next drain. Other entries
|
||||
// in the batch continue with their own transactions.
|
||||
// M6 Bundle E (T8): bump the central health counter so a
|
||||
// sustained dual-write failure surfaces on the dashboard.
|
||||
try { failureCounter?.Increment(); }
|
||||
catch { /* counter must never throw — defence in depth */ }
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.",
|
||||
entry.Audit.EventId,
|
||||
entry.SiteCall.TrackedOperationId);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Resolving the scope itself threw (e.g. DI mis-wiring). Log and
|
||||
// reply with whatever we managed to accept (likely empty) — the
|
||||
// central singleton MUST stay alive.
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Combined telemetry batch ingest failed before per-entry processing.");
|
||||
}
|
||||
|
||||
replyTo.Tell(new IngestCachedTelemetryReply(accepted));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Fallback handler installed on the single-repository test ctor — that
|
||||
/// ctor has no DbContext and no <see cref="ISiteCallAuditRepository"/>, so
|
||||
/// it cannot service the dual-write. Logs a warning and replies with an
|
||||
/// empty ack so callers fall through to their retry path.
|
||||
/// </summary>
|
||||
private Task OnCachedTelemetryWithoutDualWriteAsync(IngestCachedTelemetryCommand cmd)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"AuditLogIngestActor received IngestCachedTelemetryCommand on the single-repository ctor; dual-write requires the IServiceProvider ctor. Replying with empty ack ({Count} entries).",
|
||||
cmd.Entries.Count);
|
||||
Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Tuning knobs for the central
|
||||
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service (M6-T5).
|
||||
/// Defaults: once every 24 hours, keep at least one future monthly
|
||||
/// boundary ahead of <see cref="DateTime.UtcNow"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The hosted service drives a daily roll-forward of
|
||||
/// <c>pf_AuditLog_Month</c>: each tick reads the current max boundary and
|
||||
/// SPLITs new monthly boundaries until at least
|
||||
/// <see cref="LookaheadMonths"/> future months are covered. The 1-month
|
||||
/// default is intentionally conservative — anything less risks an
|
||||
/// end-of-month race where inserts land in the unbounded tail partition;
|
||||
/// anything more wastes nothing but represents premature commitment.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// The 24-hour cadence is the cheapest interval that still guarantees
|
||||
/// at-most-one missed boundary in steady state (even a hard failover the
|
||||
/// hosted service can recover on its very next tick). Lowering this below
|
||||
/// an hour would generate more metadata churn than it saves.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class AuditLogPartitionMaintenanceOptions
|
||||
{
|
||||
/// <summary>Period of the maintenance tick in seconds (default 86 400 = 24 h).</summary>
|
||||
public int IntervalSeconds { get; set; } = 86_400;
|
||||
|
||||
/// <summary>
|
||||
/// Minimum number of future months that <c>pf_AuditLog_Month</c> must
|
||||
/// cover after each tick. Default 1 — i.e. as of mid-May the partition
|
||||
/// for the next full month (June) must already be present.
|
||||
/// </summary>
|
||||
public int LookaheadMonths { get; set; } = 1;
|
||||
}
|
||||
@@ -0,0 +1,151 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
|
||||
/// <c>pf_AuditLog_Month</c> forward once a day. Each tick opens a fresh DI
|
||||
/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
|
||||
/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> to SPLIT any
|
||||
/// missing future boundaries — the partition function must always cover at
|
||||
/// least <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>
|
||||
/// future months, otherwise inserts past the highest boundary accumulate in
|
||||
/// a single unbounded tail partition that <c>SwitchOutPartitionAsync</c>
|
||||
/// cannot purge cleanly.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Why a hosted service, not an actor.</b> Bundle C's
|
||||
/// <see cref="AuditLogPurgeActor"/> sits inside the central singleton
|
||||
/// because it needs supervised lifecycle alongside the rest of the
|
||||
/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day
|
||||
/// chore with no cross-actor coordination, so we use the much simpler
|
||||
/// hosted-service pattern: <c>Task.Run</c> on start, <c>Task.Delay</c>
|
||||
/// between ticks, cancellation on stop. Reusing
|
||||
/// <see cref="IPartitionMaintenance"/> from the central node-only DI graph
|
||||
/// keeps the contract testable without any actor framework involvement.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Failure containment.</b> The tick body wraps the maintenance call in
|
||||
/// a try/catch so a transient SQL Server error never tears down the hosted
|
||||
/// service — the next tick simply retries. The exception is logged with
|
||||
/// the original stack trace at <c>Error</c> level; ops surfaces (M6 Bundle
|
||||
/// E's central health collector) can subscribe to the logger to alert on
|
||||
/// repeated failures.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Startup ordering.</b> A first tick fires immediately at
|
||||
/// <see cref="StartAsync"/> so a fresh deployment doesn't need to wait
|
||||
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> for
|
||||
/// the partition function to come up to spec. This is also what the brief
|
||||
/// asks for ("Run once on startup").
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is scoped
|
||||
/// (alongside the rest of the EF repositories) because the implementation
|
||||
/// reuses the per-scope <c>ScadaBridgeDbContext</c>. A hosted service is a
|
||||
/// singleton, so it must open and dispose a scope around each tick — the
|
||||
/// same pattern <see cref="AuditLogPurgeActor"/> uses.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
|
||||
{
|
||||
private readonly IServiceScopeFactory _scopeFactory;
|
||||
private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
|
||||
private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;
|
||||
private CancellationTokenSource? _cts;
|
||||
private Task? _loop;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the maintenance service with its required dependencies.
|
||||
/// </summary>
|
||||
/// <param name="scopeFactory">Scope factory used to open DI scopes for each maintenance run.</param>
|
||||
/// <param name="options">Partition maintenance options (retention period, purge interval, etc.).</param>
|
||||
/// <param name="logger">Logger for this service.</param>
|
||||
public AuditLogPartitionMaintenanceService(
|
||||
IServiceScopeFactory scopeFactory,
|
||||
IOptions<AuditLogPartitionMaintenanceOptions> options,
|
||||
ILogger<AuditLogPartitionMaintenanceService> logger)
|
||||
{
|
||||
_scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task StartAsync(CancellationToken ct)
|
||||
{
|
||||
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
|
||||
// token both terminate the loop; either side firing aborts the
|
||||
// pending Task.Delay.
|
||||
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private async Task RunLoopAsync(CancellationToken ct)
|
||||
{
|
||||
// Run once on startup so a fresh deployment isn't gated on the
|
||||
// IntervalSeconds initial wait — the brief calls this out explicitly.
|
||||
await SafeMaintainAsync(ct).ConfigureAwait(false);
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(TimeSpan.FromSeconds(_options.Value.IntervalSeconds), ct)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
await SafeMaintainAsync(ct).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task SafeMaintainAsync(CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
await using var scope = _scopeFactory.CreateAsyncScope();
|
||||
var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
|
||||
var added = await maintenance
|
||||
.EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
|
||||
.ConfigureAwait(false);
|
||||
if (added.Count > 0)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
|
||||
added.Count,
|
||||
string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd"))));
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Catch-all is deliberate: the hosted service must survive every
|
||||
// class of tick failure (transient SQL, DI resolution, etc.) so
|
||||
// the next tick gets a chance. The brief's contract is
|
||||
// "exception logged, not propagated".
|
||||
_logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task StopAsync(CancellationToken ct)
|
||||
{
|
||||
_cts?.Cancel();
|
||||
return _loop ?? Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public void Dispose()
|
||||
{
|
||||
_cts?.Dispose();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,213 @@
|
||||
using System.Diagnostics;
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central singleton (M6 Bundle C) that drives the daily AuditLog partition
|
||||
/// purge. On a configurable timer (default 24 hours) the actor:
|
||||
/// <list type="number">
|
||||
/// <item>Queries <see cref="IAuditLogRepository.GetPartitionBoundariesOlderThanAsync"/>
|
||||
/// for monthly boundaries whose latest <c>OccurredAtUtc</c> is older
|
||||
/// than <c>DateTime.UtcNow - RetentionDays</c>.</item>
|
||||
/// <item>For each eligible boundary, calls
|
||||
/// <see cref="IAuditLogRepository.SwitchOutPartitionAsync"/> which runs
|
||||
/// the drop-and-rebuild dance around <c>UX_AuditLog_EventId</c>.</item>
|
||||
/// <item>Publishes <see cref="AuditLogPurgedEvent"/> on the actor-system
|
||||
/// EventStream so the Bundle E central health collector + ops surfaces
|
||||
/// can subscribe without coupling to this actor.</item>
|
||||
/// </list>
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Daily cadence.</b> Partition switch is metadata-only but the
|
||||
/// drop-and-rebuild dance briefly removes <c>UX_AuditLog_EventId</c>; running
|
||||
/// more often than necessary trades unique-index rebuild outages for
|
||||
/// negligible freshness wins. The default 24-hour interval matches
|
||||
/// alog.md §10's retention policy.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Continue-on-error.</b> A single boundary that throws (transient SQL
|
||||
/// failure, contention with backup, missing object) must NOT prevent the
|
||||
/// other eligible boundaries from being purged on the same tick. Per-boundary
|
||||
/// work runs inside its own try/catch — that per-boundary catch is what
|
||||
/// keeps the singleton alive across handler throws. The
|
||||
/// <see cref="SupervisorStrategy"/> override returns the Akka default
|
||||
/// decider (Restart) and governs children only; this actor has no children
|
||||
/// today, so the override is a forward-compat placeholder.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
|
||||
/// service registered by <c>AddConfigurationDatabase</c>. The singleton
|
||||
/// opens one DI scope per tick and reuses the same repository across every
|
||||
/// boundary in that tick — mirrors the
|
||||
/// <see cref="SiteAuditReconciliationActor"/> pattern.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>EventStream.</b> Publishing <see cref="AuditLogPurgedEvent"/> through
|
||||
/// the EventStream rather than direct messaging avoids coupling this actor
|
||||
/// to its consumers; M6 Bundle E will subscribe a central health-counter
|
||||
/// bridge that surfaces purge progress on the central health report.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class AuditLogPurgeActor : ReceiveActor
|
||||
{
|
||||
private readonly IServiceProvider _services;
|
||||
private readonly AuditLogPurgeOptions _purgeOptions;
|
||||
private readonly AuditLogOptions _auditOptions;
|
||||
private readonly ILogger<AuditLogPurgeActor> _logger;
|
||||
private ICancelable? _timer;
|
||||
|
||||
/// <summary>Initializes a new instance of <see cref="AuditLogPurgeActor"/> and registers the tick handler.</summary>
|
||||
/// <param name="services">DI service provider used to create scoped repository instances per tick.</param>
|
||||
/// <param name="purgeOptions">Options controlling the purge interval.</param>
|
||||
/// <param name="auditOptions">Options controlling retention policy (RetentionDays).</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
public AuditLogPurgeActor(
|
||||
IServiceProvider services,
|
||||
IOptions<AuditLogPurgeOptions> purgeOptions,
|
||||
IOptions<AuditLogOptions> auditOptions,
|
||||
ILogger<AuditLogPurgeActor> logger)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
ArgumentNullException.ThrowIfNull(purgeOptions);
|
||||
ArgumentNullException.ThrowIfNull(auditOptions);
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
|
||||
_services = services;
|
||||
_purgeOptions = purgeOptions.Value;
|
||||
_auditOptions = auditOptions.Value;
|
||||
_logger = logger;
|
||||
|
||||
ReceiveAsync<PurgeTick>(_ => OnTickAsync());
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PreStart()
|
||||
{
|
||||
base.PreStart();
|
||||
var interval = _purgeOptions.Interval;
|
||||
_timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
|
||||
initialDelay: interval,
|
||||
interval: interval,
|
||||
receiver: Self,
|
||||
message: PurgeTick.Instance,
|
||||
sender: Self);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PostStop()
|
||||
{
|
||||
_timer?.Cancel();
|
||||
base.PostStop();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
{
|
||||
return new OneForOneStrategy(
|
||||
maxNrOfRetries: 0,
|
||||
withinTimeRange: TimeSpan.Zero,
|
||||
decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
|
||||
}
|
||||
|
||||
private async Task OnTickAsync()
|
||||
{
|
||||
// Capture EventStream BEFORE the first await. Accessing Context (and
|
||||
// therefore Context.System) after an await is unsafe because Akka's
|
||||
// ActorBase.Context throws "no active ActorContext" once the
|
||||
// continuation runs on a thread that isn't currently dispatching this
|
||||
// actor — mirrors the same Sender-capture pattern in
|
||||
// AuditLogIngestActor.OnIngestAsync.
|
||||
var eventStream = Context.System.EventStream;
|
||||
|
||||
// Compute the retention threshold from AuditLogOptions.RetentionDays
|
||||
// each tick — the options class supports hot reload via
|
||||
// IOptionsMonitor for the redaction policy and similar settings; we
|
||||
// read the snapshot per-tick so an operator who lowers RetentionDays
|
||||
// sees the change applied on the next purge without an actor
|
||||
// restart.
|
||||
var threshold = DateTime.UtcNow - TimeSpan.FromDays(_auditOptions.RetentionDays);
|
||||
|
||||
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
|
||||
// services (IAsyncDisposable DbContexts) dispose asynchronously
|
||||
// without blocking on sync Dispose() of pending connection cleanup.
|
||||
await using var scope = _services.CreateAsyncScope();
|
||||
IAuditLogRepository repository;
|
||||
try
|
||||
{
|
||||
repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick.");
|
||||
return;
|
||||
}
|
||||
|
||||
IReadOnlyList<DateTime> boundaries;
|
||||
try
|
||||
{
|
||||
boundaries = await repository
|
||||
.GetPartitionBoundariesOlderThanAsync(threshold)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.",
|
||||
threshold);
|
||||
return;
|
||||
}
|
||||
|
||||
if (boundaries.Count == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
foreach (var boundary in boundaries)
|
||||
{
|
||||
// Per-boundary try/catch: one bad partition (transient SQL
|
||||
// failure, missing object, contention with backup) does NOT
|
||||
// abandon the rest of the tick.
|
||||
var sw = Stopwatch.StartNew();
|
||||
try
|
||||
{
|
||||
var rowsDeleted = await repository
|
||||
.SwitchOutPartitionAsync(boundary)
|
||||
.ConfigureAwait(false);
|
||||
sw.Stop();
|
||||
|
||||
eventStream.Publish(
|
||||
new AuditLogPurgedEvent(boundary, rowsDeleted, sw.ElapsedMilliseconds));
|
||||
|
||||
_logger.LogInformation(
|
||||
"Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.",
|
||||
boundary,
|
||||
rowsDeleted,
|
||||
sw.ElapsedMilliseconds);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
sw.Stop();
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.",
|
||||
boundary,
|
||||
sw.ElapsedMilliseconds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
|
||||
internal sealed class PurgeTick
|
||||
{
|
||||
public static readonly PurgeTick Instance = new();
|
||||
private PurgeTick() { }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Tuning knobs for the central <see cref="AuditLogPurgeActor"/> singleton.
|
||||
/// Default cadence is 24 hours per the M6 plan; the retention window itself
|
||||
/// is sourced from <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.RetentionDays"/>
|
||||
/// (default 365) so operators tune retention from a single section.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The purge actor is a daily-cadence singleton, not a hot-loop, because
|
||||
/// partition-switch I/O is metadata-only but the drop-and-rebuild dance
|
||||
/// briefly removes the <c>UX_AuditLog_EventId</c> unique index — running
|
||||
/// more often than necessary trades index-rebuild outages for marginal
|
||||
/// freshness gains. Lower this only when an operator can prove they need
|
||||
/// sub-daily purge granularity.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <see cref="IntervalOverride"/> exists for tests to drop the cadence to
|
||||
/// milliseconds without polluting the production config surface; production
|
||||
/// binds <see cref="IntervalHours"/> only.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class AuditLogPurgeOptions
|
||||
{
|
||||
/// <summary>Period of the purge tick in hours (default 24).</summary>
|
||||
public int IntervalHours { get; set; } = 24;
|
||||
|
||||
/// <summary>
|
||||
/// Test-only override for finer control over the tick cadence than
|
||||
/// whole-hour resolution allows. When non-null, takes precedence over
|
||||
/// <see cref="IntervalHours"/>. Not bound from config — production
|
||||
/// config exposes <see cref="IntervalHours"/> only.
|
||||
/// </summary>
|
||||
public TimeSpan? IntervalOverride { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Resolves the effective tick interval, honouring the test override
|
||||
/// when set. Falls back to <see cref="IntervalHours"/>.
|
||||
/// </summary>
|
||||
public TimeSpan Interval =>
|
||||
IntervalOverride ?? TimeSpan.FromHours(IntervalHours);
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Published on the actor-system EventStream by <see cref="AuditLogPurgeActor"/>
|
||||
/// after each successful partition switch-out. Downstream consumers (Bundle E
|
||||
/// central health collector, ops dashboards, audit trails) subscribe so a
|
||||
/// purge action is observable without the actor needing to know about any
|
||||
/// specific subscriber.
|
||||
/// </summary>
|
||||
/// <param name="MonthBoundary">
|
||||
/// The pf_AuditLog_Month lower-bound boundary that was switched out — i.e.
|
||||
/// the first instant of the purged month in UTC.
|
||||
/// </param>
|
||||
/// <param name="RowsDeleted">
|
||||
/// Approximate row count purged from the partition, sampled BEFORE the
|
||||
/// switch. Exact accounting would require a post-switch scan of the staging
|
||||
/// table, which the dance drops immediately, so this is the closest
|
||||
/// observable proxy. Zero is a valid value when the actor's enumerator
|
||||
/// included a partition the operator subsequently emptied by hand.
|
||||
/// </param>
|
||||
/// <param name="DurationMs">
|
||||
/// Wall-clock time spent inside <c>SwitchOutPartitionAsync</c> for this
|
||||
/// boundary, in milliseconds. Useful for spotting the rare slow purge
|
||||
/// without spinning up dedicated telemetry.
|
||||
/// </param>
|
||||
public sealed record AuditLogPurgedEvent(
|
||||
DateTime MonthBoundary,
|
||||
long RowsDeleted,
|
||||
long DurationMs);
|
||||
@@ -0,0 +1,61 @@
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T9) — bridges
|
||||
/// <see cref="IAuditRedactionFailureCounter"/> (incremented by
|
||||
/// <see cref="DefaultAuditPayloadFilter"/> every time a header / body / SQL
|
||||
/// parameter redactor stage throws and the filter has to over-redact the
|
||||
/// offending field) into <see cref="AuditCentralHealthSnapshot"/> so the
|
||||
/// failure surfaces on the central health surface as
|
||||
/// <c>AuditCentralHealthSnapshot.AuditRedactionFailure</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Site vs central.</b> M5 Bundle C wired the SITE-side bridge
|
||||
/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>),
|
||||
/// which routes increments into the site health report payload's
|
||||
/// <c>AuditRedactionFailure</c> field. That handles redactor failures on the
|
||||
/// site SQLite hot-path (FallbackAuditWriter). M6 Bundle E (T9) adds the
|
||||
/// MIRROR bridge here so the same payload filter — when it runs on the
|
||||
/// central <see cref="CentralAuditWriter"/> /
|
||||
/// <see cref="AuditLogIngestActor"/> paths — surfaces its failures on the
|
||||
/// central dashboard rather than disappearing into a NoOp.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Registration shape.</b> Site composition roots call
|
||||
/// <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>,
|
||||
/// which overrides the binding with the site bridge. Central composition
|
||||
/// roots call <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>,
|
||||
/// which overrides with this central bridge. A node never wears both hats —
|
||||
/// site and central are distinct host roles — so the two bridges never
|
||||
/// fight over the same binding at runtime.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Why not a thin wrapper around the snapshot directly?</b> The snapshot
|
||||
/// itself <i>could</i> be the bound implementation (it already implements
|
||||
/// <see cref="IAuditRedactionFailureCounter"/>), but a dedicated class makes
|
||||
/// the central-vs-site asymmetry explicit at the DI boundary — readers of
|
||||
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>
|
||||
/// see "site → site bridge, central → central bridge", matching the
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>
|
||||
/// shape one-for-one.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CentralAuditRedactionFailureCounter : IAuditRedactionFailureCounter
|
||||
{
|
||||
private readonly AuditCentralHealthSnapshot _snapshot;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new <see cref="CentralAuditRedactionFailureCounter"/> backed by the supplied snapshot.
|
||||
/// </summary>
|
||||
/// <param name="snapshot">The central health snapshot that accumulates the redaction failure count.</param>
|
||||
public CentralAuditRedactionFailureCounter(AuditCentralHealthSnapshot snapshot)
|
||||
{
|
||||
_snapshot = snapshot ?? throw new ArgumentNullException(nameof(snapshot));
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public void Increment() => ((IAuditRedactionFailureCounter)_snapshot).Increment();
|
||||
}
|
||||
@@ -0,0 +1,159 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central-only direct-write implementation of <see cref="ICentralAuditWriter"/>.
|
||||
/// Wraps <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> as a best-effort
|
||||
/// audit emission path for components that originate audit events ON the central
|
||||
/// node (Notification Outbox dispatch, Inbound API) — NOT for site telemetry
|
||||
/// ingest (that path is the SiteAudit → AuditLogIngestActor batched flow).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Best-effort contract.</b> Audit-write failures NEVER abort the user-facing
|
||||
/// action (alog.md §13). The writer catches every exception thrown by repository
|
||||
/// resolution or the insert call, logs at warning, and returns successfully.
|
||||
/// Callers may still wrap the call in their own try/catch (defensive — the writer
|
||||
/// is supposed to swallow).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Scope-per-call resolution.</b> <see cref="IAuditLogRepository"/> is a SCOPED
|
||||
/// EF Core service (registered by <c>ZB.MOM.WW.ScadaBridge.ConfigurationDatabase</c>). The
|
||||
/// writer itself is registered as a singleton (so all callers share one instance),
|
||||
/// so it cannot hold a scope across calls — it opens a fresh
|
||||
/// <see cref="IServiceScope"/> per <see cref="WriteAsync"/> invocation, mirroring
|
||||
/// the per-message scope pattern used by <c>AuditLogIngestActor</c> and
|
||||
/// <c>NotificationOutboxActor</c>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Idempotency.</b> Persistence is via <c>InsertIfNotExistsAsync</c>, so a
|
||||
/// double-emitted event (same <see cref="AuditEvent.EventId"/>) is a silent
|
||||
/// no-op — the writer is safe to call from any number of dispatch paths.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CentralAuditWriter : ICentralAuditWriter
|
||||
{
|
||||
private readonly IServiceProvider _services;
|
||||
private readonly ILogger<CentralAuditWriter> _logger;
|
||||
private readonly IAuditPayloadFilter _filter;
|
||||
private readonly ICentralAuditWriteFailureCounter _failureCounter;
|
||||
private readonly INodeIdentityProvider? _nodeIdentity;
|
||||
|
||||
/// <summary>
|
||||
/// Bundle C (M5-T6) — the central direct-write path used by the
|
||||
/// NotificationOutboxActor dispatch and the Inbound API middleware also
|
||||
/// needs to truncate + redact before the row hits MS SQL. The filter is
|
||||
/// optional so the M4 test composition roots that don't pass one keep
|
||||
/// working (they only ever write small payloads); production DI registers
|
||||
/// the real filter via <see cref="ServiceCollectionExtensions.AddAuditLog"/>.
|
||||
/// M6 Bundle E (T8) — adds the optional
|
||||
/// <see cref="ICentralAuditWriteFailureCounter"/> so a swallowed repository
|
||||
/// throw bumps the central health surface's
|
||||
/// <c>CentralAuditWriteFailures</c> counter. Defaults to a NoOp so test
|
||||
/// composition roots that don't wire the counter keep their current
|
||||
/// behaviour. SourceNode-stamping (Task 12) — adds the optional
|
||||
/// <see cref="INodeIdentityProvider"/> so central-origin rows (Notification
|
||||
/// Outbox dispatch, Inbound API) carry the writing central node's
|
||||
/// identifier when the caller hasn't already supplied one. Optional /
|
||||
/// defaulting-to-null so M4 test composition roots that don't pass a
|
||||
/// provider keep working — the caller-wins discipline means an absent
|
||||
/// provider simply leaves SourceNode at whatever the caller set (often
|
||||
/// null, which is the legacy behaviour).
|
||||
/// </summary>
|
||||
/// <param name="services">Service provider used to open a per-call scope for the scoped repository.</param>
|
||||
/// <param name="logger">Logger for swallowed write-failure diagnostics.</param>
|
||||
/// <param name="filter">Optional payload filter for truncation and redaction; defaults to a pass-through.</param>
|
||||
/// <param name="failureCounter">Optional counter incremented on swallowed repository failures; defaults to a no-op.</param>
|
||||
/// <param name="nodeIdentity">Optional node identity provider for stamping <c>SourceNode</c> on central-origin rows.</param>
|
||||
public CentralAuditWriter(
|
||||
IServiceProvider services,
|
||||
ILogger<CentralAuditWriter> logger,
|
||||
IAuditPayloadFilter? filter = null,
|
||||
ICentralAuditWriteFailureCounter? failureCounter = null,
|
||||
INodeIdentityProvider? nodeIdentity = null)
|
||||
{
|
||||
_services = services ?? throw new ArgumentNullException(nameof(services));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
// AuditLog-008: never default to null — over-redact instead.
|
||||
// SafeDefaultAuditPayloadFilter applies HTTP header redaction with
|
||||
// hard-coded sensitive defaults so a composition root that omits the
|
||||
// real filter still scrubs Authorization / X-Api-Key / Cookie /
|
||||
// Set-Cookie before persistence.
|
||||
_filter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
|
||||
_failureCounter = failureCounter ?? new NoOpCentralAuditWriteFailureCounter();
|
||||
_nodeIdentity = nodeIdentity;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
if (evt is null)
|
||||
{
|
||||
// Defensive — a null event is a programming bug at the caller and
|
||||
// produces no meaningful audit row. Log and return.
|
||||
_logger.LogWarning("CentralAuditWriter.WriteAsync received null event; ignoring.");
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
// Filter BEFORE stamping IngestedAtUtc + handing to the repo. The
|
||||
// filter contract is "never throws". AuditLog-008: _filter is now
|
||||
// non-null (SafeDefaultAuditPayloadFilter fallback) so header
|
||||
// redaction always runs even in composition roots that omit the
|
||||
// real filter.
|
||||
var filtered = _filter.Apply(evt);
|
||||
|
||||
// SourceNode-stamping (Task 12): caller-provided value wins
|
||||
// (supports any future direct-write callsite that already has its
|
||||
// own node id); otherwise stamp from the local
|
||||
// INodeIdentityProvider, when one is wired. Production DI on
|
||||
// central nodes always supplies the provider; legacy test
|
||||
// composition roots that don't pass it leave SourceNode at
|
||||
// whatever the caller set (often null), preserving back-compat.
|
||||
if (filtered.SourceNode is null && _nodeIdentity?.NodeName is { } nodeName)
|
||||
{
|
||||
filtered = filtered with { SourceNode = nodeName };
|
||||
}
|
||||
|
||||
await using var scope = _services.CreateAsyncScope();
|
||||
var repo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
var stamped = filtered with { IngestedAtUtc = DateTime.UtcNow };
|
||||
await repo.InsertIfNotExistsAsync(stamped, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Audit failure NEVER aborts the user-facing action — swallow and log.
|
||||
// M6 Bundle E (T8): also surface the failure on the central health
|
||||
// counter so a sustained audit-write outage is visible on the
|
||||
// health dashboard rather than disappearing into the log file.
|
||||
try
|
||||
{
|
||||
_failureCounter.Increment();
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Counter must NEVER throw — defence in depth. Even if a
|
||||
// misbehaving custom counter does, swallowing here keeps the
|
||||
// best-effort contract intact.
|
||||
}
|
||||
// Log the input event's identifying fields. These three (EventId,
|
||||
// Kind, Status) are immutable across the filter+stamp chain — the
|
||||
// `with` clones above touch only SourceNode and IngestedAtUtc — so
|
||||
// referencing `evt` here is intentional and equivalent to the
|
||||
// stamped record for diagnostics. If you add a field here that the
|
||||
// stamp chain DOES mutate (e.g., SourceNode), reference the latest
|
||||
// post-stamp record name instead, not `evt`.
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})",
|
||||
evt.EventId, evt.Kind, evt.Status);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E read-side surface exposing the central-side
|
||||
/// audit-health counters: <see cref="CentralAuditWriteFailures"/> (every
|
||||
/// repository insert throw from <see cref="CentralAuditWriter"/> /
|
||||
/// <see cref="AuditLogIngestActor"/>), <see cref="AuditRedactionFailure"/>
|
||||
/// (every payload-filter redactor throw on the central path), and
|
||||
/// <see cref="SiteAuditTelemetryStalled"/> (per-site latched state from the
|
||||
/// <see cref="SiteAuditTelemetryStalledTracker"/>).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Read-only contract.</b> Implementations expose a point-in-time snapshot
|
||||
/// — increments and tracker updates happen through the dedicated counter /
|
||||
/// tracker interfaces, not through this surface. Consumers (M7+ central
|
||||
/// health pages) read these properties; they never mutate.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Why a parallel surface from <see cref="ICentralHealthAggregator"/>.</b>
|
||||
/// <see cref="ICentralHealthAggregator"/> aggregates per-site
|
||||
/// <c>SiteHealthState</c> reports the SITE emits. The central audit-write
|
||||
/// failure / redaction-failure counters originate ON central (no site report
|
||||
/// carries them), so they live on a dedicated snapshot rather than being
|
||||
/// retro-fitted into a per-site state. The two surfaces will be composed at
|
||||
/// the M7 dashboard layer.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public interface IAuditCentralHealthSnapshot
|
||||
{
|
||||
/// <summary>
|
||||
/// Count of central-side audit-write failures since process start.
|
||||
/// Incremented by every <see cref="CentralAuditWriter"/> /
|
||||
/// <see cref="AuditLogIngestActor"/> repository insert that throws.
|
||||
/// </summary>
|
||||
int CentralAuditWriteFailures { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Count of central-side payload-filter redactor over-redactions since
|
||||
/// process start. Incremented by every header / body / SQL-parameter
|
||||
/// redactor stage that throws (the filter falls back to the
|
||||
/// <c><redacted: redactor error></c> marker and never aborts the
|
||||
/// user-facing action). Sites have their own counter
|
||||
/// (<see cref="IAuditRedactionFailureCounter"/>-backed
|
||||
/// <c>SiteHealthReport.AuditRedactionFailure</c>) and the central
|
||||
/// composition root's binding routes ALL central redactor throws
|
||||
/// (CentralAuditWriter + AuditLogIngestActor paths) into this counter.
|
||||
/// </summary>
|
||||
int AuditRedactionFailure { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Per-site latched stalled state: <c>true</c> when the
|
||||
/// <see cref="SiteAuditReconciliationActor"/> has observed two
|
||||
/// consecutive non-draining cycles for that site, <c>false</c> after the
|
||||
/// first draining cycle. Sites absent from the map are interpreted as
|
||||
/// healthy (<c>Stalled=false</c> default). Snapshot is a defensive
|
||||
/// copy — readers must not mutate.
|
||||
/// </summary>
|
||||
IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled { get; }
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T8) counter sink invoked by central-side audit
|
||||
/// writers (<see cref="CentralAuditWriter"/>, <see cref="AuditLogIngestActor"/>)
|
||||
/// every time a repository <c>InsertIfNotExistsAsync</c> throws. Mirrors the
|
||||
/// site-side <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.IAuditWriteFailureCounter"/>
|
||||
/// shape one-for-one — same one-method contract, same NoOp default, same
|
||||
/// must-never-abort-the-user-facing-action invariant.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Audit-write failures NEVER abort the user-facing action (alog.md §13) —
|
||||
/// the writer swallows the exception and surfaces the failure via this counter
|
||||
/// instead. A NoOp default is the correct safe fallback while the central
|
||||
/// health surface is being wired in; <see cref="AuditCentralHealthSnapshot"/>
|
||||
/// is the production binding that routes increments into the aggregated
|
||||
/// central health snapshot consumed by future M7+ pages.
|
||||
/// </remarks>
|
||||
public interface ICentralAuditWriteFailureCounter
|
||||
{
|
||||
/// <summary>Increment the central audit-write failure counter by one.</summary>
|
||||
void Increment();
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Mockable abstraction over the central-side <c>PullAuditEvents</c> gRPC
|
||||
/// client surface that <see cref="SiteAuditReconciliationActor"/> uses to
|
||||
/// fetch the next reconciliation batch from a specific site. Extracted so the
|
||||
/// actor can be unit-tested against an in-memory stub without standing up a
|
||||
/// real <c>GrpcChannel</c> per site.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The production implementation (host wiring task) wraps the auto-generated
|
||||
/// <c>SiteStreamService.SiteStreamServiceClient</c>, multiplexing one
|
||||
/// <c>GrpcChannel</c> per site keyed on
|
||||
/// <see cref="SiteEntry.GrpcEndpoint"/>. Until that wiring lands the DI
|
||||
/// composition root binds a NoOp default that returns an empty response — the
|
||||
/// reconciliation tick is still scheduled and the cursor logic still runs, so
|
||||
/// regressions in the actor itself are caught even before the real client
|
||||
/// arrives.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Implementations MUST NOT throw on transport faults that the actor can
|
||||
/// tolerate (connection refused, deadline exceeded). The actor's contract is
|
||||
/// "one site's failure doesn't sink the rest of the tick"; an exception still
|
||||
/// won't crash the actor (the per-site try/catch catches it), but returning
|
||||
/// an empty response on a known-recoverable error keeps the logs cleaner.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public interface IPullAuditEventsClient
|
||||
{
|
||||
/// <summary>
|
||||
/// Issues a <c>PullAuditEvents</c> RPC against the site whose endpoint
|
||||
/// is registered against <paramref name="siteId"/>. Returns the next
|
||||
/// batch of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
|
||||
/// rows ordered oldest-first AND a <c>MoreAvailable</c> flag the actor
|
||||
/// uses to decide whether to fire another pull immediately.
|
||||
/// </summary>
|
||||
/// <param name="siteId">The identifier of the site to pull audit events from.</param>
|
||||
/// <param name="sinceUtc">Only events with an <c>OccurredAtUtc</c> at or after this cursor time are returned.</param>
|
||||
/// <param name="batchSize">Maximum number of events to return per call.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
Task<PullAuditEventsResponse> PullAsync(
|
||||
string siteId,
|
||||
DateTime sinceUtc,
|
||||
int batchSize,
|
||||
CancellationToken ct);
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Enumeration surface consumed by <see cref="SiteAuditReconciliationActor"/> to
|
||||
/// discover which sites to poll on each reconciliation tick. Extracted so the
|
||||
/// actor can be unit-tested against a static list without depending on the
|
||||
/// production <c>ISiteRepository</c> + EF Core DbContext.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The production implementation wraps <c>ISiteRepository.GetAllSitesAsync</c>
|
||||
/// and projects each <c>Site</c> to a <see cref="SiteEntry"/> using the
|
||||
/// site's configured <c>GrpcNodeAAddress</c> (falling back to
|
||||
/// <c>GrpcNodeBAddress</c> when NodeA is unset). Sites with NO gRPC address
|
||||
/// configured are silently skipped — the reconciliation pull cannot reach
|
||||
/// them, but absence of an address is a configuration decision, not a runtime
|
||||
/// error.
|
||||
/// </remarks>
|
||||
public interface ISiteEnumerator
|
||||
{
|
||||
/// <summary>
|
||||
/// Returns the current set of sites the reconciliation puller should visit
|
||||
/// on the next tick. Implementations should reflect adds/removes promptly
|
||||
/// — the actor calls this once per tick.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token for the async enumeration.</param>
|
||||
Task<IReadOnlyList<SiteEntry>> EnumerateAsync(CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// One reconciliation target: the site identifier the actor uses as the
|
||||
/// cursor key and the gRPC endpoint <see cref="IPullAuditEventsClient"/> dials
|
||||
/// to issue the pull. Endpoint is the bare authority (e.g. <c>http://siteA:8083</c>);
|
||||
/// transport selection (TLS, keepalive, etc.) is the client's concern.
|
||||
/// </summary>
|
||||
public sealed record SiteEntry(string SiteId, string GrpcEndpoint);
|
||||
@@ -0,0 +1,17 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="ICentralAuditWriteFailureCounter"/> binding used when
|
||||
/// the central health surface (<see cref="AuditCentralHealthSnapshot"/>) has
|
||||
/// not been wired (test composition roots, site-only hosts that incidentally
|
||||
/// resolve a <see cref="CentralAuditWriter"/>). Drops every increment on the
|
||||
/// floor. Mirrors <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.NoOpAuditWriteFailureCounter"/>.
|
||||
/// </summary>
|
||||
public sealed class NoOpCentralAuditWriteFailureCounter : ICentralAuditWriteFailureCounter
|
||||
{
|
||||
/// <inheritdoc/>
|
||||
public void Increment()
|
||||
{
|
||||
// intentional no-op
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,387 @@
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation
|
||||
/// pull loop. On a configurable timer (default 5 minutes) the actor walks every
|
||||
/// known site, asks the site for any <see cref="AuditEvent"/> rows with
|
||||
/// <see cref="AuditEvent.OccurredAtUtc"/> >= the site's last reconciled
|
||||
/// cursor, ingests them idempotently into the central
|
||||
/// <see cref="IAuditLogRepository"/>, and advances the cursor.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Self-healing telemetry, not a dispatcher.</b> The push path
|
||||
/// (<see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry.SiteAuditTelemetryActor"/> +
|
||||
/// <c>IngestAuditEvents</c>) is the primary mechanism. This actor exists so a
|
||||
/// missed push (gRPC blip, central restart, site offline) is eventually
|
||||
/// repaired by central re-pulling whatever the site still has in
|
||||
/// <c>Pending</c>/<c>Forwarded</c> state. Idempotency on
|
||||
/// <see cref="AuditEvent.EventId"/> (M2 Bundle A's race-fix) makes duplicate
|
||||
/// arrivals from both paths a silent no-op.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Cursor lifetime.</b> The per-site <c>LastReconciledAt</c> watermark is
|
||||
/// kept in-memory for the actor's lifetime. The cluster singleton normally
|
||||
/// survives the host process; on a deliberate failover OR a singleton restart
|
||||
/// the cursors reset to <see cref="DateTime.MinValue"/>. That is conservative
|
||||
/// but correct — the next tick simply asks for everything the site still has,
|
||||
/// and idempotent ingest swallows the dupes. Persisting cursors to MS SQL was
|
||||
/// considered and rejected for M6: the cost of a write per tick outweighs the
|
||||
/// rare benefit of avoiding one over-broad pull after a restart.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Stalled detection.</b> The brief calls a site "stalled" when two
|
||||
/// consecutive pull cycles BOTH return non-empty AND <c>MoreAvailable=true</c>
|
||||
/// — i.e. the backlog isn't draining. The actor publishes
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> on the actor system's
|
||||
/// EventStream so a future <c>ICentralHealthCollector</c> bridge (M6 Bundle E)
|
||||
/// can flip the health metric without coupling this actor to the health
|
||||
/// collection surface today.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Failure isolation.</b> A single site that throws (DNS, transport,
|
||||
/// repository write) must NOT prevent other sites from being polled on the
|
||||
/// same tick. The per-site work runs inside its own try/catch — that
|
||||
/// per-site catch is what keeps the actor running across handler throws.
|
||||
/// The <see cref="SupervisorStrategy"/> override returns
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart
|
||||
/// semantics) and governs children only; this actor has no children today,
|
||||
/// so the override is a forward-compat placeholder. If it ever did fire,
|
||||
/// restart would reset the in-memory cursors — but as noted above that's
|
||||
/// a safe (over-pull, idempotent) recovery.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
|
||||
/// service registered by <c>AddConfigurationDatabase</c>. The singleton actor
|
||||
/// opens one DI scope per tick and reuses the same repository across all
|
||||
/// sites in that tick — one DbContext per tick mirrors the
|
||||
/// <c>AuditLogIngestActor</c> + <c>NotificationOutboxActor</c> pattern.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class SiteAuditReconciliationActor : ReceiveActor
|
||||
{
|
||||
private readonly ISiteEnumerator _sites;
|
||||
private readonly IPullAuditEventsClient _client;
|
||||
private readonly IServiceProvider _services;
|
||||
private readonly SiteAuditReconciliationOptions _options;
|
||||
private readonly ILogger<SiteAuditReconciliationActor> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Per-site reconciliation watermark — the highest
|
||||
/// <see cref="AuditEvent.OccurredAtUtc"/> seen for that site on a previous
|
||||
/// tick. Asking for <c>OccurredAtUtc >= cursor</c> rather than >
|
||||
/// is the site contract (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.ISiteAuditQueue.ReadPendingSinceAsync"/>);
|
||||
/// duplicate-with-same-timestamp rows are filtered out by the idempotent
|
||||
/// repository write.
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, DateTime> _cursors = new();
|
||||
|
||||
/// <summary>
|
||||
/// Per-site count of consecutive non-draining cycles. Resets to zero on the
|
||||
/// first draining (or empty) cycle.
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, int> _nonDrainingCycles = new();
|
||||
|
||||
/// <summary>
|
||||
/// Per-site latched stalled state — used so the actor only publishes a
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> transition when the
|
||||
/// stalled flag actually changes, not on every tick while stalled.
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, bool> _stalled = new();
|
||||
|
||||
/// <summary>
|
||||
/// AuditLog-004: per-EventId retry counter for rows whose central insert
|
||||
/// threw. While a row keeps failing AND is below
|
||||
/// <see cref="MaxPermanentInsertAttempts"/>, the cursor is held back so the
|
||||
/// next reconciliation tick re-pulls and retries the row. Crossing the
|
||||
/// threshold logs Critical and permanently abandons the row (cursor
|
||||
/// advances past it) so a truly broken row cannot block all subsequent
|
||||
/// progress for a site. The counter is in-memory only — singleton restart
|
||||
/// resets it, which is safe because the cursor also resets on restart and
|
||||
/// the next tick re-pulls everything.
|
||||
/// </summary>
|
||||
private readonly Dictionary<Guid, int> _failedInsertAttempts = new();
|
||||
|
||||
/// <summary>
|
||||
/// AuditLog-004: number of consecutive central-insert failures before a row
|
||||
/// is permanently abandoned with a Critical log entry and the cursor is
|
||||
/// allowed to advance past it. Five attempts at the 5-minute default tick
|
||||
/// is ~25 min of retry budget before a stuck row stops blocking progress.
|
||||
/// </summary>
|
||||
private const int MaxPermanentInsertAttempts = 5;
|
||||
|
||||
private ICancelable? _timer;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the reconciliation actor with its dependencies and registers the tick handler.
|
||||
/// </summary>
|
||||
/// <param name="sites">Enumerates the known sites to reconcile.</param>
|
||||
/// <param name="client">Client used to pull audit events from individual sites.</param>
|
||||
/// <param name="services">Root service provider for opening a per-tick DI scope.</param>
|
||||
/// <param name="options">Reconciliation configuration (interval, page size).</param>
|
||||
/// <param name="logger">Logger for reconciliation diagnostics.</param>
|
||||
public SiteAuditReconciliationActor(
|
||||
ISiteEnumerator sites,
|
||||
IPullAuditEventsClient client,
|
||||
IServiceProvider services,
|
||||
IOptions<SiteAuditReconciliationOptions> options,
|
||||
ILogger<SiteAuditReconciliationActor> logger)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(sites);
|
||||
ArgumentNullException.ThrowIfNull(client);
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
ArgumentNullException.ThrowIfNull(options);
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
|
||||
_sites = sites;
|
||||
_client = client;
|
||||
_services = services;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
|
||||
ReceiveAsync<ReconciliationTick>(_ => OnTickAsync());
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PreStart()
|
||||
{
|
||||
base.PreStart();
|
||||
var interval = _options.ReconciliationInterval;
|
||||
_timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
|
||||
initialDelay: interval,
|
||||
interval: interval,
|
||||
receiver: Self,
|
||||
message: ReconciliationTick.Instance,
|
||||
sender: Self);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PostStop()
|
||||
{
|
||||
_timer?.Cancel();
|
||||
base.PostStop();
|
||||
}
|
||||
|
||||
private async Task OnTickAsync()
|
||||
{
|
||||
// Capture EventStream BEFORE the first await. Accessing Context (and
|
||||
// therefore Context.System) after an await is unsafe because Akka's
|
||||
// ActorBase.Context throws "no active ActorContext" once the
|
||||
// continuation runs on a thread that isn't currently dispatching this
|
||||
// actor — mirrors the AuditLogPurgeActor.OnTickAsync fix and the
|
||||
// AuditLogIngestActor.OnIngestAsync Sender-capture pattern.
|
||||
var eventStream = Context.System.EventStream;
|
||||
|
||||
IReadOnlyList<SiteEntry> sites;
|
||||
try
|
||||
{
|
||||
sites = await _sites.EnumerateAsync().ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Site enumeration failed; skipping reconciliation tick.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (sites.Count == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
|
||||
// services (IAsyncDisposable DbContexts) dispose asynchronously
|
||||
// without blocking on sync Dispose() of pending connection cleanup.
|
||||
await using var scope = _services.CreateAsyncScope();
|
||||
IAuditLogRepository repository;
|
||||
try
|
||||
{
|
||||
repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to resolve IAuditLogRepository for reconciliation tick.");
|
||||
return;
|
||||
}
|
||||
|
||||
foreach (var site in sites)
|
||||
{
|
||||
try
|
||||
{
|
||||
await PullSiteAsync(site, repository, eventStream).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Catch-all per the failure-isolation invariant: one site's
|
||||
// fault must not sink the rest of the tick. The cursor for
|
||||
// the failing site is left at its previous value so the
|
||||
// next tick retries the same window.
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"Reconciliation pull failed for site {SiteId}; other sites continue.",
|
||||
site.SiteId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Issues one <c>PullAuditEvents</c> RPC against the site, ingests the
|
||||
/// returned rows idempotently into the central repository, and advances
|
||||
/// the cursor based on the maximum <see cref="AuditEvent.OccurredAtUtc"/>
|
||||
/// observed. The brief's "saturate until backlog clears" intent is met by
|
||||
/// the natural cadence — each tick issues one pull, and a backed-up site
|
||||
/// drains across consecutive ticks. The stalled signal (two non-draining
|
||||
/// ticks in a row) surfaces when that drain isn't keeping up.
|
||||
/// </summary>
|
||||
private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository, Akka.Event.EventStream eventStream)
|
||||
{
|
||||
var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
|
||||
var response = await _client.PullAsync(
|
||||
site.SiteId, since, _options.BatchSize, CancellationToken.None)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
var maxOccurred = since;
|
||||
var hasUnresolvedFailure = false;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
foreach (var evt in response.Events)
|
||||
{
|
||||
var advanceForThisRow = false;
|
||||
try
|
||||
{
|
||||
// Idempotent repository write: duplicate EventIds (from a
|
||||
// concurrent push, or a retry of this very pull) collapse to
|
||||
// a no-op courtesy of M2 Bundle A's race-fix on
|
||||
// InsertIfNotExistsAsync.
|
||||
var ingested = evt with { IngestedAtUtc = nowUtc };
|
||||
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
|
||||
_failedInsertAttempts.Remove(evt.EventId);
|
||||
advanceForThisRow = true;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// AuditLog-004: per-row catch so one bad event does not abandon
|
||||
// the rest of the batch. Track the failure count per EventId —
|
||||
// below MaxPermanentInsertAttempts the cursor is HELD BACK so
|
||||
// the next tick re-pulls and retries; at the threshold the row
|
||||
// is permanently abandoned (LogCritical + cursor advances past)
|
||||
// to keep a truly broken row from blocking all subsequent
|
||||
// progress for the site.
|
||||
var attempts = _failedInsertAttempts.GetValueOrDefault(evt.EventId) + 1;
|
||||
_failedInsertAttempts[evt.EventId] = attempts;
|
||||
|
||||
if (attempts >= MaxPermanentInsertAttempts)
|
||||
{
|
||||
_logger.LogCritical(
|
||||
ex,
|
||||
"Permanently abandoning AuditEvent {EventId} from site {SiteId} after {Attempts} consecutive insert failures; cursor will advance past it.",
|
||||
evt.EventId,
|
||||
site.SiteId,
|
||||
attempts);
|
||||
_failedInsertAttempts.Remove(evt.EventId);
|
||||
advanceForThisRow = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId} (attempt {Attempts}/{Max}); cursor held back for retry.",
|
||||
evt.EventId,
|
||||
site.SiteId,
|
||||
attempts,
|
||||
MaxPermanentInsertAttempts);
|
||||
hasUnresolvedFailure = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (advanceForThisRow && evt.OccurredAtUtc > maxOccurred)
|
||||
{
|
||||
maxOccurred = evt.OccurredAtUtc;
|
||||
}
|
||||
}
|
||||
|
||||
// AuditLog-004: only advance the persisted cursor if no event in this
|
||||
// batch is still being retried. Leaving the cursor at `since` re-pulls
|
||||
// the whole batch next tick — successful rows are no-ops thanks to
|
||||
// InsertIfNotExistsAsync's idempotency, and the failing row gets
|
||||
// another attempt. Once it succeeds (or hits the permanent-abandon
|
||||
// threshold) the cursor unblocks naturally.
|
||||
_cursors[site.SiteId] = hasUnresolvedFailure ? since : maxOccurred;
|
||||
|
||||
var nonDraining = response.MoreAvailable && response.Events.Count > 0;
|
||||
UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Flips the per-site stalled flag based on whether this tick drained the
|
||||
/// queue. A "draining" cycle is one where the server reported no more rows
|
||||
/// available OR returned zero events. A "non-draining" cycle is the
|
||||
/// inverse (events returned AND <c>MoreAvailable=true</c>).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The state machine: counter increments on each consecutive non-draining
|
||||
/// tick. On reaching <see cref="SiteAuditReconciliationOptions.StalledAfterNonDrainingCycles"/>
|
||||
/// the actor latches <c>Stalled=true</c> and publishes the transition; on
|
||||
/// any subsequent draining tick the counter resets to zero AND, if the
|
||||
/// latch is currently true, the actor publishes <c>Stalled=false</c>. Only
|
||||
/// transitions are published — repeated ticks in the same state are
|
||||
/// silent so a downstream subscriber doesn't see a flood of redundant
|
||||
/// notifications.
|
||||
/// </remarks>
|
||||
private void UpdateStalledState(string siteId, bool draining, Akka.Event.EventStream eventStream)
|
||||
{
|
||||
var wasStalled = _stalled.TryGetValue(siteId, out var prior) && prior;
|
||||
|
||||
if (draining)
|
||||
{
|
||||
_nonDrainingCycles[siteId] = 0;
|
||||
if (wasStalled)
|
||||
{
|
||||
_stalled[siteId] = false;
|
||||
eventStream.Publish(
|
||||
new SiteAuditTelemetryStalledChanged(siteId, Stalled: false));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
var consecutive = _nonDrainingCycles.GetValueOrDefault(siteId) + 1;
|
||||
_nonDrainingCycles[siteId] = consecutive;
|
||||
|
||||
if (consecutive >= _options.StalledAfterNonDrainingCycles && !wasStalled)
|
||||
{
|
||||
_stalled[siteId] = true;
|
||||
eventStream.Publish(
|
||||
new SiteAuditTelemetryStalledChanged(siteId, Stalled: true));
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
{
|
||||
return new OneForOneStrategy(
|
||||
maxNrOfRetries: 0,
|
||||
withinTimeRange: TimeSpan.Zero,
|
||||
decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
|
||||
}
|
||||
|
||||
/// <summary>Self-tick triggering a reconciliation pass across all sites.</summary>
|
||||
internal sealed class ReconciliationTick
|
||||
{
|
||||
public static readonly ReconciliationTick Instance = new();
|
||||
private ReconciliationTick() { }
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Published on the actor system EventStream when a site's reconciliation
|
||||
/// puller transitions into or out of the "stalled" state (backlog not
|
||||
/// draining across multiple cycles). The M6 Bundle E central health collector
|
||||
/// will subscribe to this and surface
|
||||
/// <c>SiteAuditTelemetryStalled</c> on the health-report payload.
|
||||
/// </summary>
|
||||
public sealed record SiteAuditTelemetryStalledChanged(string SiteId, bool Stalled);
|
||||
@@ -0,0 +1,60 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Tuning knobs for the central <see cref="SiteAuditReconciliationActor"/> singleton.
|
||||
/// Defaults mirror the M6 Bundle B brief: pull every 5 minutes per site, 256 rows per
|
||||
/// batch, declare a site "stalled" after two consecutive pull cycles return non-empty
|
||||
/// AND <c>MoreAvailable=true</c> (the backlog is not draining).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Per the M6 plan the reconciliation actor is the fallback when push telemetry is
|
||||
/// lost; it is intentionally low-frequency. Lowering
|
||||
/// <see cref="ReconciliationIntervalSeconds"/> in production trades MS SQL load for
|
||||
/// fresher self-healing — keep the default unless a deployment can prove the extra
|
||||
/// load is acceptable.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <see cref="StalledAfterNonDrainingCycles"/> = 2 because a single non-draining
|
||||
/// cycle can happen on a surge (e.g. a backed-up site replays its hot queue); the
|
||||
/// stalled signal should only fire when the backlog persists across cycles, which is
|
||||
/// the symptom the central health surface is asking us to detect.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class SiteAuditReconciliationOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Period of the reconciliation tick. Each tick visits every known site once.
|
||||
/// </summary>
|
||||
public int ReconciliationIntervalSeconds { get; set; } = 300;
|
||||
|
||||
/// <summary>
|
||||
/// Test-only override for finer control over the tick cadence than
|
||||
/// whole-second resolution allows. When non-null, takes precedence over
|
||||
/// <see cref="ReconciliationIntervalSeconds"/>. Not bound from config —
|
||||
/// production config exposes <see cref="ReconciliationIntervalSeconds"/>
|
||||
/// only.
|
||||
/// </summary>
|
||||
public TimeSpan? ReconciliationIntervalOverride { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Resolves the effective tick interval, honouring the test override when
|
||||
/// set. Falls back to <see cref="ReconciliationIntervalSeconds"/>.
|
||||
/// </summary>
|
||||
public TimeSpan ReconciliationInterval =>
|
||||
ReconciliationIntervalOverride ?? TimeSpan.FromSeconds(ReconciliationIntervalSeconds);
|
||||
|
||||
/// <summary>
|
||||
/// Maximum number of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
|
||||
/// rows requested in a single <c>PullAuditEvents</c> RPC call.
|
||||
/// </summary>
|
||||
public int BatchSize { get; set; } = 256;
|
||||
|
||||
/// <summary>
|
||||
/// Number of consecutive non-draining cycles (events returned AND
|
||||
/// <c>MoreAvailable=true</c>) that must accumulate for a site before the actor
|
||||
/// publishes <c>SiteAuditTelemetryStalledChanged(Stalled: true)</c> on the
|
||||
/// EventStream.
|
||||
/// </summary>
|
||||
public int StalledAfterNonDrainingCycles { get; set; } = 2;
|
||||
}
|
||||
@@ -0,0 +1,203 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Akka.Actor;
|
||||
using Akka.Event;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T7) — central singleton that subscribes to the
|
||||
/// actor system's EventStream for <see cref="SiteAuditTelemetryStalledChanged"/>
|
||||
/// publications and maintains a per-site latched stalled-state map readable
|
||||
/// via <see cref="Snapshot"/>. Consumed by the M6 Bundle E
|
||||
/// <see cref="AuditCentralHealthSnapshot"/> aggregator so the central health
|
||||
/// surface can surface per-site "reconciliation isn't draining" without
|
||||
/// coupling the publisher (<see cref="SiteAuditReconciliationActor"/>) to the
|
||||
/// health collection plumbing.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Why an internal actor.</b> Akka.NET's <see cref="EventStream"/> only
|
||||
/// supports <see cref="IActorRef"/> subscribers — there is no callback or
|
||||
/// channel-based overload. The tracker therefore spawns a small subscriber
|
||||
/// actor that forwards each event into the shared
|
||||
/// <see cref="ConcurrentDictionary{TKey,TValue}"/> on the actor's thread, and
|
||||
/// readers (<see cref="Snapshot"/>) take a copy off that dictionary on any
|
||||
/// thread. Mirrors the <c>DeadLetterMonitorActor</c> shape — subscribe in
|
||||
/// <see cref="ActorBase.PreStart"/>, unsubscribe in
|
||||
/// <see cref="ActorBase.PostStop"/>, which the tracker triggers via a Stop
|
||||
/// at <see cref="Dispose"/>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Per-site latching.</b> The publisher (<see cref="SiteAuditReconciliationActor"/>)
|
||||
/// only publishes on stalled-state transitions, so the dictionary is the
|
||||
/// authoritative latched state. Sites that have never published are absent
|
||||
/// from the snapshot — the consumer surface treats absence as
|
||||
/// <c>Stalled=false</c> (default healthy), the same default the reconciliation
|
||||
/// actor's own internal latch uses.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Singleton lifecycle.</b> Registered as a singleton via
|
||||
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>;
|
||||
/// <see cref="Dispose"/> tears the internal subscriber down at host shutdown.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class SiteAuditTelemetryStalledTracker : IDisposable
|
||||
{
|
||||
private readonly EventStream _eventStream;
|
||||
private readonly ConcurrentDictionary<string, bool> _state = new();
|
||||
private readonly IActorRef? _subscriber;
|
||||
private readonly AuditCentralHealthSnapshot? _snapshot;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>
|
||||
/// Construct around a bare <see cref="EventStream"/>. Intended for unit
|
||||
/// tests where the caller wants to publish events without standing up an
|
||||
/// actor system — the tracker registers a transient subscriber actor only
|
||||
/// if the supplied stream is backed by an actor system. In the bare-stream
|
||||
/// mode (no actor system) the tracker still exposes the
|
||||
/// <see cref="Snapshot"/> surface but cannot self-subscribe; production
|
||||
/// callers always go through <see cref="SiteAuditTelemetryStalledTracker(ActorSystem)"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Subscribing to <see cref="EventStream"/> requires an <see cref="IActorRef"/>,
|
||||
/// which can only be created from an <see cref="ActorSystem"/>. The bare-
|
||||
/// stream ctor therefore can NOT itself wire the subscriber — tests that
|
||||
/// want event-driven updates must use the ActorSystem ctor (or push state
|
||||
/// directly via <see cref="Apply"/>). The tests in
|
||||
/// <c>SiteAuditTelemetryStalledTrackerTests</c> use the ActorSystem ctor
|
||||
/// via Akka.TestKit so they exercise the production subscribe path.
|
||||
/// </remarks>
|
||||
/// <param name="eventStream">The actor system event stream to observe.</param>
|
||||
public SiteAuditTelemetryStalledTracker(EventStream eventStream)
|
||||
: this(eventStream, snapshot: null)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Bare-stream ctor with an optional snapshot sink — the central
|
||||
/// composition root passes the singleton
|
||||
/// <see cref="AuditCentralHealthSnapshot"/> so every dictionary update
|
||||
/// also lands on the central health surface. The bare ctor still cannot
|
||||
/// subscribe (no actor system), but tests that drive the tracker via
|
||||
/// <see cref="Apply"/> get the snapshot push for free.
|
||||
/// </summary>
|
||||
/// <param name="eventStream">The actor system event stream to observe.</param>
|
||||
/// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
|
||||
public SiteAuditTelemetryStalledTracker(EventStream eventStream, AuditCentralHealthSnapshot? snapshot)
|
||||
{
|
||||
_eventStream = eventStream ?? throw new ArgumentNullException(nameof(eventStream));
|
||||
// No subscriber actor — see the remarks on the parameterless overload.
|
||||
_subscriber = null;
|
||||
_snapshot = snapshot;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Production ctor: subscribes a small internal actor to the supplied
|
||||
/// system's EventStream so every published
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> updates the latched
|
||||
/// per-site map. <see cref="Dispose"/> tears the subscriber down.
|
||||
/// </summary>
|
||||
/// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
|
||||
public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem)
|
||||
: this(actorSystem, snapshot: null)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Production ctor with a snapshot sink — every observed
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> is mirrored onto the
|
||||
/// shared <see cref="AuditCentralHealthSnapshot"/> so the central health
|
||||
/// surface sees per-site stalled state without re-reading the tracker.
|
||||
/// </summary>
|
||||
/// <param name="actorSystem">The actor system whose EventStream will be subscribed.</param>
|
||||
/// <param name="snapshot">Optional central health snapshot to mirror stalled-state changes into.</param>
|
||||
public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem, AuditCentralHealthSnapshot? snapshot)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(actorSystem);
|
||||
_eventStream = actorSystem.EventStream;
|
||||
_snapshot = snapshot;
|
||||
// Anonymous subscriber actor scoped to the system; props build it
|
||||
// with a callback into THIS tracker's Apply method so the actor's
|
||||
// single-threaded receive serialises every dictionary write.
|
||||
_subscriber = actorSystem.ActorOf(
|
||||
Props.Create(() => new StalledChangedSubscriber(this)),
|
||||
name: $"site-audit-stalled-tracker-{Guid.NewGuid():N}");
|
||||
// Subscribe synchronously from the ctor so the subscription is in
|
||||
// place before the tracker is returned to the caller — the actor's
|
||||
// own PreStart runs asynchronously and would otherwise race the
|
||||
// first publish. EventStream.Subscribe is thread-safe.
|
||||
_eventStream.Subscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a defensive copy of the per-site latched stalled state.
|
||||
/// Absent sites are interpreted as <c>Stalled=false</c> by consumers.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<string, bool> Snapshot() =>
|
||||
new Dictionary<string, bool>(_state);
|
||||
|
||||
/// <summary>
|
||||
/// Applied by the internal subscriber actor on every
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> publication. Exposed
|
||||
/// internally so tests against the bare-stream ctor can still drive the
|
||||
/// tracker, but the production path always goes through the actor.
|
||||
/// </summary>
|
||||
/// <param name="evt">The stalled-state change event to apply.</param>
|
||||
internal void Apply(SiteAuditTelemetryStalledChanged evt)
|
||||
{
|
||||
if (evt is null) return;
|
||||
_state[evt.SiteId] = evt.Stalled;
|
||||
// Mirror into the central health snapshot if wired so a reader of
|
||||
// IAuditCentralHealthSnapshot sees the same per-site state without
|
||||
// a second lookup. Snapshot is optional (test composition roots may
|
||||
// skip it) so the null-coalesce is the safe path.
|
||||
_snapshot?.ApplyStalled(evt);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Disposes the tracker and tears down the internal subscriber actor.
|
||||
/// </summary>
|
||||
public void Dispose()
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
if (_subscriber is not null)
|
||||
{
|
||||
// Unsubscribe runs in PostStop on the subscriber actor; Stop is
|
||||
// fire-and-forget but the actor's PostStop hook is guaranteed to
|
||||
// run before its mailbox is collected.
|
||||
_subscriber.Tell(PoisonPill.Instance);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Internal subscriber actor — receives every
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> off the EventStream and
|
||||
/// forwards it into the parent <see cref="SiteAuditTelemetryStalledTracker"/>.
|
||||
/// Unlike <c>DeadLetterMonitorActor</c>, the subscription is registered by
|
||||
/// the tracker constructor BEFORE this actor begins processing messages so
|
||||
/// publishes that arrive between actor creation and PreStart cannot be
|
||||
/// missed. Unsubscribe still runs in <see cref="PostStop"/>.
|
||||
/// </summary>
|
||||
private sealed class StalledChangedSubscriber : ReceiveActor
|
||||
{
|
||||
private readonly SiteAuditTelemetryStalledTracker _parent;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new subscriber actor that forwards events to the given tracker.
|
||||
/// </summary>
|
||||
/// <param name="parent">The parent tracker whose <see cref="Apply"/> method will be called for each event.</param>
|
||||
public StalledChangedSubscriber(SiteAuditTelemetryStalledTracker parent)
|
||||
{
|
||||
_parent = parent;
|
||||
Receive<SiteAuditTelemetryStalledChanged>(evt => _parent.Apply(evt));
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PostStop()
|
||||
{
|
||||
Context.System.EventStream.Unsubscribe(Self, typeof(SiteAuditTelemetryStalledChanged));
|
||||
base.PostStop();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for Audit Log (#23). Bound from the <c>AuditLog</c> section of
|
||||
/// <c>appsettings.json</c>. Defaults reflect the design (alog.md §6, §10): an
|
||||
/// 8 KiB payload-summary cap, a 64 KiB cap on error rows, and a 365-day central
|
||||
/// retention window with monthly partition-switch purge. The default
|
||||
/// header-redact list covers HTTP auth headers; per-target overrides extend
|
||||
/// (never replace) the global redactor set.
|
||||
/// </summary>
|
||||
public sealed class AuditLogOptions
|
||||
{
|
||||
/// <summary>Default payload-summary cap in bytes (default 8 KiB).</summary>
|
||||
public int DefaultCapBytes { get; set; } = 8192;
|
||||
|
||||
/// <summary>Payload-summary cap on error rows in bytes (default 64 KiB).</summary>
|
||||
public int ErrorCapBytes { get; set; } = 65536;
|
||||
|
||||
/// <summary>HTTP headers redacted by default before persistence.</summary>
|
||||
public List<string> HeaderRedactList { get; set; } = new()
|
||||
{
|
||||
"Authorization",
|
||||
"X-Api-Key",
|
||||
"Cookie",
|
||||
"Set-Cookie",
|
||||
};
|
||||
|
||||
/// <summary>Body-content redactors applied globally (regex patterns).</summary>
|
||||
public List<string> GlobalBodyRedactors { get; set; } = new();
|
||||
|
||||
/// <summary>Per-target redaction overrides keyed by target identifier.</summary>
|
||||
public Dictionary<string, PerTargetRedactionOverride> PerTargetOverrides { get; set; } = new();
|
||||
|
||||
/// <summary>Central retention window in days (default 365, range [30, 3650]).</summary>
|
||||
public int RetentionDays { get; set; } = 365;
|
||||
|
||||
/// <summary>
|
||||
/// Per-body byte ceiling applied to <see cref="AuditEvent.RequestSummary"/> and
|
||||
/// <see cref="AuditEvent.ResponseSummary"/> for <see cref="AuditChannel.ApiInbound"/> rows
|
||||
/// (default 1 MiB). The 8 KiB / 64 KiB default/error caps that apply to other channels
|
||||
/// do not apply here — inbound traffic captures verbatim up to this ceiling and only
|
||||
/// then sets <see cref="AuditEvent.PayloadTruncated"/>. See
|
||||
/// <c>docs/plans/2026-05-23-inbound-api-full-response-audit-design.md</c>.
|
||||
/// </summary>
|
||||
public int InboundMaxBytes { get; set; } = 1_048_576;
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
|
||||
/// <summary>
|
||||
/// Validates <see cref="AuditLogOptions"/> on startup. The caps drive payload
|
||||
/// truncation in the M2+ writers, so an unset/zero cap would let arbitrarily
|
||||
/// large blobs into the central <c>AuditLog</c> table. <see cref="AuditLogOptions.ErrorCapBytes"/>
|
||||
/// must be at least as large as <see cref="AuditLogOptions.DefaultCapBytes"/>
|
||||
/// because the error cap is meant to capture <em>more</em> detail than the
|
||||
/// happy-path summary, not less. <see cref="AuditLogOptions.RetentionDays"/> is
|
||||
/// bounded to <c>[30, 3650]</c> to keep purge windows sane: too short would
|
||||
/// drop in-flight investigations, too long would defeat the partition-switch
|
||||
/// purge's purpose.
|
||||
/// </summary>
|
||||
public sealed class AuditLogOptionsValidator : IValidateOptions<AuditLogOptions>
|
||||
{
|
||||
/// <summary>Inclusive lower bound for <see cref="AuditLogOptions.RetentionDays"/>.</summary>
|
||||
public const int MinRetentionDays = 30;
|
||||
|
||||
/// <summary>Inclusive upper bound for <see cref="AuditLogOptions.RetentionDays"/>.</summary>
|
||||
public const int MaxRetentionDays = 3650;
|
||||
|
||||
/// <summary>Inclusive lower bound for <see cref="AuditLogOptions.InboundMaxBytes"/> (8 KiB).</summary>
|
||||
public const int MinInboundMaxBytes = 8_192;
|
||||
|
||||
/// <summary>Inclusive upper bound for <see cref="AuditLogOptions.InboundMaxBytes"/> (16 MiB).</summary>
|
||||
public const int MaxInboundMaxBytes = 16_777_216;
|
||||
|
||||
/// <inheritdoc />
|
||||
public ValidateOptionsResult Validate(string? name, AuditLogOptions options)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(options);
|
||||
|
||||
var failures = new List<string>();
|
||||
|
||||
if (options.DefaultCapBytes <= 0)
|
||||
{
|
||||
failures.Add(
|
||||
$"AuditLog:{nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}) " +
|
||||
"must be > 0; it drives payload-summary truncation in audit writers.");
|
||||
}
|
||||
|
||||
if (options.ErrorCapBytes < options.DefaultCapBytes)
|
||||
{
|
||||
failures.Add(
|
||||
$"AuditLog:{nameof(AuditLogOptions.ErrorCapBytes)} ({options.ErrorCapBytes}) " +
|
||||
$"must be >= {nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}); " +
|
||||
"the error-row cap is intended to capture more detail than the happy-path summary.");
|
||||
}
|
||||
|
||||
if (options.RetentionDays < MinRetentionDays || options.RetentionDays > MaxRetentionDays)
|
||||
{
|
||||
failures.Add(
|
||||
$"AuditLog:{nameof(AuditLogOptions.RetentionDays)} ({options.RetentionDays}) " +
|
||||
$"must be in [{MinRetentionDays}, {MaxRetentionDays}] days.");
|
||||
}
|
||||
|
||||
if (options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes)
|
||||
{
|
||||
failures.Add(
|
||||
$"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
|
||||
$"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
|
||||
}
|
||||
|
||||
return failures.Count == 0
|
||||
? ValidateOptionsResult.Success
|
||||
: ValidateOptionsResult.Fail(failures);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
|
||||
/// <summary>
|
||||
/// Per-target redaction override applied additively on top of
|
||||
/// <see cref="AuditLogOptions.GlobalBodyRedactors"/> and the
|
||||
/// <see cref="AuditLogOptions.DefaultCapBytes"/> / <see cref="AuditLogOptions.ErrorCapBytes"/>
|
||||
/// caps. Targets are identified by the script-facing external-system /
|
||||
/// database / notification-list / inbound-API-key name.
|
||||
/// </summary>
|
||||
public sealed class PerTargetRedactionOverride
|
||||
{
|
||||
/// <summary>Optional payload cap override (bytes); null inherits the global cap.</summary>
|
||||
public int? CapBytes { get; set; }
|
||||
|
||||
/// <summary>Additional body redactor regex patterns (appended to the global list).</summary>
|
||||
public List<string>? AdditionalBodyRedactors { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Opt-in SQL parameter redaction: case-insensitive regex matched against
|
||||
/// each SQL parameter NAME in the M4 <c>AuditingDbCommand</c> RequestSummary
|
||||
/// JSON (<c>{"sql":"...","parameters":{"@name":"value", ...}}</c>); values
|
||||
/// whose name matches are replaced with <c><redacted></c>. Null (the
|
||||
/// default) means parameter values are captured verbatim. Only applied to
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditChannel.DbOutbound"/>
|
||||
/// rows.
|
||||
/// </summary>
|
||||
public string? RedactSqlParamsMatching { get; set; }
|
||||
}
|
||||
@@ -0,0 +1,587 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Text;
|
||||
using System.Text.Encodings.Web;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Nodes;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="IAuditPayloadFilter"/>. Bundle A established the
|
||||
/// truncation backbone; Bundle B chains HTTP header redaction (M5-T3) BEFORE
|
||||
/// truncation so redactors operate on the full payload and the cap then trims
|
||||
/// the redacted result.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Uses <see cref="IOptionsMonitor{TOptions}"/> (not <see cref="IOptions{TOptions}"/>)
|
||||
/// so the M5-T8 hot-reload path sees fresh values without re-resolving the
|
||||
/// singleton. <see cref="Apply"/> reads <see cref="IOptionsMonitor{T}.CurrentValue"/>
|
||||
/// on every call, and the regex cache is keyed by pattern string — patterns
|
||||
/// added via a live config change compile on first use of the next event;
|
||||
/// patterns removed simply stop being looked up. No <c>OnChange</c> subscription
|
||||
/// or explicit cache invalidation is required (the
|
||||
/// <c>AuditLogOptionsBindingTests</c> fixture in <c>ZB.MOM.WW.ScadaBridge.AuditLog.Tests</c>
|
||||
/// pins this behaviour).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// "Error row" = <see cref="AuditEvent.Status"/> NOT IN (<c>Delivered</c>,
|
||||
/// <c>Submitted</c>, <c>Forwarded</c>) — every other status, including the
|
||||
/// non-terminal <c>Attempted</c>, the parked/discarded terminals, and the
|
||||
/// short-circuit <c>Skipped</c>, receives the larger error cap so a verbose
|
||||
/// error body survives.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Apply MUST NOT throw — on internal failure the filter over-redacts by
|
||||
/// returning the input with <see cref="AuditEvent.PayloadTruncated"/> set and
|
||||
/// increments the <c>AuditRedactionFailure</c> health metric via the injected
|
||||
/// <see cref="IAuditRedactionFailureCounter"/>. Each redactor stage runs in
|
||||
/// its own try/catch — a failure in (say) the header redactor still lets the
|
||||
/// SQL parameter redactor and the truncator run on the remaining fields.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Stage order (each runs on every applicable field):
|
||||
/// header redaction → body regex redaction → truncation. The SQL-parameter
|
||||
/// stage piggybacks on the body-redactor path; both run BEFORE truncation so
|
||||
/// the cap trims the redacted result, never bytes the redactor intended to
|
||||
/// hide.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class DefaultAuditPayloadFilter : IAuditPayloadFilter
|
||||
{
|
||||
private const string RedactedMarker = "<redacted>";
|
||||
private const string RedactorErrorMarker = "<redacted: redactor error>";
|
||||
|
||||
/// <summary>
|
||||
/// Per-match regex timeout. Catastrophic-backtracking patterns trip a
|
||||
/// <see cref="RegexMatchTimeoutException"/> when a single match takes
|
||||
/// longer than this; the offending field is then over-redacted with
|
||||
/// <see cref="RedactorErrorMarker"/> and the failure counter is bumped.
|
||||
/// 50 ms is generous for normal patterns yet short enough that the
|
||||
/// audit hot-path isn't held up by a misconfigured regex.
|
||||
/// </summary>
|
||||
private static readonly TimeSpan RegexMatchTimeout = TimeSpan.FromMilliseconds(50);
|
||||
|
||||
/// <summary>
|
||||
/// JSON serializer options used to re-emit redacted summaries. The
|
||||
/// UnsafeRelaxedJsonEscaping encoder is required so the redaction marker
|
||||
/// (which contains <c><</c> / <c>></c>) survives unescaped — the
|
||||
/// header-redaction tests grep for the literal marker, and the downstream
|
||||
/// UI / log readers would rather see <c><redacted></c> than
|
||||
/// <c><redacted></c>. The summaries are persisted to the audit
|
||||
/// table and rendered in trusted-internal contexts only, so the relaxed
|
||||
/// HTML-escaping rules do not introduce an XSS surface.
|
||||
/// </summary>
|
||||
private static readonly JsonSerializerOptions RedactedSummaryJsonOptions = new()
|
||||
{
|
||||
Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
|
||||
};
|
||||
|
||||
private readonly IOptionsMonitor<AuditLogOptions> _options;
|
||||
private readonly ILogger<DefaultAuditPayloadFilter> _logger;
|
||||
private readonly IAuditRedactionFailureCounter _failureCounter;
|
||||
|
||||
/// <summary>
|
||||
/// Compiled-regex cache keyed by pattern string. Lazy population: each
|
||||
/// pattern is compiled on first use and cached forever (the entry's
|
||||
/// <see cref="CompiledRegex"/> carries either the working <see cref="Regex"/>
|
||||
/// or a sentinel marking the pattern as invalid so we don't retry the
|
||||
/// failing compile on every call). ConcurrentDictionary is the right
|
||||
/// thread-safety primitive here because the filter is a DI singleton
|
||||
/// shared across the audit hot-path.
|
||||
/// </summary>
|
||||
private readonly ConcurrentDictionary<string, CompiledRegex> _regexCache = new();
|
||||
|
||||
/// <summary>
|
||||
/// Primary constructor used by DI — pulls the optional redaction-failure
|
||||
/// counter from the container; a NoOp default is registered in
|
||||
/// <see cref="ServiceCollectionExtensions.AddAuditLog"/>.
|
||||
/// </summary>
|
||||
/// <param name="options">Live-reloadable audit log options.</param>
|
||||
/// <param name="logger">Logger for redaction diagnostics.</param>
|
||||
/// <param name="failureCounter">Optional counter incremented when a redaction operation fails; defaults to a no-op.</param>
|
||||
public DefaultAuditPayloadFilter(
|
||||
IOptionsMonitor<AuditLogOptions> options,
|
||||
ILogger<DefaultAuditPayloadFilter> logger,
|
||||
IAuditRedactionFailureCounter? failureCounter = null)
|
||||
{
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_failureCounter = failureCounter ?? new NoOpAuditRedactionFailureCounter();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public AuditEvent Apply(AuditEvent rawEvent)
|
||||
{
|
||||
try
|
||||
{
|
||||
var opts = _options.CurrentValue;
|
||||
// Inbound API gets a dedicated, larger ceiling — request/response bodies are
|
||||
// captured verbatim up to InboundMaxBytes (default 1 MiB) so support can
|
||||
// replay exactly what the caller sent and what we returned. Other channels
|
||||
// keep the global 8 KiB / 64 KiB policy.
|
||||
// See docs/plans/2026-05-23-inbound-api-full-response-audit-design.md.
|
||||
var cap = rawEvent.Channel == AuditChannel.ApiInbound
|
||||
? opts.InboundMaxBytes
|
||||
: (IsErrorStatus(rawEvent.Status) ? opts.ErrorCapBytes : opts.DefaultCapBytes);
|
||||
|
||||
// --- Header-redaction stage (runs BEFORE truncation) ----------
|
||||
var request = RedactHeaders(rawEvent.RequestSummary, opts.HeaderRedactList);
|
||||
var response = RedactHeaders(rawEvent.ResponseSummary, opts.HeaderRedactList);
|
||||
var errorDetail = rawEvent.ErrorDetail;
|
||||
var extra = rawEvent.Extra;
|
||||
|
||||
// --- Body-regex stage (also runs BEFORE truncation) -----------
|
||||
// Resolves the active regex set per event so per-target overrides
|
||||
// bound to AuditEvent.Target are picked up; effectively a no-op
|
||||
// when neither GlobalBodyRedactors nor the per-target additions
|
||||
// are configured.
|
||||
var bodyRegexes = ResolveBodyRegexes(opts, rawEvent.Target);
|
||||
if (bodyRegexes.Count > 0)
|
||||
{
|
||||
request = RedactBody(request, bodyRegexes);
|
||||
response = RedactBody(response, bodyRegexes);
|
||||
errorDetail = RedactBody(errorDetail, bodyRegexes);
|
||||
extra = RedactBody(extra, bodyRegexes);
|
||||
}
|
||||
|
||||
// --- SQL parameter redaction stage (DbOutbound only) ----------
|
||||
// Parses the M4 AuditingDbCommand RequestSummary shape
|
||||
// {"sql":"...","parameters":{...}} and redacts parameter VALUES
|
||||
// whose NAME matches the per-connection regex. Opt-in: no
|
||||
// PerTargetOverrides[connectionName].RedactSqlParamsMatching =>
|
||||
// no-op. Channel-guarded so the same regex can never accidentally
|
||||
// touch an ApiOutbound row.
|
||||
if (rawEvent.Channel == AuditChannel.DbOutbound
|
||||
&& TryGetSqlParamRedactor(opts, rawEvent.Target, out var sqlParamRegex))
|
||||
{
|
||||
request = RedactSqlParameters(request, sqlParamRegex!);
|
||||
}
|
||||
|
||||
// --- Truncation stage -----------------------------------------
|
||||
var truncated = false;
|
||||
request = TruncateField(request, cap, ref truncated);
|
||||
response = TruncateField(response, cap, ref truncated);
|
||||
errorDetail = TruncateField(errorDetail, cap, ref truncated);
|
||||
extra = TruncateField(extra, cap, ref truncated);
|
||||
|
||||
return rawEvent with
|
||||
{
|
||||
RequestSummary = request,
|
||||
ResponseSummary = response,
|
||||
ErrorDetail = errorDetail,
|
||||
Extra = extra,
|
||||
PayloadTruncated = rawEvent.PayloadTruncated || truncated,
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Audit is best-effort: over-redact rather than fail the caller.
|
||||
// The per-stage try/catches above already handle redactor faults
|
||||
// and increment the counter; this catch covers any unexpected
|
||||
// surprise in the surrounding orchestration code.
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"Payload filter failed; returning raw event with PayloadTruncated=true");
|
||||
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
|
||||
return rawEvent with { PayloadTruncated = true };
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Parse <paramref name="json"/> as the documented
|
||||
/// <c>{"headers": {...}, "body": ...}</c> shape and replace values whose
|
||||
/// header NAME (case-insensitive) is in <paramref name="redactList"/> with
|
||||
/// <see cref="RedactedMarker"/>. Re-serialises and returns the result.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// No-op pass-through for inputs that aren't JSON-shaped — emitters that
|
||||
/// have not yet adopted the convention (the M2 site emitters today, which
|
||||
/// leave RequestSummary null on outbound API calls) get a transparent
|
||||
/// pass. If the redactor itself throws, we over-redact the whole field
|
||||
/// with <see cref="RedactorErrorMarker"/> and bump the failure counter.
|
||||
/// </remarks>
|
||||
private string? RedactHeaders(string? json, IList<string> redactList)
|
||||
{
|
||||
if (json is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
// Cheap structural pre-check: only attempt JSON parsing when the input
|
||||
// actually looks like a JSON object. Saves the JsonDocument allocation
|
||||
// on the (very common) non-JSON ErrorDetail / Extra fields.
|
||||
var trimmed = json.AsSpan().TrimStart();
|
||||
if (trimmed.Length == 0 || trimmed[0] != '{')
|
||||
{
|
||||
return json;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
JsonNode? root;
|
||||
try
|
||||
{
|
||||
root = JsonNode.Parse(json);
|
||||
}
|
||||
catch (JsonException)
|
||||
{
|
||||
// Not parseable JSON — leave the field alone (no error, no
|
||||
// redaction). Emitters not yet using the documented shape get
|
||||
// a transparent pass; Bundle C will update them.
|
||||
return json;
|
||||
}
|
||||
|
||||
if (root is not JsonObject obj || obj["headers"] is not JsonObject headers)
|
||||
{
|
||||
// No "headers" object at the top level — nothing to redact.
|
||||
return json;
|
||||
}
|
||||
|
||||
// Build a case-insensitive lookup of the redact list so we can do
|
||||
// one O(1) check per header name without an inner Any() loop.
|
||||
var redactSet = new HashSet<string>(redactList, StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
// Take a snapshot of names first — we cannot mutate while
|
||||
// enumerating the JsonObject.
|
||||
var names = new List<string>(headers.Count);
|
||||
foreach (var kvp in headers)
|
||||
{
|
||||
names.Add(kvp.Key);
|
||||
}
|
||||
foreach (var name in names)
|
||||
{
|
||||
if (redactSet.Contains(name))
|
||||
{
|
||||
headers[name] = JsonValue.Create(RedactedMarker);
|
||||
}
|
||||
}
|
||||
|
||||
return obj.ToJsonString(RedactedSummaryJsonOptions);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"Header redactor faulted; over-redacting field with '{Marker}'",
|
||||
RedactorErrorMarker);
|
||||
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
|
||||
return RedactorErrorMarker;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Combine the global and per-target body-redactor lists for a single
|
||||
/// event, returning the compiled-regex set to apply. Patterns that failed
|
||||
/// compilation are silently skipped — the compile-time failure was logged
|
||||
/// once on first encounter; we never let one bad pattern starve the rest.
|
||||
/// </summary>
|
||||
private IReadOnlyList<Regex> ResolveBodyRegexes(AuditLogOptions opts, string? target)
|
||||
{
|
||||
var hasGlobal = opts.GlobalBodyRedactors is { Count: > 0 };
|
||||
var perTargetAdditions = (target != null
|
||||
&& opts.PerTargetOverrides.TryGetValue(target, out var over)
|
||||
&& over.AdditionalBodyRedactors is { Count: > 0 })
|
||||
? over.AdditionalBodyRedactors
|
||||
: null;
|
||||
|
||||
if (!hasGlobal && perTargetAdditions == null)
|
||||
{
|
||||
return Array.Empty<Regex>();
|
||||
}
|
||||
|
||||
var result = new List<Regex>();
|
||||
if (hasGlobal)
|
||||
{
|
||||
foreach (var pattern in opts.GlobalBodyRedactors)
|
||||
{
|
||||
if (TryGetCompiledRegex(pattern, out var rx))
|
||||
{
|
||||
result.Add(rx!);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (perTargetAdditions != null)
|
||||
{
|
||||
foreach (var pattern in perTargetAdditions)
|
||||
{
|
||||
if (TryGetCompiledRegex(pattern, out var rx))
|
||||
{
|
||||
result.Add(rx!);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resolve a compiled regex from the cache, compiling it on first use.
|
||||
/// Returns <c>false</c> for patterns that are invalid OR whose compile
|
||||
/// took longer than 100 ms (the spec calls catastrophic-backtracking
|
||||
/// guesses at compile time "invalid"); the failure is logged once and
|
||||
/// the sentinel cache entry prevents repeat compile attempts.
|
||||
/// </summary>
|
||||
private bool TryGetCompiledRegex(string pattern, out Regex? regex)
|
||||
{
|
||||
var entry = _regexCache.GetOrAdd(pattern, CompileRegex);
|
||||
regex = entry.Regex;
|
||||
return entry.Regex != null;
|
||||
}
|
||||
|
||||
private CompiledRegex CompileRegex(string pattern)
|
||||
{
|
||||
try
|
||||
{
|
||||
var swStart = System.Diagnostics.Stopwatch.GetTimestamp();
|
||||
var rx = new Regex(pattern, RegexOptions.Compiled, RegexMatchTimeout);
|
||||
var elapsedMs = (System.Diagnostics.Stopwatch.GetTimestamp() - swStart)
|
||||
* 1000d / System.Diagnostics.Stopwatch.Frequency;
|
||||
if (elapsedMs > 100)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Body redactor pattern compiled in {Elapsed}ms (> 100ms cap); rejecting '{Pattern}'",
|
||||
elapsedMs, pattern);
|
||||
return CompiledRegex.Invalid;
|
||||
}
|
||||
return new CompiledRegex(rx);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"Body redactor pattern '{Pattern}' failed to compile; skipping",
|
||||
pattern);
|
||||
return CompiledRegex.Invalid;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Apply each compiled body-redactor regex to <paramref name="value"/> in
|
||||
/// turn, replacing every match with <see cref="RedactedMarker"/>. If any
|
||||
/// single regex match throws (most commonly
|
||||
/// <see cref="RegexMatchTimeoutException"/>) the field is over-redacted
|
||||
/// with <see cref="RedactorErrorMarker"/> and the failure counter is
|
||||
/// incremented — the user-facing action is never aborted.
|
||||
/// </summary>
|
||||
private string? RedactBody(string? value, IReadOnlyList<Regex> regexes)
|
||||
{
|
||||
if (value is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
var current = value;
|
||||
foreach (var rx in regexes)
|
||||
{
|
||||
try
|
||||
{
|
||||
current = rx.Replace(current, RedactedMarker);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"Body redactor '{Pattern}' faulted; over-redacting field with '{Marker}'",
|
||||
rx.ToString(), RedactorErrorMarker);
|
||||
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
|
||||
return RedactorErrorMarker;
|
||||
}
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resolve the per-connection SQL parameter redaction regex for the given
|
||||
/// DbOutbound event target. Target shape (M4 AuditingDbCommand): the
|
||||
/// connection name optionally followed by <c>.<sql-snippet></c> for
|
||||
/// disambiguation; the per-target dictionary is keyed by the connection
|
||||
/// name alone, so we strip the snippet suffix before lookup. Patterns are
|
||||
/// compiled with case-insensitive matching to match the documented
|
||||
/// behaviour.
|
||||
/// </summary>
|
||||
private bool TryGetSqlParamRedactor(AuditLogOptions opts, string? target, out Regex? regex)
|
||||
{
|
||||
regex = null;
|
||||
if (string.IsNullOrEmpty(target))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
var dot = target.IndexOf('.');
|
||||
var connectionKey = dot < 0 ? target : target[..dot];
|
||||
|
||||
if (!opts.PerTargetOverrides.TryGetValue(connectionKey, out var over)
|
||||
|| string.IsNullOrEmpty(over.RedactSqlParamsMatching))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Force case-insensitivity per the spec — even if the operator wrote
|
||||
// the pattern without an IgnoreCase flag. The compile cache key folds
|
||||
// the option to keep the entries unambiguous.
|
||||
var cacheKey = "(?i)" + over.RedactSqlParamsMatching;
|
||||
if (!TryGetCompiledRegex(cacheKey, out regex))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Walk the M4 <c>{"sql":"...","parameters":{...}}</c> RequestSummary
|
||||
/// shape; for each parameter whose NAME matches
|
||||
/// <paramref name="paramNameRegex"/>, replace its value with
|
||||
/// <see cref="RedactedMarker"/>. Re-serialise.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// No-op pass-through when the input isn't parseable JSON, isn't a JSON
|
||||
/// object, or doesn't carry a top-level <c>"parameters"</c> object. On
|
||||
/// any unexpected fault the field is over-redacted with
|
||||
/// <see cref="RedactorErrorMarker"/> and the failure counter is bumped.
|
||||
/// </remarks>
|
||||
private string? RedactSqlParameters(string? json, Regex paramNameRegex)
|
||||
{
|
||||
if (json is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var trimmed = json.AsSpan().TrimStart();
|
||||
if (trimmed.Length == 0 || trimmed[0] != '{')
|
||||
{
|
||||
return json;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
JsonNode? root;
|
||||
try
|
||||
{
|
||||
root = JsonNode.Parse(json);
|
||||
}
|
||||
catch (JsonException)
|
||||
{
|
||||
return json;
|
||||
}
|
||||
|
||||
if (root is not JsonObject obj || obj["parameters"] is not JsonObject parameters)
|
||||
{
|
||||
return json;
|
||||
}
|
||||
|
||||
// Snapshot the names — mutating during enumeration is unsupported.
|
||||
var names = new List<string>(parameters.Count);
|
||||
foreach (var kvp in parameters)
|
||||
{
|
||||
names.Add(kvp.Key);
|
||||
}
|
||||
var anyChanged = false;
|
||||
foreach (var name in names)
|
||||
{
|
||||
bool matched;
|
||||
try
|
||||
{
|
||||
matched = paramNameRegex.IsMatch(name);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"SQL parameter redactor faulted; over-redacting field with '{Marker}'",
|
||||
RedactorErrorMarker);
|
||||
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
|
||||
return RedactorErrorMarker;
|
||||
}
|
||||
if (matched)
|
||||
{
|
||||
parameters[name] = JsonValue.Create(RedactedMarker);
|
||||
anyChanged = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Avoid re-serialising (which would normalise whitespace / order)
|
||||
// when no parameter matched — keeps the on-disk row byte-identical
|
||||
// to the emitter's output on the no-match path.
|
||||
return anyChanged ? obj.ToJsonString(RedactedSummaryJsonOptions) : json;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"SQL parameter redactor faulted; over-redacting field with '{Marker}'",
|
||||
RedactorErrorMarker);
|
||||
try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
|
||||
return RedactorErrorMarker;
|
||||
}
|
||||
}
|
||||
|
||||
private static string? TruncateField(string? value, int cap, ref bool truncated)
|
||||
{
|
||||
if (value is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
var result = TruncateUtf8(value, cap);
|
||||
if (result.Length != value.Length)
|
||||
{
|
||||
truncated = true;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// UTF-8 byte-safe truncation. Encodes the input to UTF-8, walks back from
|
||||
/// the cap position until the byte is NOT a continuation byte
|
||||
/// (<c>byte & 0xC0 == 0x80</c>), and decodes the resulting prefix —
|
||||
/// guaranteeing the returned string never splits a multi-byte sequence.
|
||||
/// </summary>
|
||||
private static string TruncateUtf8(string value, int capBytes)
|
||||
{
|
||||
if (string.IsNullOrEmpty(value))
|
||||
{
|
||||
return value;
|
||||
}
|
||||
var bytes = Encoding.UTF8.GetBytes(value);
|
||||
if (bytes.Length <= capBytes)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
var boundary = capBytes;
|
||||
while (boundary > 0 && (bytes[boundary] & 0xC0) == 0x80)
|
||||
{
|
||||
boundary--;
|
||||
}
|
||||
return Encoding.UTF8.GetString(bytes, 0, boundary);
|
||||
}
|
||||
|
||||
private static bool IsErrorStatus(AuditStatus status) => status switch
|
||||
{
|
||||
AuditStatus.Delivered or AuditStatus.Submitted or AuditStatus.Forwarded => false,
|
||||
_ => true,
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Cache entry for a body-redactor pattern. Carries the working
|
||||
/// <see cref="Regex"/> on the success path, or the
|
||||
/// <see cref="Invalid"/> sentinel for patterns that failed to compile
|
||||
/// (or exceeded the 100 ms compile budget). The sentinel lets us skip
|
||||
/// repeat compile attempts on every event without re-throwing on the
|
||||
/// hot-path.
|
||||
/// </summary>
|
||||
private readonly struct CompiledRegex
|
||||
{
|
||||
public static readonly CompiledRegex Invalid = new(null);
|
||||
|
||||
/// <summary>Gets the compiled <see cref="System.Text.RegularExpressions.Regex"/>, or <c>null</c> when the pattern was invalid.</summary>
|
||||
public Regex? Regex { get; }
|
||||
|
||||
/// <summary>Initializes a new <see cref="CompiledRegex"/> wrapping the given compiled regex instance.</summary>
|
||||
/// <param name="regex">The pre-compiled regex, or <c>null</c> to represent an invalid pattern.</param>
|
||||
public CompiledRegex(Regex? regex) => Regex = regex;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
/// <summary>
|
||||
/// Filters an <see cref="AuditEvent"/> between construction and persistence —
|
||||
/// truncates oversized payload fields, applies header/body/SQL-parameter
|
||||
/// redaction, sets <see cref="AuditEvent.PayloadTruncated"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Pure function: returns a filtered COPY of the input via <c>with</c>
|
||||
/// expressions; never throws (over-redacts on internal failure and increments
|
||||
/// the <c>AuditRedactionFailure</c> health metric).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Wired in M5 between event construction and the writer chain
|
||||
/// (<c>FallbackAuditWriter.WriteAsync</c>, <c>CentralAuditWriter.WriteAsync</c>,
|
||||
/// and the <c>AuditLogIngestActor</c> handlers).
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public interface IAuditPayloadFilter
|
||||
{
|
||||
/// <summary>
|
||||
/// Apply the configured truncation + redaction policy to <paramref name="rawEvent"/>
|
||||
/// and return a filtered copy. MUST NOT throw — on internal failure, over-redact
|
||||
/// and surface the failure via the audit-redaction-failure health metric.
|
||||
/// </summary>
|
||||
/// <param name="rawEvent">The unfiltered audit event to process.</param>
|
||||
AuditEvent Apply(AuditEvent rawEvent);
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
/// <summary>
|
||||
/// Counter sink invoked by <see cref="DefaultAuditPayloadFilter"/> every time
|
||||
/// a redactor (header / body regex / SQL parameter) throws and the filter has
|
||||
/// to over-redact the offending field with the
|
||||
/// <c><redacted: redactor error></c> marker. Bundle C bridges this into
|
||||
/// the Site Health Monitoring report payload as <c>AuditRedactionFailure</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Redaction failures must NEVER abort the user-facing action (alog.md §7) —
|
||||
/// the filter over-redacts the field and surfaces the failure via this counter
|
||||
/// instead. A NoOp default is the correct safe fallback while the health
|
||||
/// metric is being wired in.
|
||||
/// </remarks>
|
||||
public interface IAuditRedactionFailureCounter
|
||||
{
|
||||
/// <summary>Increment the audit-redaction failure counter by one.</summary>
|
||||
void Increment();
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="IAuditRedactionFailureCounter"/> binding used when the
|
||||
/// Site Health Monitoring bridge has not been wired yet. Bundle C replaces
|
||||
/// this registration with the real counter that surfaces in the site health
|
||||
/// report payload as <c>AuditRedactionFailure</c>.
|
||||
/// </summary>
|
||||
public sealed class NoOpAuditRedactionFailureCounter : IAuditRedactionFailureCounter
|
||||
{
|
||||
/// <inheritdoc/>
|
||||
public void Increment()
|
||||
{
|
||||
// Intentionally empty — Bundle C overrides this binding with the real
|
||||
// health-metric counter.
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
|
||||
/// <summary>
|
||||
/// AuditLog-008: minimal always-safe fallback filter used by the writer chain
|
||||
/// when no <see cref="IAuditPayloadFilter"/> is injected (test composition
|
||||
/// roots, future composition roots that bypass <c>AddAuditLog</c>). Performs
|
||||
/// HTTP header redaction for the always-sensitive defaults
|
||||
/// (Authorization, X-Api-Key, Cookie, Set-Cookie) so a fixture that wires a
|
||||
/// real <see cref="AuditEvent.RequestSummary"/> never persists those headers
|
||||
/// in cleartext. Does NOT perform body-regex redaction, SQL-parameter
|
||||
/// redaction, or truncation — those stages need
|
||||
/// <see cref="DefaultAuditPayloadFilter"/> with live options. The contract is:
|
||||
/// over-redact safely, never throw, never miss a header that's on the
|
||||
/// default sensitive list.
|
||||
/// </summary>
|
||||
public sealed class SafeDefaultAuditPayloadFilter : IAuditPayloadFilter
|
||||
{
|
||||
/// <summary>Singleton instance — the filter is stateless and side-effect-free.</summary>
|
||||
public static SafeDefaultAuditPayloadFilter Instance { get; } = new SafeDefaultAuditPayloadFilter();
|
||||
|
||||
private static readonly string[] DefaultHeaderRedactList =
|
||||
{
|
||||
"Authorization",
|
||||
"X-Api-Key",
|
||||
"Cookie",
|
||||
"Set-Cookie",
|
||||
};
|
||||
|
||||
private static readonly Regex HeaderRegex = new(
|
||||
@"(?<name>[A-Za-z][A-Za-z0-9\-_]*)\s*:\s*(?<value>[^\r\n]*)",
|
||||
RegexOptions.Compiled | RegexOptions.IgnoreCase);
|
||||
|
||||
private SafeDefaultAuditPayloadFilter() { }
|
||||
|
||||
/// <inheritdoc />
|
||||
public AuditEvent Apply(AuditEvent rawEvent)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(rawEvent);
|
||||
try
|
||||
{
|
||||
return rawEvent with
|
||||
{
|
||||
RequestSummary = RedactHeaders(rawEvent.RequestSummary),
|
||||
ResponseSummary = RedactHeaders(rawEvent.ResponseSummary),
|
||||
};
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Over-redact: drop both summaries entirely so a malformed parse
|
||||
// path never leaks the original. The contract is "never throw."
|
||||
return rawEvent with
|
||||
{
|
||||
RequestSummary = "[redacted by SafeDefaultAuditPayloadFilter]",
|
||||
ResponseSummary = "[redacted by SafeDefaultAuditPayloadFilter]",
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private static string? RedactHeaders(string? summary)
|
||||
{
|
||||
if (string.IsNullOrEmpty(summary)) return summary;
|
||||
|
||||
return HeaderRegex.Replace(summary, m =>
|
||||
{
|
||||
var name = m.Groups["name"].Value;
|
||||
foreach (var sensitive in DefaultHeaderRedactList)
|
||||
{
|
||||
if (string.Equals(name, sensitive, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
return $"{name}: [REDACTED]";
|
||||
}
|
||||
}
|
||||
return m.Value;
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,362 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog;
|
||||
|
||||
/// <summary>
|
||||
/// Composition root for the Audit Log (#23) component.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// M1 registered <see cref="AuditLogOptions"/> + the validator. M2 Bundle E
|
||||
/// extends the surface with the site-side writer chain
|
||||
/// (<see cref="SqliteAuditWriter"/> + <see cref="RingBufferFallback"/> +
|
||||
/// <see cref="FallbackAuditWriter"/>) and the telemetry collaborators
|
||||
/// (<see cref="ISiteAuditQueue"/>, <see cref="ISiteStreamAuditClient"/>,
|
||||
/// <see cref="IAuditWriteFailureCounter"/>, <see cref="SiteAuditTelemetryOptions"/>,
|
||||
/// <see cref="SqliteAuditWriterOptions"/>).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Audit Log (#23) sits alongside Notification Outbox (#21) and Site Call
|
||||
/// Audit (#22). <c>IAuditLogRepository</c> is registered by
|
||||
/// <c>ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.ServiceCollectionExtensions.AddConfigurationDatabase</c>,
|
||||
/// so the caller (the Host on the central node) must also call that.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public static class ServiceCollectionExtensions
|
||||
{
|
||||
/// <summary>Configuration section bound to <see cref="AuditLogOptions"/>.</summary>
|
||||
public const string ConfigSectionName = "AuditLog";
|
||||
|
||||
/// <summary>Configuration section bound to <see cref="SqliteAuditWriterOptions"/>.</summary>
|
||||
public const string SiteWriterSectionName = "AuditLog:SiteWriter";
|
||||
|
||||
/// <summary>Configuration section bound to <see cref="SiteAuditTelemetryOptions"/>.</summary>
|
||||
public const string SiteTelemetrySectionName = "AuditLog:SiteTelemetry";
|
||||
|
||||
/// <summary>Configuration section bound to <see cref="AuditLogPartitionMaintenanceOptions"/>.</summary>
|
||||
public const string PartitionMaintenanceSectionName = "AuditLog:PartitionMaintenance";
|
||||
|
||||
/// <summary>
|
||||
/// Registers the Audit Log (#23) component services: options, the site
|
||||
/// SQLite writer chain (primary + ring fallback + failure-counter sink),
|
||||
/// and the site-→central telemetry collaborators. Idempotent re-registration
|
||||
/// is not supported; call this exactly once per <see cref="IServiceCollection"/>.
|
||||
/// </summary>
|
||||
/// <param name="services">The service collection to register into.</param>
|
||||
/// <param name="config">Application configuration used to bind <see cref="AuditLogOptions"/> and related options sections.</param>
|
||||
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
|
||||
public static IServiceCollection AddAuditLog(this IServiceCollection services, IConfiguration config)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
ArgumentNullException.ThrowIfNull(config);
|
||||
|
||||
// M1: top-level AuditLogOptions + validator (redaction policy, payload caps, etc.).
|
||||
services.AddOptions<AuditLogOptions>()
|
||||
.Bind(config.GetSection(ConfigSectionName))
|
||||
.ValidateOnStart();
|
||||
services.AddSingleton<IValidateOptions<AuditLogOptions>, AuditLogOptionsValidator>();
|
||||
|
||||
// M5 Bundle A: payload filter — truncates oversized RequestSummary /
|
||||
// ResponseSummary / ErrorDetail / Extra fields between event
|
||||
// construction and persistence. Bundle B layers header / body /
|
||||
// SQL-parameter redaction onto the same singleton; Bundle C wires it
|
||||
// into the FallbackAuditWriter / CentralAuditWriter / IngestActor
|
||||
// paths. Singleton — the filter is stateless and the IOptionsMonitor
|
||||
// dependency picks up M5-T8 hot reloads on its own.
|
||||
services.AddSingleton<IAuditPayloadFilter, DefaultAuditPayloadFilter>();
|
||||
|
||||
// M5 Bundle B: per-stage redactor-failure counter. NoOp default;
|
||||
// Bundle C replaces this binding with the Site Health Monitoring
|
||||
// bridge that surfaces failures as AuditRedactionFailure on the site
|
||||
// health report.
|
||||
services.TryAddSingleton<IAuditRedactionFailureCounter, NoOpAuditRedactionFailureCounter>();
|
||||
|
||||
// M2 Bundle E: site writer + telemetry options bindings.
|
||||
// BindConfiguration is not used because the configuration root supplied
|
||||
// by the caller may not be the application root — we go through the
|
||||
// section explicitly so a partial IConfiguration (e.g. a test stub
|
||||
// anchored on the AuditLog section's parent) still works.
|
||||
services.AddOptions<SqliteAuditWriterOptions>()
|
||||
.Bind(config.GetSection(SiteWriterSectionName));
|
||||
services.AddOptions<SiteAuditTelemetryOptions>()
|
||||
.Bind(config.GetSection(SiteTelemetrySectionName));
|
||||
|
||||
// SqliteAuditWriter is a singleton with a single owned SqliteConnection
|
||||
// and a background writer Task; multiple instances would race on the
|
||||
// same file. Registered concretely so the ISiteAuditQueue + IAuditWriter
|
||||
// forwards below resolve to the same instance — the actor must observe
|
||||
// the writes made via the hot-path interface.
|
||||
services.AddSingleton<SqliteAuditWriter>();
|
||||
services.AddSingleton<ISiteAuditQueue>(sp => sp.GetRequiredService<SqliteAuditWriter>());
|
||||
|
||||
// RingBufferFallback: drop-oldest in-memory ring used by
|
||||
// FallbackAuditWriter when the primary SQLite writer throws. Default
|
||||
// capacity is fine for M2 (1024).
|
||||
services.AddSingleton<RingBufferFallback>();
|
||||
|
||||
// IAuditWriteFailureCounter: NoOp default. Bundle G overrides this
|
||||
// binding with the real Site Health Monitoring counter. Registered
|
||||
// before FallbackAuditWriter so the factory can resolve it.
|
||||
services.AddSingleton<IAuditWriteFailureCounter, NoOpAuditWriteFailureCounter>();
|
||||
|
||||
// The script-thread surface is FallbackAuditWriter (primary + ring +
|
||||
// counter), not the raw SqliteAuditWriter — primary failures must NEVER
|
||||
// abort the user-facing action.
|
||||
// Bundle C (M5-T6): the IAuditPayloadFilter singleton above is wired
|
||||
// through the factory so every event written through this surface is
|
||||
// truncated + redacted before it hits SQLite (and the ring on
|
||||
// failure).
|
||||
services.AddSingleton<IAuditWriter>(sp => new FallbackAuditWriter(
|
||||
primary: sp.GetRequiredService<SqliteAuditWriter>(),
|
||||
ring: sp.GetRequiredService<RingBufferFallback>(),
|
||||
failureCounter: sp.GetRequiredService<IAuditWriteFailureCounter>(),
|
||||
logger: sp.GetRequiredService<ILogger<FallbackAuditWriter>>(),
|
||||
filter: sp.GetRequiredService<IAuditPayloadFilter>()));
|
||||
|
||||
// ISiteStreamAuditClient: NoOp default. This binding remains correct for
|
||||
// central/test composition roots that have no SiteCommunicationActor.
|
||||
// The real implementation is ClusterClientSiteAuditClient, which pushes
|
||||
// audit telemetry to central over Akka ClusterClient via the site's
|
||||
// SiteCommunicationActor — the Host wires it directly into the
|
||||
// SiteAuditTelemetryActor's Props.Create call for site roles (it cannot
|
||||
// be a DI singleton because it needs the SiteCommunicationActor IActorRef,
|
||||
// created during Akka bootstrap, not at DI-composition time).
|
||||
services.AddSingleton<ISiteStreamAuditClient, NoOpSiteStreamAuditClient>();
|
||||
|
||||
// M3 Bundle F: site-side dual emitter for cached-call lifecycle
|
||||
// telemetry. ScriptRuntimeContext.ExternalSystem.CachedCall /
|
||||
// Database.CachedWrite resolves this through DI and pushes one combined
|
||||
// packet per lifecycle event; the forwarder writes the audit half
|
||||
// through IAuditWriter and the operational half through the
|
||||
// IOperationTrackingStore. The audit writer is always wired (the M2
|
||||
// chain above); the operational tracking store is SITE-ONLY (registered
|
||||
// by ZB.MOM.WW.ScadaBridge.SiteRuntime). On a Central composition root the tracking
|
||||
// store has no registration, so the factory resolves it with GetService
|
||||
// (returning null) — the forwarder degrades to "audit-only" emission,
|
||||
// mirroring the lazy IAuditWriter chain established in M2.
|
||||
services.AddSingleton<ICachedCallTelemetryForwarder>(sp =>
|
||||
new CachedCallTelemetryForwarder(
|
||||
sp.GetRequiredService<IAuditWriter>(),
|
||||
sp.GetService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.IOperationTrackingStore>(),
|
||||
sp.GetRequiredService<ILogger<CachedCallTelemetryForwarder>>(),
|
||||
// AuditLog-007: INodeIdentityProvider is now required across
|
||||
// every consumer in AddAuditLog. The Host's
|
||||
// SiteServiceRegistration registers it as a singleton on both
|
||||
// site and central paths (InboundAPI-022 / Host registration
|
||||
// sweep), and the AddAuditLogTests fixture provides a
|
||||
// FakeNodeIdentityProvider; a silent GetService() that
|
||||
// returned null would mask a future composition root that
|
||||
// forgot to register the provider.
|
||||
sp.GetRequiredService<INodeIdentityProvider>()));
|
||||
|
||||
// M3 Bundle F: bridge the store-and-forward retry-loop observer hook
|
||||
// to the cached-call forwarder so per-attempt + terminal telemetry
|
||||
// emitted from the S&F retry sweep lands on the same SQLite hot-path
|
||||
// as the script-thread CachedSubmit row. Registered as a singleton
|
||||
// and also bound to ICachedCallLifecycleObserver so AddStoreAndForward
|
||||
// can resolve it through DI (Bundle F StoreAndForward wiring change).
|
||||
// SourceNode-stamping (Task 14): factory-resolved so the
|
||||
// INodeIdentityProvider singleton can be threaded through — the
|
||||
// bridge stamps SiteCallOperational.SourceNode from
|
||||
// INodeIdentityProvider.NodeName on every cached-call lifecycle row.
|
||||
// AuditLog-007: the provider is resolved with GetRequiredService —
|
||||
// SiteServiceRegistration.BindSharedOptions registers it on both
|
||||
// site and central paths, so a missing registration is a
|
||||
// composition-root bug, not a silent null-SourceNode degradation.
|
||||
services.AddSingleton<CachedCallLifecycleBridge>(sp => new CachedCallLifecycleBridge(
|
||||
sp.GetRequiredService<ICachedCallTelemetryForwarder>(),
|
||||
sp.GetRequiredService<ILogger<CachedCallLifecycleBridge>>(),
|
||||
// AuditLog-007: required, matches the other consumers in this
|
||||
// composition root — the provider is always registered by
|
||||
// SiteServiceRegistration.
|
||||
sp.GetRequiredService<INodeIdentityProvider>()));
|
||||
services.AddSingleton<ICachedCallLifecycleObserver>(
|
||||
sp => sp.GetRequiredService<CachedCallLifecycleBridge>());
|
||||
|
||||
// M6 Bundle E (T8): central audit-write failure counter — NoOp default
|
||||
// for site/test composition roots that don't wire the central health
|
||||
// snapshot. AddAuditLogCentralMaintenance below replaces this binding
|
||||
// with the AuditCentralHealthSnapshot implementation so increments
|
||||
// surface on the central dashboard.
|
||||
services.TryAddSingleton<ICentralAuditWriteFailureCounter, NoOpCentralAuditWriteFailureCounter>();
|
||||
|
||||
// M4 Bundle B: central direct-write audit writer used by
|
||||
// NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to
|
||||
// emit AuditLog rows that originate ON central, not via site telemetry.
|
||||
// Singleton — the writer is stateless; its per-call scope opens a fresh
|
||||
// IAuditLogRepository (a SCOPED EF Core service registered by
|
||||
// ZB.MOM.WW.ScadaBridge.ConfigurationDatabase). The interface (ICentralAuditWriter)
|
||||
// is intentionally distinct from IAuditWriter so site composition roots
|
||||
// do not accidentally bind it; central composition roots that include
|
||||
// AddConfigurationDatabase get a working implementation transparently.
|
||||
// Bundle C (M5-T6): wire the IAuditPayloadFilter into the factory so
|
||||
// NotificationOutboxActor + Inbound API rows are truncated + redacted
|
||||
// before they hit MS SQL.
|
||||
// M6 Bundle E (T8): also wire the ICentralAuditWriteFailureCounter
|
||||
// so swallowed repo throws bump the central health counter.
|
||||
services.AddSingleton<ICentralAuditWriter>(sp => new CentralAuditWriter(
|
||||
sp,
|
||||
sp.GetRequiredService<ILogger<CentralAuditWriter>>(),
|
||||
sp.GetRequiredService<IAuditPayloadFilter>(),
|
||||
sp.GetRequiredService<ICentralAuditWriteFailureCounter>(),
|
||||
// SourceNode-stamping (Task 12): wire the local node identity so
|
||||
// central-origin rows (Notification Outbox dispatch, Inbound API)
|
||||
// carry the writing node's identifier when the caller hasn't
|
||||
// already supplied one. GetRequiredService — the production
|
||||
// composition root in SiteServiceRegistration registers the
|
||||
// provider as a singleton on both site and central paths.
|
||||
sp.GetRequiredService<INodeIdentityProvider>()));
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M2 Bundle G + M5 Bundle C — swap the default
|
||||
/// <see cref="NoOpAuditWriteFailureCounter"/> and
|
||||
/// <see cref="NoOpAuditRedactionFailureCounter"/> registrations for the
|
||||
/// real <see cref="HealthMetricsAuditWriteFailureCounter"/> /
|
||||
/// <see cref="HealthMetricsAuditRedactionFailureCounter"/> bridges so the
|
||||
/// FallbackAuditWriter primary-failure counter AND the
|
||||
/// DefaultAuditPayloadFilter redactor-failure counter both surface in the
|
||||
/// site health report payload as
|
||||
/// <c>SiteHealthReport.SiteAuditWriteFailures</c> +
|
||||
/// <c>SiteHealthReport.AuditRedactionFailure</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Must be called AFTER both <see cref="AddAuditLog"/> (registers the
|
||||
/// NoOp defaults this method replaces) and
|
||||
/// <c>ZB.MOM.WW.ScadaBridge.HealthMonitoring.ServiceCollectionExtensions.AddHealthMonitoring</c>
|
||||
/// or <c>AddSiteHealthMonitoring</c> (registers the
|
||||
/// <see cref="ISiteHealthCollector"/> the bridges depend on). Resolving
|
||||
/// <see cref="IAuditWriteFailureCounter"/> or
|
||||
/// <see cref="IAuditRedactionFailureCounter"/> without the latter throws
|
||||
/// <see cref="InvalidOperationException"/> at <c>GetRequiredService</c>
|
||||
/// time — by design, since a silent NoOp would mask a misconfiguration.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Idempotent — a sentinel check on the
|
||||
/// <see cref="SiteAuditBacklogReporter"/> hosted-service descriptor
|
||||
/// short-circuits subsequent calls so the hosted service is not
|
||||
/// double-registered (AddHostedService has no TryAdd variant).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Site-side only for M5: the central composition root keeps the NoOp
|
||||
/// defaults; the central health-metric surface that would expose
|
||||
/// <c>AuditRedactionFailure</c> next to the existing central counters
|
||||
/// ships in M6.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <param name="services">The service collection to register into.</param>
|
||||
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
|
||||
public static IServiceCollection AddAuditLogHealthMetricsBridge(this IServiceCollection services)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
|
||||
// AuditLog-011: guard against double-registration. AddHostedService is
|
||||
// additive (no TryAdd variant) so a second call without this sentinel
|
||||
// would spin up a second SiteAuditBacklogReporter, doubling the 30 s
|
||||
// SQL probe rate and racing on the ISiteHealthCollector snapshot. The
|
||||
// SiteAuditBacklogReporter descriptor is the discriminator: it's only
|
||||
// registered by this helper, so its presence proves the bridge has
|
||||
// already been wired.
|
||||
if (services.Any(d => d.ImplementationType == typeof(SiteAuditBacklogReporter)))
|
||||
{
|
||||
return services;
|
||||
}
|
||||
|
||||
services.Replace(
|
||||
ServiceDescriptor.Singleton<IAuditWriteFailureCounter, HealthMetricsAuditWriteFailureCounter>());
|
||||
services.Replace(
|
||||
ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, HealthMetricsAuditRedactionFailureCounter>());
|
||||
// M6 Bundle E (T6): the site-side backlog reporter polls the
|
||||
// SqliteAuditWriter every 30 s and pushes the snapshot into the
|
||||
// collector so the next SiteHealthReport carries a fresh
|
||||
// SiteAuditBacklog field. Registered alongside the other site-only
|
||||
// metric bridges so AddAuditLog (which runs on central too) stays
|
||||
// free of hosted-service registrations that would resolve a missing
|
||||
// ISiteHealthCollector on central.
|
||||
services.AddHostedService<SiteAuditBacklogReporter>();
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6-T5 Bundle D — central-only registration for the
|
||||
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service plus
|
||||
/// its <see cref="AuditLogPartitionMaintenanceOptions"/> binding. Must be
|
||||
/// called from the Central role's composition root (not from a site
|
||||
/// composition root); the underlying <c>IPartitionMaintenance</c>
|
||||
/// implementation is registered by <c>AddConfigurationDatabase</c> and
|
||||
/// only exists on the central node.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Separated from <see cref="AddAuditLog"/> because <c>AddAuditLog</c> is
|
||||
/// also invoked from site composition roots — silently starting a
|
||||
/// hosted service that resolves an unregistered dependency on a site
|
||||
/// would fail every tick. Keeping the central-only registration in its
|
||||
/// own helper preserves the "every <c>Add*</c> call is safe to issue
|
||||
/// from any composition root" invariant.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <param name="services">The service collection to register into.</param>
|
||||
/// <param name="config">Application configuration used to bind partition maintenance options.</param>
|
||||
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
|
||||
public static IServiceCollection AddAuditLogCentralMaintenance(
|
||||
this IServiceCollection services,
|
||||
IConfiguration config)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
ArgumentNullException.ThrowIfNull(config);
|
||||
|
||||
services.AddOptions<AuditLogPartitionMaintenanceOptions>()
|
||||
.Bind(config.GetSection(PartitionMaintenanceSectionName));
|
||||
services.AddHostedService<AuditLogPartitionMaintenanceService>();
|
||||
|
||||
// M6 Bundle E (T8 + T9): central health snapshot — a single object
|
||||
// that owns the CentralAuditWriteFailures + AuditRedactionFailure
|
||||
// Interlocked counters AND surfaces them on
|
||||
// IAuditCentralHealthSnapshot. The same instance is bound to BOTH
|
||||
// writer-side interfaces (ICentralAuditWriteFailureCounter +
|
||||
// IAuditRedactionFailureCounter) so every central-side increment
|
||||
// routes into the shared counters; site nodes keep their existing
|
||||
// Site bridges (registered by AddAuditLogHealthMetricsBridge) so
|
||||
// the same counter type does not shadow the site-side metric.
|
||||
// The snapshot itself has no actor-system dependency — the
|
||||
// per-site stalled latch is fed by SiteAuditTelemetryStalledTracker
|
||||
// which the Akka bootstrap wires up after ActorSystem.Create returns
|
||||
// (the tracker is NOT registered here because its construction
|
||||
// requires ActorSystem, which is not a DI-resolvable singleton).
|
||||
services.AddSingleton<AuditCentralHealthSnapshot>();
|
||||
services.AddSingleton<IAuditCentralHealthSnapshot>(
|
||||
sp => sp.GetRequiredService<AuditCentralHealthSnapshot>());
|
||||
services.Replace(ServiceDescriptor.Singleton<ICentralAuditWriteFailureCounter>(
|
||||
sp => sp.GetRequiredService<AuditCentralHealthSnapshot>()));
|
||||
// M6 Bundle E (T9): override the NoOp IAuditRedactionFailureCounter
|
||||
// (registered by AddAuditLog) with the CentralAuditRedactionFailureCounter
|
||||
// bridge so payload-filter throws on CentralAuditWriter /
|
||||
// AuditLogIngestActor paths surface on the central dashboard. The
|
||||
// bridge is a thin wrapper around the AuditCentralHealthSnapshot
|
||||
// singleton so all central redactor failures route into the same
|
||||
// counter as CentralAuditWriteFailures. The site composition root
|
||||
// overrides this binding AGAIN via AddAuditLogHealthMetricsBridge —
|
||||
// central nodes do not call that bridge, so this is the final
|
||||
// binding on a central host. Mirrors the M5 Bundle C
|
||||
// HealthMetricsAuditRedactionFailureCounter shape one-for-one.
|
||||
services.Replace(ServiceDescriptor.Singleton<IAuditRedactionFailureCounter,
|
||||
CentralAuditRedactionFailureCounter>());
|
||||
|
||||
return services;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,165 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Composes the primary <see cref="SqliteAuditWriter"/> with a drop-oldest
|
||||
/// <see cref="RingBufferFallback"/>. Audit writes are best-effort by contract
|
||||
/// (see <see cref="IAuditWriter"/>) — a primary failure must NEVER bubble out
|
||||
/// to the calling script. Failed events are stashed in the ring; on the next
|
||||
/// successful primary write the ring is drained back through the primary in
|
||||
/// FIFO order.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Each primary failure increments <see cref="IAuditWriteFailureCounter"/> so
|
||||
/// Site Health Monitoring can surface a sustained outage as
|
||||
/// <c>SiteAuditWriteFailures</c> (Bundle G).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Errors raised by the ring drain on recovery are logged and silently dropped
|
||||
/// so we don't loop the failure mode — the trigger event itself succeeded, and
|
||||
/// retrying the drain on the NEXT successful write is the recovery path.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class FallbackAuditWriter : IAuditWriter
|
||||
{
|
||||
private readonly IAuditWriter _primary;
|
||||
private readonly RingBufferFallback _ring;
|
||||
private readonly IAuditWriteFailureCounter _failureCounter;
|
||||
private readonly ILogger<FallbackAuditWriter> _logger;
|
||||
private readonly IAuditPayloadFilter _filter;
|
||||
private readonly SemaphoreSlim _drainGate = new(1, 1);
|
||||
|
||||
/// <summary>
|
||||
/// Bundle C (M5-T6) wires the singleton <see cref="IAuditPayloadFilter"/>
|
||||
/// here so every event written via the site hot path is truncated +
|
||||
/// header/body/SQL-param redacted before it hits both the primary SQLite
|
||||
/// writer AND the ring fallback. The parameter is optional (defaults to
|
||||
/// no filtering) so the long tail of test composition roots that don't
|
||||
/// care about the filter need no change — the production
|
||||
/// <see cref="ServiceCollectionExtensions.AddAuditLog"/> registration
|
||||
/// always passes the real filter through.
|
||||
/// </summary>
|
||||
/// <param name="primary">The primary audit writer (typically the SQLite writer).</param>
|
||||
/// <param name="ring">Drop-oldest ring buffer used to stash events when the primary fails.</param>
|
||||
/// <param name="failureCounter">Counter incremented on each primary failure for health reporting.</param>
|
||||
/// <param name="logger">Logger for diagnostics.</param>
|
||||
/// <param name="filter">Optional payload filter applied before writing; null means no filtering.</param>
|
||||
public FallbackAuditWriter(
|
||||
IAuditWriter primary,
|
||||
RingBufferFallback ring,
|
||||
IAuditWriteFailureCounter failureCounter,
|
||||
ILogger<FallbackAuditWriter> logger,
|
||||
IAuditPayloadFilter? filter = null)
|
||||
{
|
||||
_primary = primary ?? throw new ArgumentNullException(nameof(primary));
|
||||
_ring = ring ?? throw new ArgumentNullException(nameof(ring));
|
||||
_failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
// AuditLog-008: never default to a null filter — over-redact instead.
|
||||
// SafeDefaultAuditPayloadFilter.Instance performs HTTP header
|
||||
// redaction with the hard-coded sensitive defaults (Authorization,
|
||||
// X-Api-Key, Cookie, Set-Cookie) so a test composition root that
|
||||
// doesn't bind the real options never persists those headers
|
||||
// verbatim. The real DefaultAuditPayloadFilter (truncation + body /
|
||||
// SQL-param redaction) is wired by AddAuditLog and takes precedence.
|
||||
_filter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(evt);
|
||||
|
||||
// Filter once, up-front. The filtered event flows BOTH to the primary
|
||||
// and (on failure) to the ring buffer — so a primary outage that
|
||||
// drains later still hands the SqliteAuditWriter a row that has
|
||||
// already been truncated and redacted. The filter contract is
|
||||
// "MUST NOT throw". AuditLog-008: _filter is now non-null (defaults
|
||||
// to SafeDefaultAuditPayloadFilter so header redaction is always
|
||||
// applied even in composition roots that don't wire the real filter).
|
||||
var filtered = _filter.Apply(evt);
|
||||
|
||||
try
|
||||
{
|
||||
await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Primary down: record the failure, stash in the ring, return
|
||||
// success to the caller. Audit-write failures NEVER abort the
|
||||
// user-facing action (alog.md §7). DO NOT attempt the ring drain
|
||||
// here — primary is throwing, draining would just scramble FIFO
|
||||
// order across re-enqueues.
|
||||
_failureCounter.Increment();
|
||||
_logger.LogWarning(ex,
|
||||
"Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
|
||||
filtered.EventId);
|
||||
// Ring stores the filtered copy so the eventual drain replays a
|
||||
// payload that has already been capped/redacted — no second
|
||||
// filter pass needed on recovery, and no risk of the ring
|
||||
// holding the raw oversized blob in memory.
|
||||
_ring.TryEnqueue(filtered);
|
||||
return;
|
||||
}
|
||||
|
||||
// Primary succeeded — opportunistically drain anything that piled up
|
||||
// in the ring during the outage. Best-effort: a failure during the
|
||||
// drain re-enqueues the popped event and is logged; the next
|
||||
// successful write will retry. Drain order in the audit log is
|
||||
// therefore: <triggering event>, <backlog FIFO>.
|
||||
if (_ring.Count > 0)
|
||||
{
|
||||
await TryDrainRingAsync(ct).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task TryDrainRingAsync(CancellationToken ct)
|
||||
{
|
||||
// Serialise drains so two concurrent recoveries don't double-replay.
|
||||
if (!await _drainGate.WaitAsync(0, ct).ConfigureAwait(false))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
// Pull only what is currently buffered; do NOT wait for new events.
|
||||
// We iterate with a snapshot of Count so we never starve under
|
||||
// concurrent enqueues.
|
||||
var pending = _ring.Count;
|
||||
for (var i = 0; i < pending; i++)
|
||||
{
|
||||
if (!_ring.TryDequeue(out var queued))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await _primary.WriteAsync(queued, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Primary fell over again. Put the event back at the head
|
||||
// of the queue is impossible with Channel<T>; route to the
|
||||
// tail (drop-oldest preserves the most-recent picture).
|
||||
_failureCounter.Increment();
|
||||
_logger.LogWarning(ex,
|
||||
"Ring drain re-throw on EventId {EventId}; re-enqueuing.",
|
||||
queued.EventId);
|
||||
_ring.TryEnqueue(queued);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
_drainGate.Release();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M5 Bundle C — bridges
|
||||
/// <see cref="IAuditRedactionFailureCounter"/> (incremented by
|
||||
/// <see cref="DefaultAuditPayloadFilter"/> every time a header / body / SQL
|
||||
/// parameter redactor stage throws and the filter has to over-redact the
|
||||
/// offending field) into <see cref="ISiteHealthCollector"/> so the count
|
||||
/// surfaces in the site health report payload as
|
||||
/// <c>SiteHealthReport.AuditRedactionFailure</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Registered by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>;
|
||||
/// callers must register <c>AddHealthMonitoring()</c> first so
|
||||
/// <see cref="ISiteHealthCollector"/> resolves. The default <see cref="ServiceCollectionExtensions.AddAuditLog"/>
|
||||
/// registration keeps <see cref="NoOpAuditRedactionFailureCounter"/> for nodes
|
||||
/// where Site Health Monitoring is not wired (the silent-sink contract —
|
||||
/// redaction failures must NEVER abort the user-facing action, alog.md §7).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Mirrors the M2 Bundle G <see cref="HealthMetricsAuditWriteFailureCounter"/>
|
||||
/// shape one-for-one so the two health-metric bridges age together.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Site-side only for M5: the redaction filter also runs on the central
|
||||
/// writers (CentralAuditWriter + AuditLogIngestActor), but the central
|
||||
/// health-metric surface that would expose <c>AuditRedactionFailure</c>
|
||||
/// alongside the existing central counters ships in M6. Until then, the
|
||||
/// central composition root keeps the NoOp default — the redactions still
|
||||
/// happen, they just don't get counted into a health report.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class HealthMetricsAuditRedactionFailureCounter : IAuditRedactionFailureCounter
|
||||
{
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
|
||||
/// <summary>Initializes the counter with the site health collector it bridges into.</summary>
|
||||
/// <param name="collector">The site health collector that receives the incremented redaction-failure count.</param>
|
||||
public HealthMetricsAuditRedactionFailureCounter(ISiteHealthCollector collector)
|
||||
{
|
||||
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public void Increment() => _collector.IncrementAuditRedactionFailure();
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M2 Bundle G — bridges <see cref="IAuditWriteFailureCounter"/>
|
||||
/// (incremented by <see cref="FallbackAuditWriter"/> every time the primary
|
||||
/// SQLite writer throws) into <see cref="ISiteHealthCollector"/> so the count
|
||||
/// surfaces in the site health report payload as
|
||||
/// <c>SiteHealthReport.SiteAuditWriteFailures</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Registered by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>;
|
||||
/// callers must register <c>AddHealthMonitoring()</c> first so
|
||||
/// <see cref="ISiteHealthCollector"/> resolves. The default <see cref="AddAuditLog"/>
|
||||
/// registration keeps <see cref="NoOpAuditWriteFailureCounter"/> for nodes
|
||||
/// where Site Health Monitoring is not wired (the silent-sink contract — audit
|
||||
/// write failures must NEVER abort the user-facing action, alog.md §7).
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class HealthMetricsAuditWriteFailureCounter : IAuditWriteFailureCounter
|
||||
{
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new <see cref="HealthMetricsAuditWriteFailureCounter"/> backed by the given health collector.
|
||||
/// </summary>
|
||||
/// <param name="collector">The site health collector to increment on each audit write failure.</param>
|
||||
public HealthMetricsAuditWriteFailureCounter(ISiteHealthCollector collector)
|
||||
{
|
||||
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public void Increment() => _collector.IncrementSiteAuditWriteFailures();
|
||||
}
|
||||
@@ -0,0 +1,14 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Lightweight counter sink invoked by <see cref="FallbackAuditWriter"/> every
|
||||
/// time the primary <see cref="SqliteAuditWriter"/> throws on an audit write.
|
||||
/// Bundle G (M2-T11) implements this as a thread-safe Interlocked counter
|
||||
/// bridged into the Site Health Monitoring report payload as
|
||||
/// <c>SiteAuditWriteFailures</c>.
|
||||
/// </summary>
|
||||
public interface IAuditWriteFailureCounter
|
||||
{
|
||||
/// <summary>Increment the audit-write failure counter by one.</summary>
|
||||
void Increment();
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="IAuditWriteFailureCounter"/> registered by
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.ServiceCollectionExtensions.AddAuditLog"/> on
|
||||
/// every node. Bundle G replaces this binding with a real counter that bridges
|
||||
/// into the Site Health Monitoring report payload as
|
||||
/// <c>SiteAuditWriteFailures</c> — until then,
|
||||
/// <see cref="FallbackAuditWriter"/> emits to a silent sink rather than NRE-ing
|
||||
/// on a null collaborator.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Audit-write failures must NEVER abort the user-facing action (alog.md §7),
|
||||
/// so the counter is best-effort by contract. A NoOp default is the correct
|
||||
/// safe fallback while the health metric is being wired in.
|
||||
/// </remarks>
|
||||
public sealed class NoOpAuditWriteFailureCounter : IAuditWriteFailureCounter
|
||||
{
|
||||
/// <inheritdoc/>
|
||||
public void Increment()
|
||||
{
|
||||
// Intentionally empty. Bundle G overrides this binding with the real
|
||||
// counter once Site Health Monitoring is wired.
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,122 @@
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Threading.Channels;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Drop-oldest in-memory ring buffer used by <see cref="FallbackAuditWriter"/>
|
||||
/// when the primary SQLite writer is throwing. Capacity is fixed at construction
|
||||
/// (default 1024). When full, the oldest event is silently dropped to make room
|
||||
/// for the newest — preserving the most recent picture of activity in the face
|
||||
/// of an extended SQLite outage — and <see cref="RingBufferOverflowed"/> is
|
||||
/// raised so a health counter can record the loss.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Backed by a <see cref="Channel{T}"/> with
|
||||
/// <see cref="BoundedChannelFullMode.DropOldest"/>. The channel doesn't natively
|
||||
/// notify on drop, so this class compares <c>Reader.Count</c> before and after
|
||||
/// each enqueue: any time we hit capacity and a subsequent enqueue keeps the
|
||||
/// count at capacity, exactly one event has been dropped.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per the M2 plan: the ring is the absolute-last-resort buffer for the
|
||||
/// hot-path; it is NOT a substitute for the bounded
|
||||
/// <see cref="SqliteAuditWriter"/> write queue.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class RingBufferFallback
|
||||
{
|
||||
private readonly Channel<AuditEvent> _channel;
|
||||
private readonly int _capacity;
|
||||
|
||||
/// <summary>
|
||||
/// Raised once each time a drop-oldest overflow occurs. Hooked by
|
||||
/// <see cref="FallbackAuditWriter"/>'s health counter wiring.
|
||||
/// </summary>
|
||||
public event Action? RingBufferOverflowed;
|
||||
|
||||
/// <summary>Initializes the ring buffer with the specified fixed capacity.</summary>
|
||||
/// <param name="capacity">Maximum number of events to buffer; must be greater than zero. Default is 1024.</param>
|
||||
public RingBufferFallback(int capacity = 1024)
|
||||
{
|
||||
if (capacity <= 0)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(capacity), "capacity must be > 0.");
|
||||
}
|
||||
|
||||
_capacity = capacity;
|
||||
_channel = Channel.CreateBounded<AuditEvent>(new BoundedChannelOptions(capacity)
|
||||
{
|
||||
FullMode = BoundedChannelFullMode.DropOldest,
|
||||
SingleReader = true,
|
||||
SingleWriter = false,
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>Current event count in the ring (for diagnostics/tests).</summary>
|
||||
public int Count => _channel.Reader.Count;
|
||||
|
||||
/// <summary>
|
||||
/// Try to enqueue an event. Returns <see langword="true"/> on success (even
|
||||
/// when an overflow caused an older event to be dropped); returns
|
||||
/// <see langword="false"/> only when the ring has been
|
||||
/// <see cref="Complete"/>-d.
|
||||
/// </summary>
|
||||
/// <param name="evt">The audit event to enqueue.</param>
|
||||
/// <returns><see langword="true"/> if enqueued (or enqueued with overflow); <see langword="false"/> when the channel is completed.</returns>
|
||||
public bool TryEnqueue(AuditEvent evt)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(evt);
|
||||
|
||||
// DropOldest TryWrite always succeeds unless the channel is completed.
|
||||
// Detect overflow by comparing the count before vs. after: if we were
|
||||
// already at capacity and remain at capacity, exactly one event was
|
||||
// dropped to make room for evt.
|
||||
var beforeCount = _channel.Reader.Count;
|
||||
if (!_channel.Writer.TryWrite(evt))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (beforeCount >= _capacity)
|
||||
{
|
||||
// The new event displaced an existing one.
|
||||
RingBufferOverflowed?.Invoke();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Drain the ring in FIFO order. Yields available events immediately and
|
||||
/// then completes when the channel is empty AND <see cref="Complete"/> has
|
||||
/// been called. Callers that only want to drain what's currently buffered
|
||||
/// must call <see cref="Complete"/> first.
|
||||
/// </summary>
|
||||
/// <param name="cancellationToken">Cancellation token to abort the async enumeration.</param>
|
||||
public async IAsyncEnumerable<AuditEvent> DrainAsync(
|
||||
[EnumeratorCancellation] CancellationToken cancellationToken)
|
||||
{
|
||||
await foreach (var evt in _channel.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
yield return evt;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Non-blocking single-item dequeue used by the
|
||||
/// <see cref="FallbackAuditWriter"/> recovery path. Returns
|
||||
/// <see langword="false"/> when the ring is empty.
|
||||
/// </summary>
|
||||
/// <param name="evt">When this returns <see langword="true"/>, contains the dequeued event.</param>
|
||||
/// <returns><see langword="true"/> if an event was dequeued; <see langword="false"/> if the ring is empty.</returns>
|
||||
public bool TryDequeue(out AuditEvent evt) => _channel.Reader.TryRead(out evt!);
|
||||
|
||||
/// <summary>
|
||||
/// Mark the ring as no-more-writes. <see cref="DrainAsync"/> will yield the
|
||||
/// remaining events and then complete.
|
||||
/// </summary>
|
||||
public void Complete() => _channel.Writer.TryComplete();
|
||||
}
|
||||
@@ -0,0 +1,138 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that
|
||||
/// periodically pulls a backlog snapshot from <see cref="ISiteAuditQueue"/>
|
||||
/// and pushes it into <see cref="ISiteHealthCollector"/> so the next
|
||||
/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
|
||||
/// <c>SiteAuditBacklog</c> field on the site health report.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Why a hosted service, not the report sender.</b> Querying SQLite for the
|
||||
/// backlog requires the queue's write lock; doing it inline in
|
||||
/// <see cref="ISiteHealthCollector.CollectReport"/> would couple the collector
|
||||
/// to <see cref="ISiteAuditQueue"/> and turn an in-memory snapshot read into
|
||||
/// a synchronous I/O call on the report path. The hosted-service pattern keeps
|
||||
/// the report path pure and the SQL probe off the report timing budget.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Cadence.</b> 30 s by default — coarse enough to amortise the SQL probe
|
||||
/// across many reports, fine enough that the central dashboard never lags by
|
||||
/// more than one health-report interval. Tunable via
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Site.SqliteAuditWriterOptions"/> in a follow-up
|
||||
/// if ops needs a different cadence; for M6 we hard-code the value because the
|
||||
/// brief calls it out explicitly.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Failure containment.</b> The probe call is wrapped in a try/catch so a
|
||||
/// transient SQLite error never tears down the hosted service — the next tick
|
||||
/// retries. Mirrors <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPartitionMaintenanceService"/>'s
|
||||
/// "exception logged, not propagated" contract.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
|
||||
{
|
||||
/// <summary>
|
||||
/// Default poll cadence. Half a typical 60 s health-report interval keeps
|
||||
/// the snapshot fresh without spinning the SQL probe more often than
|
||||
/// necessary.
|
||||
/// </summary>
|
||||
internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);
|
||||
|
||||
private readonly ISiteAuditQueue _queue;
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
private readonly ILogger<SiteAuditBacklogReporter> _logger;
|
||||
private readonly TimeSpan _refreshInterval;
|
||||
private CancellationTokenSource? _cts;
|
||||
private Task? _loop;
|
||||
|
||||
/// <summary>Initializes a new instance of <see cref="SiteAuditBacklogReporter"/>.</summary>
|
||||
/// <param name="queue">The site audit queue used to probe the backlog count.</param>
|
||||
/// <param name="collector">The site health collector that receives the backlog snapshot.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="refreshInterval">Poll interval override; defaults to <see cref="DefaultRefreshInterval"/> (30 s).</param>
|
||||
public SiteAuditBacklogReporter(
|
||||
ISiteAuditQueue queue,
|
||||
ISiteHealthCollector collector,
|
||||
ILogger<SiteAuditBacklogReporter> logger,
|
||||
TimeSpan? refreshInterval = null)
|
||||
{
|
||||
_queue = queue ?? throw new ArgumentNullException(nameof(queue));
|
||||
_collector = collector ?? throw new ArgumentNullException(nameof(collector));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_refreshInterval = refreshInterval ?? DefaultRefreshInterval;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task StartAsync(CancellationToken ct)
|
||||
{
|
||||
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
|
||||
// token both terminate the loop; either side firing aborts the
|
||||
// pending Task.Delay.
|
||||
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private async Task RunLoopAsync(CancellationToken ct)
|
||||
{
|
||||
// First tick runs immediately so the very first health report after
|
||||
// process start carries a real backlog snapshot — without this the
|
||||
// dashboard would show null for the first 30 s after a deploy.
|
||||
await SafeProbeAsync(ct).ConfigureAwait(false);
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
await SafeProbeAsync(ct).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task SafeProbeAsync(CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false);
|
||||
_collector.UpdateSiteAuditBacklog(snapshot);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Shutdown — let the outer loop exit cleanly.
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Catch-all is deliberate: the hosted service must survive every
|
||||
// class of probe failure (transient SQLite lock contention, disk
|
||||
// I/O hiccup, …) so the next tick gets a chance.
|
||||
_logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry.");
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task StopAsync(CancellationToken ct)
|
||||
{
|
||||
_cts?.Cancel();
|
||||
return _loop ?? Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public void Dispose()
|
||||
{
|
||||
_cts?.Dispose();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,913 @@
|
||||
using System.Threading.Channels;
|
||||
using Microsoft.Data.Sqlite;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Site-side SQLite hot-path writer for Audit Log (#23) events. Mirrors the
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.SiteEventLogging.SiteEventLogger"/> design — a single
|
||||
/// owned <see cref="SqliteConnection"/> serialised behind a write lock, fed by a
|
||||
/// bounded <see cref="Channel{T}"/> drained on a dedicated background writer
|
||||
/// task — so script-thread callers never block on disk I/O.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The schema is bootstrapped in the constructor (Bundle B-T1). The
|
||||
/// Channel-based <see cref="WriteAsync"/> hot-path + Bundle D
|
||||
/// <see cref="ReadPendingAsync"/> / <see cref="MarkForwardedAsync"/> support
|
||||
/// surface are wired in Bundle B-T2.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Site rows always carry <see cref="AuditForwardState.Pending"/> on first
|
||||
/// insert; the central row-shape's <c>IngestedAtUtc</c> column does NOT live in
|
||||
/// the site SQLite schema — central stamps it on ingest.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable, IDisposable
|
||||
{
|
||||
// Microsoft.Data.Sqlite reports a generic SQLITE_CONSTRAINT (error code 19)
|
||||
// on a PRIMARY KEY violation; the extended subcode 1555 (SQLITE_CONSTRAINT_PRIMARYKEY)
|
||||
// is exposed via SqliteException.SqliteExtendedErrorCode but isn't reliably
|
||||
// surfaced across all SQLite builds. We treat any constraint error on insert
|
||||
// as a duplicate-eventid race and swallow it (first-write-wins) — the index
|
||||
// on EventId is the only constraint on this table, so this scope is precise.
|
||||
private const int SqliteErrorConstraint = 19;
|
||||
|
||||
private readonly SqliteConnection _connection;
|
||||
// AuditLog-005: dedicated read-only connection used by GetBacklogStatsAsync,
|
||||
// ReadPendingAsync, ReadPendingSinceAsync, and ReadForwardedAsync so a slow
|
||||
// backlog scan (COUNT(*) over hundreds of thousands of Pending rows under a
|
||||
// central outage) never parks the hot-path writer behind _writeLock.
|
||||
// SQLite-with-WAL allows a second connection on the same file to read
|
||||
// concurrently with the writer; the writer's WAL pragma is set in
|
||||
// InitializeSchema before this connection is opened. The reader connection
|
||||
// has its own _readLock because SqliteConnection itself is not thread-safe
|
||||
// even in read-only mode — multiple read callers can otherwise interleave
|
||||
// commands on the shared connection.
|
||||
private readonly SqliteConnection _readConnection;
|
||||
private readonly object _readLock = new();
|
||||
private readonly SqliteAuditWriterOptions _options;
|
||||
private readonly ILogger<SqliteAuditWriter> _logger;
|
||||
private readonly INodeIdentityProvider _nodeIdentity;
|
||||
private readonly object _writeLock = new();
|
||||
private readonly Channel<PendingAuditEvent> _writeQueue;
|
||||
private readonly Task _writerLoop;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>Initializes a new instance of the SqliteAuditWriter class.</summary>
|
||||
/// <param name="options">Configuration options for the audit writer.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="nodeIdentity">Node identity provider.</param>
|
||||
/// <param name="connectionStringOverride">Optional connection string override.</param>
|
||||
public SqliteAuditWriter(
|
||||
IOptions<SqliteAuditWriterOptions> options,
|
||||
ILogger<SqliteAuditWriter> logger,
|
||||
INodeIdentityProvider nodeIdentity,
|
||||
string? connectionStringOverride = null)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(options);
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
ArgumentNullException.ThrowIfNull(nodeIdentity);
|
||||
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
_nodeIdentity = nodeIdentity;
|
||||
|
||||
var connectionString = connectionStringOverride
|
||||
?? $"Data Source={_options.DatabasePath};Cache=Shared";
|
||||
_connection = new SqliteConnection(connectionString);
|
||||
_connection.Open();
|
||||
|
||||
InitializeSchema();
|
||||
|
||||
// AuditLog-005: open a second connection for read-only callers
|
||||
// (GetBacklogStatsAsync, ReadPendingAsync, ReadPendingSinceAsync,
|
||||
// ReadForwardedAsync). InitializeSchema set journal_mode=WAL on the
|
||||
// writer connection, which is a database-level setting that persists
|
||||
// for the file — subsequent connections to the same file see WAL and
|
||||
// can read concurrently with the writer without taking _writeLock.
|
||||
// Reuse the same connection string so the read connection sees the
|
||||
// same Data Source / Cache settings as the writer.
|
||||
_readConnection = new SqliteConnection(connectionString);
|
||||
_readConnection.Open();
|
||||
|
||||
_writeQueue = Channel.CreateBounded<PendingAuditEvent>(
|
||||
new BoundedChannelOptions(_options.ChannelCapacity)
|
||||
{
|
||||
// The hot-path enqueue must back-pressure if the background
|
||||
// writer falls behind; a higher-level fallback (Bundle B-T4)
|
||||
// handles truly catastrophic primary failure with a drop-oldest
|
||||
// ring buffer.
|
||||
FullMode = BoundedChannelFullMode.Wait,
|
||||
SingleReader = true,
|
||||
SingleWriter = false,
|
||||
});
|
||||
_writerLoop = Task.Run(ProcessWriteQueueAsync);
|
||||
}
|
||||
|
||||
private void InitializeSchema()
|
||||
{
|
||||
// auto_vacuum must be set before any table is created for it to take
|
||||
// effect on a fresh database. INCREMENTAL lets a future
|
||||
// `PRAGMA incremental_vacuum` shrink the file after the 7-day retention
|
||||
// purge — see alog.md §10.
|
||||
using (var pragmaCmd = _connection.CreateCommand())
|
||||
{
|
||||
pragmaCmd.CommandText = "PRAGMA auto_vacuum = INCREMENTAL";
|
||||
pragmaCmd.ExecuteNonQuery();
|
||||
}
|
||||
|
||||
// AuditLog-005: enable WAL so a second connection on the same file can
|
||||
// serve read-only callers (GetBacklogStatsAsync, ReadPendingAsync,
|
||||
// ReadPendingSinceAsync, ReadForwardedAsync) concurrently with the
|
||||
// batched writer, decoupling those reads from _writeLock. WAL is a
|
||||
// database-level setting persisted in the file header; setting it on
|
||||
// the writer connection means every connection opened to the file
|
||||
// afterwards inherits WAL behaviour. PRAGMA journal_mode returns the
|
||||
// mode actually adopted ("memory" for ":memory:" / shared-cache memory
|
||||
// mode, "wal" for file-backed) — we don't error if WAL was rejected
|
||||
// because the read connection's correctness does not depend on WAL
|
||||
// itself, only its concurrency advantage does.
|
||||
using (var pragmaCmd = _connection.CreateCommand())
|
||||
{
|
||||
pragmaCmd.CommandText = "PRAGMA journal_mode = WAL";
|
||||
pragmaCmd.ExecuteNonQuery();
|
||||
}
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
CREATE TABLE IF NOT EXISTS AuditLog (
|
||||
EventId TEXT NOT NULL,
|
||||
OccurredAtUtc TEXT NOT NULL,
|
||||
Channel TEXT NOT NULL,
|
||||
Kind TEXT NOT NULL,
|
||||
CorrelationId TEXT NULL,
|
||||
SourceSiteId TEXT NULL,
|
||||
SourceNode TEXT NULL,
|
||||
SourceInstanceId TEXT NULL,
|
||||
SourceScript TEXT NULL,
|
||||
Actor TEXT NULL,
|
||||
Target TEXT NULL,
|
||||
Status TEXT NOT NULL,
|
||||
HttpStatus INTEGER NULL,
|
||||
DurationMs INTEGER NULL,
|
||||
ErrorMessage TEXT NULL,
|
||||
ErrorDetail TEXT NULL,
|
||||
RequestSummary TEXT NULL,
|
||||
ResponseSummary TEXT NULL,
|
||||
PayloadTruncated INTEGER NOT NULL,
|
||||
Extra TEXT NULL,
|
||||
ForwardState TEXT NOT NULL,
|
||||
ExecutionId TEXT NULL,
|
||||
ParentExecutionId TEXT NULL,
|
||||
PRIMARY KEY (EventId)
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS IX_SiteAuditLog_ForwardState_Occurred
|
||||
ON AuditLog (ForwardState, OccurredAtUtc);
|
||||
""";
|
||||
cmd.ExecuteNonQuery();
|
||||
|
||||
// Audit Log #23 (ExecutionId): additively add the ExecutionId column.
|
||||
// CREATE TABLE IF NOT EXISTS above does NOT add columns to an AuditLog
|
||||
// table that already exists from a pre-ExecutionId build, so an
|
||||
// auditlog.db created by an older build needs the column ALTER-ed in.
|
||||
// The file is durable across restart/failover by design (7-day
|
||||
// retention), so without this step every WriteAsync on an upgraded
|
||||
// deployment would bind $ExecutionId against a missing column and the
|
||||
// best-effort write path would silently drop every site audit row.
|
||||
// SQLite has no "ADD COLUMN IF NOT EXISTS"; the column presence is
|
||||
// probed first and the ALTER skipped when already there. The column is
|
||||
// nullable with no default, so any row written before this migration
|
||||
// reads back ExecutionId = null (back-compat).
|
||||
AddColumnIfMissing("ExecutionId", "TEXT NULL");
|
||||
|
||||
// Audit Log #23 (ParentExecutionId): same idempotent upgrade path as
|
||||
// ExecutionId above. A deployment that already ran the ExecutionId
|
||||
// branch has an auditlog.db with the 21-column schema and no
|
||||
// ParentExecutionId column; CREATE TABLE IF NOT EXISTS cannot add it,
|
||||
// so it is ALTER-ed in here. Nullable with no default — rows written
|
||||
// before this migration read back ParentExecutionId = null.
|
||||
AddColumnIfMissing("ParentExecutionId", "TEXT NULL");
|
||||
|
||||
// SourceNode stamping: same idempotent upgrade path as ExecutionId /
|
||||
// ParentExecutionId above. A deployment that already ran the
|
||||
// ParentExecutionId branch has an auditlog.db with the 22-column
|
||||
// schema and no SourceNode column; CREATE TABLE IF NOT EXISTS cannot
|
||||
// add it, so it is ALTER-ed in here. Nullable with no default — rows
|
||||
// written before this migration read back SourceNode = null.
|
||||
AddColumnIfMissing("SourceNode", "TEXT NULL");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log #23: additively adds a column to <c>AuditLog</c> only when
|
||||
/// it is not already present (used for <c>ExecutionId</c> and
|
||||
/// <c>ParentExecutionId</c>). SQLite lacks <c>ADD COLUMN IF NOT EXISTS</c>,
|
||||
/// so the schema is probed via <c>PRAGMA table_info</c> first. Idempotent —
|
||||
/// safe to run on every <see cref="InitializeSchema"/>. Mirrors
|
||||
/// <c>StoreAndForwardStorage.AddColumnIfMissingAsync</c>; kept synchronous
|
||||
/// here to match the rest of this writer's bootstrap DDL.
|
||||
/// </summary>
|
||||
private void AddColumnIfMissing(string columnName, string columnDefinition)
|
||||
{
|
||||
using var probe = _connection.CreateCommand();
|
||||
probe.CommandText = "SELECT COUNT(*) FROM pragma_table_info('AuditLog') WHERE name = $name";
|
||||
probe.Parameters.AddWithValue("$name", columnName);
|
||||
var exists = Convert.ToInt32(probe.ExecuteScalar()) > 0;
|
||||
if (exists)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
using var alter = _connection.CreateCommand();
|
||||
// Column name + definition are caller-controlled constants, never user
|
||||
// input — safe to interpolate (parameters are not permitted in DDL).
|
||||
alter.CommandText = $"ALTER TABLE AuditLog ADD COLUMN {columnName} {columnDefinition}";
|
||||
alter.ExecuteNonQuery();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(evt);
|
||||
|
||||
// Site rows always carry a non-null ForwardState; central rows leave it
|
||||
// null. Force Pending on enqueue so callers can pass a bare AuditEvent
|
||||
// without thinking about site-vs-central provenance.
|
||||
var siteEvt = evt.ForwardState is null
|
||||
? evt with { ForwardState = AuditForwardState.Pending }
|
||||
: evt;
|
||||
|
||||
var pending = new PendingAuditEvent(siteEvt);
|
||||
|
||||
// CreateBounded(FullMode=Wait) means WriteAsync will await room rather
|
||||
// than throw when full — exactly the hot-path back-pressure semantics
|
||||
// we want.
|
||||
if (!_writeQueue.Writer.TryWrite(pending))
|
||||
{
|
||||
// The writer is either completed (logger disposed) or the channel
|
||||
// is at capacity. Fall back to the async path which honours the
|
||||
// FullMode=Wait policy.
|
||||
return WriteSlowPathAsync(pending, ct);
|
||||
}
|
||||
|
||||
return pending.Completion.Task;
|
||||
}
|
||||
|
||||
private async Task WriteSlowPathAsync(PendingAuditEvent pending, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _writeQueue.Writer.WriteAsync(pending, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (ChannelClosedException)
|
||||
{
|
||||
pending.Completion.TrySetException(
|
||||
new ObjectDisposedException(nameof(SqliteAuditWriter),
|
||||
"Event could not be recorded: the audit writer has been disposed."));
|
||||
}
|
||||
|
||||
await pending.Completion.Task.ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private async Task ProcessWriteQueueAsync()
|
||||
{
|
||||
var batch = new List<PendingAuditEvent>(_options.BatchSize);
|
||||
|
||||
// ReadAllAsync completes when the channel is marked complete (Dispose).
|
||||
await foreach (var first in _writeQueue.Reader.ReadAllAsync().ConfigureAwait(false))
|
||||
{
|
||||
batch.Clear();
|
||||
batch.Add(first);
|
||||
|
||||
// Pull additional ready events up to BatchSize. TryRead is non-
|
||||
// blocking and lets us amortise the transaction overhead across a
|
||||
// burst of concurrent enqueues.
|
||||
while (batch.Count < _options.BatchSize &&
|
||||
_writeQueue.Reader.TryRead(out var next))
|
||||
{
|
||||
batch.Add(next);
|
||||
}
|
||||
|
||||
FlushBatch(batch);
|
||||
}
|
||||
}
|
||||
|
||||
private void FlushBatch(IReadOnlyList<PendingAuditEvent> batch)
|
||||
{
|
||||
lock (_writeLock)
|
||||
{
|
||||
if (_disposed)
|
||||
{
|
||||
foreach (var pending in batch)
|
||||
{
|
||||
pending.Completion.TrySetException(
|
||||
new ObjectDisposedException(nameof(SqliteAuditWriter),
|
||||
"Event could not be recorded: the audit writer was disposed before the write completed."));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
using var transaction = _connection.BeginTransaction();
|
||||
try
|
||||
{
|
||||
using var cmd = _connection.CreateCommand();
|
||||
cmd.Transaction = transaction;
|
||||
cmd.CommandText = """
|
||||
INSERT INTO AuditLog (
|
||||
EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
|
||||
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
|
||||
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
|
||||
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
|
||||
ExecutionId, ParentExecutionId
|
||||
) VALUES (
|
||||
$EventId, $OccurredAtUtc, $Channel, $Kind, $CorrelationId,
|
||||
$SourceSiteId, $SourceNode, $SourceInstanceId, $SourceScript, $Actor, $Target,
|
||||
$Status, $HttpStatus, $DurationMs, $ErrorMessage, $ErrorDetail,
|
||||
$RequestSummary, $ResponseSummary, $PayloadTruncated, $Extra, $ForwardState,
|
||||
$ExecutionId, $ParentExecutionId
|
||||
);
|
||||
""";
|
||||
|
||||
var pEventId = cmd.Parameters.Add("$EventId", SqliteType.Text);
|
||||
var pOccurredAt = cmd.Parameters.Add("$OccurredAtUtc", SqliteType.Text);
|
||||
var pChannel = cmd.Parameters.Add("$Channel", SqliteType.Text);
|
||||
var pKind = cmd.Parameters.Add("$Kind", SqliteType.Text);
|
||||
var pCorrelationId = cmd.Parameters.Add("$CorrelationId", SqliteType.Text);
|
||||
var pSourceSiteId = cmd.Parameters.Add("$SourceSiteId", SqliteType.Text);
|
||||
var pSourceNode = cmd.Parameters.Add("$SourceNode", SqliteType.Text);
|
||||
var pSourceInstanceId = cmd.Parameters.Add("$SourceInstanceId", SqliteType.Text);
|
||||
var pSourceScript = cmd.Parameters.Add("$SourceScript", SqliteType.Text);
|
||||
var pActor = cmd.Parameters.Add("$Actor", SqliteType.Text);
|
||||
var pTarget = cmd.Parameters.Add("$Target", SqliteType.Text);
|
||||
var pStatus = cmd.Parameters.Add("$Status", SqliteType.Text);
|
||||
var pHttpStatus = cmd.Parameters.Add("$HttpStatus", SqliteType.Integer);
|
||||
var pDurationMs = cmd.Parameters.Add("$DurationMs", SqliteType.Integer);
|
||||
var pErrorMessage = cmd.Parameters.Add("$ErrorMessage", SqliteType.Text);
|
||||
var pErrorDetail = cmd.Parameters.Add("$ErrorDetail", SqliteType.Text);
|
||||
var pRequestSummary = cmd.Parameters.Add("$RequestSummary", SqliteType.Text);
|
||||
var pResponseSummary = cmd.Parameters.Add("$ResponseSummary", SqliteType.Text);
|
||||
var pPayloadTruncated = cmd.Parameters.Add("$PayloadTruncated", SqliteType.Integer);
|
||||
var pExtra = cmd.Parameters.Add("$Extra", SqliteType.Text);
|
||||
var pForwardState = cmd.Parameters.Add("$ForwardState", SqliteType.Text);
|
||||
var pExecutionId = cmd.Parameters.Add("$ExecutionId", SqliteType.Text);
|
||||
var pParentExecutionId = cmd.Parameters.Add("$ParentExecutionId", SqliteType.Text);
|
||||
|
||||
foreach (var pending in batch)
|
||||
{
|
||||
var e = pending.Event;
|
||||
pEventId.Value = e.EventId.ToString();
|
||||
pOccurredAt.Value = e.OccurredAtUtc.ToString("o");
|
||||
pChannel.Value = e.Channel.ToString();
|
||||
pKind.Value = e.Kind.ToString();
|
||||
pCorrelationId.Value = (object?)e.CorrelationId?.ToString() ?? DBNull.Value;
|
||||
pSourceSiteId.Value = (object?)e.SourceSiteId ?? DBNull.Value;
|
||||
// SourceNode-stamping: caller-provided value wins (preserves
|
||||
// rows reconciled in from other nodes via the same writer);
|
||||
// otherwise stamp from the local INodeIdentityProvider. The
|
||||
// event record itself is NOT mutated — stamping is at write
|
||||
// time only. If the provider also returns null (unconfigured
|
||||
// node), the row's SourceNode stays NULL — operators see
|
||||
// "needs config" via the schema, not a magic fallback string.
|
||||
var sourceNode = e.SourceNode ?? _nodeIdentity.NodeName;
|
||||
pSourceNode.Value = (object?)sourceNode ?? DBNull.Value;
|
||||
pSourceInstanceId.Value = (object?)e.SourceInstanceId ?? DBNull.Value;
|
||||
pSourceScript.Value = (object?)e.SourceScript ?? DBNull.Value;
|
||||
pActor.Value = (object?)e.Actor ?? DBNull.Value;
|
||||
pTarget.Value = (object?)e.Target ?? DBNull.Value;
|
||||
pStatus.Value = e.Status.ToString();
|
||||
pHttpStatus.Value = (object?)e.HttpStatus ?? DBNull.Value;
|
||||
pDurationMs.Value = (object?)e.DurationMs ?? DBNull.Value;
|
||||
pErrorMessage.Value = (object?)e.ErrorMessage ?? DBNull.Value;
|
||||
pErrorDetail.Value = (object?)e.ErrorDetail ?? DBNull.Value;
|
||||
pRequestSummary.Value = (object?)e.RequestSummary ?? DBNull.Value;
|
||||
pResponseSummary.Value = (object?)e.ResponseSummary ?? DBNull.Value;
|
||||
pPayloadTruncated.Value = e.PayloadTruncated ? 1 : 0;
|
||||
pExtra.Value = (object?)e.Extra ?? DBNull.Value;
|
||||
pForwardState.Value = (e.ForwardState ?? AuditForwardState.Pending).ToString();
|
||||
pExecutionId.Value = (object?)e.ExecutionId?.ToString() ?? DBNull.Value;
|
||||
pParentExecutionId.Value = (object?)e.ParentExecutionId?.ToString() ?? DBNull.Value;
|
||||
|
||||
try
|
||||
{
|
||||
cmd.ExecuteNonQuery();
|
||||
pending.Completion.TrySetResult();
|
||||
}
|
||||
catch (SqliteException ex) when (ex.SqliteErrorCode == SqliteErrorConstraint)
|
||||
{
|
||||
// Duplicate EventId — first-write-wins (alog.md §11).
|
||||
// Treat as success: the lifecycle event is durably
|
||||
// recorded under the first writer's payload.
|
||||
_logger.LogDebug(ex,
|
||||
"Duplicate EventId {EventId} swallowed by SqliteAuditWriter",
|
||||
e.EventId);
|
||||
pending.Completion.TrySetResult();
|
||||
}
|
||||
}
|
||||
|
||||
transaction.Commit();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
transaction.Rollback();
|
||||
_logger.LogError(ex, "SqliteAuditWriter batch insert failed; faulting {Count} pending events", batch.Count);
|
||||
foreach (var pending in batch)
|
||||
{
|
||||
pending.Completion.TrySetException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// AuditLog-001: cached-lifecycle audit kinds that ride the combined-telemetry
|
||||
// drain (joined with the operational tracking row + pushed via
|
||||
// IngestCachedTelemetryAsync into the central dual-write transaction).
|
||||
// ReadPendingAsync EXCLUDES these so the audit-only drain doesn't double-emit
|
||||
// them; ReadPendingCachedTelemetryAsync below is the dedicated read surface
|
||||
// the new SiteAuditTelemetryActor cached-drain uses.
|
||||
private static readonly string[] CachedTelemetryKindNames =
|
||||
{
|
||||
nameof(AuditKind.CachedSubmit),
|
||||
nameof(AuditKind.ApiCallCached),
|
||||
nameof(AuditKind.DbWriteCached),
|
||||
nameof(AuditKind.CachedResolve),
|
||||
};
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<IReadOnlyList<AuditEvent>> ReadPendingAsync(int limit, CancellationToken ct = default)
|
||||
{
|
||||
if (limit <= 0)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
|
||||
}
|
||||
|
||||
// AuditLog-005: read via the dedicated _readConnection so this scan
|
||||
// (which can be expensive when the backlog grows under a central
|
||||
// outage) does not block the batched writer on _writeLock. WAL mode
|
||||
// gives us a stable snapshot of the table while writes proceed on the
|
||||
// writer connection. _readLock serialises this connection across
|
||||
// multiple concurrent read callers since SqliteConnection itself is
|
||||
// not thread-safe.
|
||||
// AuditLog-001: NOT IN ($cached1,$cached2,$cached3,$cached4) excludes the
|
||||
// cached-lifecycle kinds — they flow through ReadPendingCachedTelemetryAsync
|
||||
// + the combined-telemetry drain. Kind is stored as the enum's name (see
|
||||
// FlushBatch's pKind.Value), so a string-IN against the constant kind
|
||||
// names matches the on-disk shape exactly.
|
||||
lock (_readLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
using var cmd = _readConnection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
|
||||
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
|
||||
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
|
||||
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
|
||||
ExecutionId, ParentExecutionId
|
||||
FROM AuditLog
|
||||
WHERE ForwardState = $pending
|
||||
AND Kind NOT IN ($k0, $k1, $k2, $k3)
|
||||
ORDER BY OccurredAtUtc ASC, EventId ASC
|
||||
LIMIT $limit;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
|
||||
cmd.Parameters.AddWithValue("$k0", CachedTelemetryKindNames[0]);
|
||||
cmd.Parameters.AddWithValue("$k1", CachedTelemetryKindNames[1]);
|
||||
cmd.Parameters.AddWithValue("$k2", CachedTelemetryKindNames[2]);
|
||||
cmd.Parameters.AddWithValue("$k3", CachedTelemetryKindNames[3]);
|
||||
cmd.Parameters.AddWithValue("$limit", limit);
|
||||
|
||||
var rows = new List<AuditEvent>(Math.Min(limit, 256));
|
||||
using var reader = cmd.ExecuteReader();
|
||||
while (reader.Read())
|
||||
{
|
||||
rows.Add(MapRow(reader));
|
||||
}
|
||||
|
||||
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<IReadOnlyList<AuditEvent>> ReadPendingCachedTelemetryAsync(
|
||||
int limit, CancellationToken ct = default)
|
||||
{
|
||||
if (limit <= 0)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
|
||||
}
|
||||
|
||||
// AuditLog-001: dedicated read surface for the cached-call lifecycle
|
||||
// drain — symmetric to ReadPendingAsync but filtered to the four
|
||||
// cached AuditKinds. Same _readConnection + _readLock pattern so the
|
||||
// hot-path writer is not contended.
|
||||
lock (_readLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
using var cmd = _readConnection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
|
||||
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
|
||||
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
|
||||
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
|
||||
ExecutionId, ParentExecutionId
|
||||
FROM AuditLog
|
||||
WHERE ForwardState = $pending
|
||||
AND Kind IN ($k0, $k1, $k2, $k3)
|
||||
ORDER BY OccurredAtUtc ASC, EventId ASC
|
||||
LIMIT $limit;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
|
||||
cmd.Parameters.AddWithValue("$k0", CachedTelemetryKindNames[0]);
|
||||
cmd.Parameters.AddWithValue("$k1", CachedTelemetryKindNames[1]);
|
||||
cmd.Parameters.AddWithValue("$k2", CachedTelemetryKindNames[2]);
|
||||
cmd.Parameters.AddWithValue("$k3", CachedTelemetryKindNames[3]);
|
||||
cmd.Parameters.AddWithValue("$limit", limit);
|
||||
|
||||
var rows = new List<AuditEvent>(Math.Min(limit, 256));
|
||||
using var reader = cmd.ExecuteReader();
|
||||
while (reader.Read())
|
||||
{
|
||||
rows.Add(MapRow(reader));
|
||||
}
|
||||
|
||||
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns up to <paramref name="limit"/> rows in
|
||||
/// <see cref="AuditForwardState.Forwarded"/>, oldest
|
||||
/// <see cref="AuditEvent.OccurredAtUtc"/> first, with
|
||||
/// <see cref="AuditEvent.EventId"/> as the deterministic tiebreaker. The
|
||||
/// <see cref="AuditForwardState.Forwarded"/>-specific counterpart of
|
||||
/// <see cref="ReadPendingAsync"/>; used by tests to assert a row reached the
|
||||
/// <see cref="AuditForwardState.Forwarded"/> state specifically (unlike
|
||||
/// <see cref="ReadPendingSinceAsync"/>, which also returns
|
||||
/// <see cref="AuditForwardState.Pending"/> rows).
|
||||
/// </summary>
|
||||
/// <param name="limit">Maximum number of rows to return.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
public Task<IReadOnlyList<AuditEvent>> ReadForwardedAsync(int limit, CancellationToken ct = default)
|
||||
{
|
||||
if (limit <= 0)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
|
||||
}
|
||||
|
||||
// AuditLog-005: mirror ReadPendingAsync — read via _readConnection /
|
||||
// _readLock so this query never contends with the batched writer on
|
||||
// _writeLock.
|
||||
lock (_readLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
using var cmd = _readConnection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
|
||||
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
|
||||
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
|
||||
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
|
||||
ExecutionId, ParentExecutionId
|
||||
FROM AuditLog
|
||||
WHERE ForwardState = $forwarded
|
||||
ORDER BY OccurredAtUtc ASC, EventId ASC
|
||||
LIMIT $limit;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
|
||||
cmd.Parameters.AddWithValue("$limit", limit);
|
||||
|
||||
var rows = new List<AuditEvent>(Math.Min(limit, 256));
|
||||
using var reader = cmd.ExecuteReader();
|
||||
while (reader.Read())
|
||||
{
|
||||
rows.Add(MapRow(reader));
|
||||
}
|
||||
|
||||
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task MarkForwardedAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(eventIds);
|
||||
if (eventIds.Count == 0)
|
||||
{
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
lock (_writeLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
// Build a single IN (...) parameter list so we issue one UPDATE per
|
||||
// batch regardless of size. Each id is bound as its own parameter,
|
||||
// so no string concatenation of user data ever enters the SQL.
|
||||
var sb = new System.Text.StringBuilder();
|
||||
sb.Append("UPDATE AuditLog SET ForwardState = $forwarded WHERE EventId IN (");
|
||||
for (int i = 0; i < eventIds.Count; i++)
|
||||
{
|
||||
if (i > 0) sb.Append(',');
|
||||
var p = $"$id{i}";
|
||||
sb.Append(p);
|
||||
cmd.Parameters.AddWithValue(p, eventIds[i].ToString());
|
||||
}
|
||||
sb.Append(");");
|
||||
cmd.CommandText = sb.ToString();
|
||||
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
|
||||
|
||||
cmd.ExecuteNonQuery();
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<IReadOnlyList<AuditEvent>> ReadPendingSinceAsync(
|
||||
DateTime sinceUtc, int batchSize, CancellationToken ct = default)
|
||||
{
|
||||
if (batchSize <= 0)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be > 0.");
|
||||
}
|
||||
|
||||
// AuditLog-005: read via _readConnection / _readLock — same lock-
|
||||
// decoupling as ReadPendingAsync.
|
||||
lock (_readLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
using var cmd = _readConnection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
|
||||
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
|
||||
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
|
||||
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
|
||||
ExecutionId, ParentExecutionId
|
||||
FROM AuditLog
|
||||
WHERE ForwardState IN ($pending, $forwarded)
|
||||
AND OccurredAtUtc >= $since
|
||||
ORDER BY OccurredAtUtc ASC, EventId ASC
|
||||
LIMIT $limit;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
|
||||
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
|
||||
// Normalise to UTC ISO-8601 round-trip format to match how OccurredAtUtc
|
||||
// is stored on insert ("o" format) — string comparison is monotonic for
|
||||
// that encoding so we can index-scan against it.
|
||||
cmd.Parameters.AddWithValue("$since", EnsureUtc(sinceUtc).ToString(
|
||||
"o", System.Globalization.CultureInfo.InvariantCulture));
|
||||
cmd.Parameters.AddWithValue("$limit", batchSize);
|
||||
|
||||
var rows = new List<AuditEvent>(Math.Min(batchSize, 256));
|
||||
using var reader = cmd.ExecuteReader();
|
||||
while (reader.Read())
|
||||
{
|
||||
rows.Add(MapRow(reader));
|
||||
}
|
||||
|
||||
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task MarkReconciledAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(eventIds);
|
||||
if (eventIds.Count == 0)
|
||||
{
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
lock (_writeLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
var sb = new System.Text.StringBuilder();
|
||||
sb.Append("UPDATE AuditLog SET ForwardState = $reconciled ")
|
||||
.Append("WHERE ForwardState IN ($pending, $forwarded) AND EventId IN (");
|
||||
for (int i = 0; i < eventIds.Count; i++)
|
||||
{
|
||||
if (i > 0) sb.Append(',');
|
||||
var p = $"$id{i}";
|
||||
sb.Append(p);
|
||||
cmd.Parameters.AddWithValue(p, eventIds[i].ToString());
|
||||
}
|
||||
sb.Append(");");
|
||||
cmd.CommandText = sb.ToString();
|
||||
cmd.Parameters.AddWithValue("$reconciled", AuditForwardState.Reconciled.ToString());
|
||||
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
|
||||
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
|
||||
|
||||
cmd.ExecuteNonQuery();
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<SiteAuditBacklogSnapshot> GetBacklogStatsAsync(CancellationToken ct = default)
|
||||
{
|
||||
int pendingCount;
|
||||
DateTime? oldestPending;
|
||||
|
||||
// AuditLog-005: read via the dedicated _readConnection (under
|
||||
// _readLock) so this probe — polled every 30 s by SiteAuditBacklogReporter
|
||||
// — never blocks the batched hot-path writer on _writeLock. Under a
|
||||
// central outage the Pending backlog can grow to hundreds of thousands
|
||||
// of rows and the COUNT(*) scan correspondingly stretches; that no
|
||||
// longer adds tail latency to user-facing audit writes.
|
||||
lock (_readLock)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
// Single round-trip — COUNT(*) + MIN(OccurredAtUtc) over the same
|
||||
// index range avoids a second scan. The IX_SiteAuditLog_ForwardState_Occurred
|
||||
// index makes both aggregates cheap (count is a covering scan, min
|
||||
// is the first key).
|
||||
using var cmd = _readConnection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT COUNT(*), MIN(OccurredAtUtc)
|
||||
FROM AuditLog
|
||||
WHERE ForwardState = $pending;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
|
||||
|
||||
using var reader = cmd.ExecuteReader();
|
||||
reader.Read();
|
||||
pendingCount = reader.GetInt32(0);
|
||||
oldestPending = reader.IsDBNull(1)
|
||||
? null
|
||||
: DateTime.Parse(reader.GetString(1),
|
||||
System.Globalization.CultureInfo.InvariantCulture,
|
||||
System.Globalization.DateTimeStyles.RoundtripKind);
|
||||
}
|
||||
|
||||
// File-size lookup outside the lock — the DatabasePath option is the
|
||||
// canonical source. The connection-string-override branch (used by
|
||||
// some tests) keeps the same DatabasePath value, so this works
|
||||
// uniformly. In-memory / mode=memory paths return 0 because the file
|
||||
// doesn't exist on disk.
|
||||
long onDiskBytes = 0;
|
||||
try
|
||||
{
|
||||
if (!string.IsNullOrEmpty(_options.DatabasePath) &&
|
||||
!_options.DatabasePath.StartsWith(":memory:", StringComparison.Ordinal) &&
|
||||
!_options.DatabasePath.Contains("mode=memory", StringComparison.OrdinalIgnoreCase) &&
|
||||
File.Exists(_options.DatabasePath))
|
||||
{
|
||||
onDiskBytes = new FileInfo(_options.DatabasePath).Length;
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// File system probe is a best-effort health-metric — never abort
|
||||
// a backlog snapshot because stat() failed. Log and report 0.
|
||||
_logger.LogDebug(ex,
|
||||
"SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.",
|
||||
_options.DatabasePath);
|
||||
}
|
||||
|
||||
return Task.FromResult(new SiteAuditBacklogSnapshot(
|
||||
PendingCount: pendingCount,
|
||||
OldestPendingUtc: oldestPending,
|
||||
OnDiskBytes: onDiskBytes));
|
||||
}
|
||||
|
||||
private static DateTime EnsureUtc(DateTime value) =>
|
||||
value.Kind == DateTimeKind.Utc
|
||||
? value
|
||||
: DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc);
|
||||
|
||||
private static AuditEvent MapRow(SqliteDataReader reader)
|
||||
{
|
||||
return new AuditEvent
|
||||
{
|
||||
EventId = Guid.Parse(reader.GetString(0)),
|
||||
OccurredAtUtc = DateTime.Parse(reader.GetString(1),
|
||||
System.Globalization.CultureInfo.InvariantCulture,
|
||||
System.Globalization.DateTimeStyles.RoundtripKind),
|
||||
Channel = Enum.Parse<AuditChannel>(reader.GetString(2)),
|
||||
Kind = Enum.Parse<AuditKind>(reader.GetString(3)),
|
||||
CorrelationId = reader.IsDBNull(4) ? null : Guid.Parse(reader.GetString(4)),
|
||||
SourceSiteId = reader.IsDBNull(5) ? null : reader.GetString(5),
|
||||
SourceNode = reader.IsDBNull(6) ? null : reader.GetString(6),
|
||||
SourceInstanceId = reader.IsDBNull(7) ? null : reader.GetString(7),
|
||||
SourceScript = reader.IsDBNull(8) ? null : reader.GetString(8),
|
||||
Actor = reader.IsDBNull(9) ? null : reader.GetString(9),
|
||||
Target = reader.IsDBNull(10) ? null : reader.GetString(10),
|
||||
Status = Enum.Parse<AuditStatus>(reader.GetString(11)),
|
||||
HttpStatus = reader.IsDBNull(12) ? null : reader.GetInt32(12),
|
||||
DurationMs = reader.IsDBNull(13) ? null : reader.GetInt32(13),
|
||||
ErrorMessage = reader.IsDBNull(14) ? null : reader.GetString(14),
|
||||
ErrorDetail = reader.IsDBNull(15) ? null : reader.GetString(15),
|
||||
RequestSummary = reader.IsDBNull(16) ? null : reader.GetString(16),
|
||||
ResponseSummary = reader.IsDBNull(17) ? null : reader.GetString(17),
|
||||
PayloadTruncated = reader.GetInt32(18) != 0,
|
||||
Extra = reader.IsDBNull(19) ? null : reader.GetString(19),
|
||||
ForwardState = Enum.Parse<AuditForwardState>(reader.GetString(20)),
|
||||
ExecutionId = reader.IsDBNull(21) ? null : Guid.Parse(reader.GetString(21)),
|
||||
ParentExecutionId = reader.IsDBNull(22) ? null : Guid.Parse(reader.GetString(22)),
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Disposes the audit writer and releases resources.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// AuditLog-006: prefer <see cref="DisposeAsync"/> when possible (DI honours
|
||||
/// <see cref="IAsyncDisposable"/> on singletons). The sync path remains for
|
||||
/// callers that only know about <see cref="IDisposable"/> (e.g. legacy
|
||||
/// composition roots, <c>using</c> statements without <c>await</c>). To
|
||||
/// avoid the classic sync-over-async deadlock on a captured
|
||||
/// <see cref="SynchronizationContext"/> (ASP.NET request thread, Akka
|
||||
/// dispatcher under some configurations), we hop to the thread pool via
|
||||
/// <see cref="Task.Run(Func{Task})"/> before blocking on the result — the
|
||||
/// async continuation inside <see cref="DisposeAsync"/> then resumes on a
|
||||
/// pool thread with no captured context, so <c>GetResult()</c> never waits
|
||||
/// on the very thread the continuation needs.
|
||||
/// </remarks>
|
||||
public void Dispose()
|
||||
{
|
||||
Task.Run(async () => await DisposeAsync().ConfigureAwait(false))
|
||||
.GetAwaiter().GetResult();
|
||||
}
|
||||
|
||||
/// <summary>Asynchronously disposes the audit writer and releases resources.</summary>
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
Task? writerLoop;
|
||||
lock (_writeLock)
|
||||
{
|
||||
if (_disposed) return;
|
||||
// Stop accepting new events. Completing the channel writer is the
|
||||
// shutdown signal: WriteAsync calls observe the completion and
|
||||
// fault, and the writer loop drains any already-buffered items
|
||||
// before exiting. _disposed is intentionally NOT set here — it
|
||||
// flips only after the loop has fully drained (second lock block
|
||||
// below), so FlushBatch's existing _disposed check guards the
|
||||
// post-drain window when the connection is about to close.
|
||||
_writeQueue.Writer.TryComplete();
|
||||
writerLoop = _writerLoop;
|
||||
}
|
||||
|
||||
// Wait outside the lock — the loop reacquires it for each batch.
|
||||
try
|
||||
{
|
||||
if (writerLoop is not null)
|
||||
{
|
||||
await writerLoop.WaitAsync(TimeSpan.FromSeconds(5)).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
catch (TimeoutException)
|
||||
{
|
||||
_logger.LogWarning("SqliteAuditWriter writer loop did not drain within 5s of dispose.");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// The loop's per-batch try/catch already routed individual failures
|
||||
// to pending TCSes; a top-level fault here is unexpected.
|
||||
_logger.LogError(ex, "SqliteAuditWriter writer loop faulted during dispose.");
|
||||
}
|
||||
|
||||
lock (_writeLock)
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
_connection.Dispose();
|
||||
}
|
||||
|
||||
// AuditLog-005: dispose the dedicated read connection after the writer
|
||||
// is fully drained and closed. _readLock is taken to fence out any
|
||||
// in-flight read caller that grabbed the lock before _disposed flipped
|
||||
// — they observe ObjectDisposedException on the next attempt.
|
||||
lock (_readLock)
|
||||
{
|
||||
_readConnection.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>An audit event awaiting persistence by the background writer.</summary>
|
||||
private sealed class PendingAuditEvent
|
||||
{
|
||||
/// <summary>Initializes a new instance of the PendingAuditEvent class.</summary>
|
||||
/// <param name="evt">The audit event to persist.</param>
|
||||
public PendingAuditEvent(AuditEvent evt)
|
||||
{
|
||||
Event = evt;
|
||||
Completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
|
||||
}
|
||||
|
||||
/// <summary>The audit event to persist.</summary>
|
||||
public AuditEvent Event { get; }
|
||||
/// <summary>Task completion source for write completion signaling.</summary>
|
||||
public TaskCompletionSource Completion { get; }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Options for the site-side SQLite hot-path audit writer.
|
||||
/// Mirrors the ZB.MOM.WW.ScadaBridge.SiteEventLogging pattern: a single SQLite connection
|
||||
/// fed by a background writer task draining a bounded
|
||||
/// <see cref="System.Threading.Channels.Channel{T}"/> so script-thread enqueues
|
||||
/// never block on disk I/O.
|
||||
/// </summary>
|
||||
public sealed class SqliteAuditWriterOptions
|
||||
{
|
||||
/// <summary>SQLite database path (or in-memory URI for tests).</summary>
|
||||
public string DatabasePath { get; set; } = "auditlog.db";
|
||||
|
||||
/// <summary>
|
||||
/// Capacity of the bounded write queue. Set high enough that ordinary
|
||||
/// script bursts never fill it; <see cref="System.Threading.Channels.BoundedChannelFullMode.Wait"/>
|
||||
/// applies when the writer falls behind.
|
||||
/// </summary>
|
||||
public int ChannelCapacity { get; set; } = 4096;
|
||||
|
||||
/// <summary>Max number of pending events the writer drains in one transaction.</summary>
|
||||
public int BatchSize { get; set; } = 256;
|
||||
|
||||
/// <summary>Soft flush interval the writer enforces when fewer than BatchSize events are queued.</summary>
|
||||
public int FlushIntervalMs { get; set; } = 50;
|
||||
}
|
||||
@@ -0,0 +1,236 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): translates per-attempt
|
||||
/// notifications from the store-and-forward retry loop into one (or two)
|
||||
/// <see cref="CachedCallTelemetry"/> packets and pushes them through
|
||||
/// <see cref="ICachedCallTelemetryForwarder"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The S&F loop's <see cref="ICachedCallLifecycleObserver"/> reports a
|
||||
/// single coarse outcome per attempt; the audit pipeline however models the
|
||||
/// lifecycle as TWO rows on terminal outcomes — an <c>Attempted</c>
|
||||
/// (<see cref="AuditKind.ApiCallCached"/> / <see cref="AuditKind.DbWriteCached"/>)
|
||||
/// row capturing the per-attempt mechanics, plus a <see cref="AuditKind.CachedResolve"/>
|
||||
/// row marking the terminal state for downstream consumers. The bridge fans
|
||||
/// out per outcome:
|
||||
/// </para>
|
||||
/// <list type="bullet">
|
||||
/// <item><description><c>TransientFailure</c> -> one Attempted(Failed) row.</description></item>
|
||||
/// <item><description><c>Delivered</c> -> Attempted(Delivered) + CachedResolve(Delivered).</description></item>
|
||||
/// <item><description><c>PermanentFailure</c> -> Attempted(Failed) + CachedResolve(Parked).</description></item>
|
||||
/// <item><description><c>ParkedMaxRetries</c> -> Attempted(Failed) + CachedResolve(Parked).</description></item>
|
||||
/// </list>
|
||||
/// <para>
|
||||
/// <b>Best-effort emission (alog.md §7):</b> the bridge itself never throws;
|
||||
/// the underlying forwarder swallows + logs its own failures.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CachedCallLifecycleBridge : ICachedCallLifecycleObserver
|
||||
{
|
||||
private readonly ICachedCallTelemetryForwarder _forwarder;
|
||||
private readonly ILogger<CachedCallLifecycleBridge> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// SourceNode-stamping (Task 14): the local node identity provider used to
|
||||
/// stamp <c>SiteCallOperational.SourceNode</c> on every cached-call
|
||||
/// lifecycle row this bridge emits. Optional — when null (legacy hosts /
|
||||
/// tests that don't register the provider) SourceNode stays null and
|
||||
/// central persists the <c>SiteCalls</c> row with SourceNode NULL.
|
||||
/// </summary>
|
||||
private readonly INodeIdentityProvider? _nodeIdentity;
|
||||
|
||||
/// <summary>Initializes a new <see cref="CachedCallLifecycleBridge"/> with the given telemetry forwarder, logger, and optional node identity provider.</summary>
|
||||
/// <param name="forwarder">The telemetry forwarder used to ship cached-call lifecycle events to central.</param>
|
||||
/// <param name="logger">Logger for bridge diagnostics.</param>
|
||||
/// <param name="nodeIdentity">Optional node identity provider used to stamp <c>SourceNode</c> on emitted telemetry rows.</param>
|
||||
public CachedCallLifecycleBridge(
|
||||
ICachedCallTelemetryForwarder forwarder,
|
||||
ILogger<CachedCallLifecycleBridge> logger,
|
||||
INodeIdentityProvider? nodeIdentity = null)
|
||||
{
|
||||
_forwarder = forwarder ?? throw new ArgumentNullException(nameof(forwarder));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_nodeIdentity = nodeIdentity;
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task OnAttemptCompletedAsync(
|
||||
CachedCallAttemptContext context, CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(context);
|
||||
|
||||
try
|
||||
{
|
||||
await EmitAttemptedAsync(context, ct).ConfigureAwait(false);
|
||||
|
||||
if (IsTerminal(context.Outcome))
|
||||
{
|
||||
await EmitResolveAsync(context, ct).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Defensive — both EmitX paths call the forwarder which is itself
|
||||
// best-effort. A throw here is unexpected, but the alog.md §7
|
||||
// contract requires we never propagate.
|
||||
_logger.LogWarning(ex,
|
||||
"CachedCallLifecycleBridge: unexpected throw for {TrackedOperationId} (Outcome {Outcome})",
|
||||
context.TrackedOperationId, context.Outcome);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task EmitAttemptedAsync(CachedCallAttemptContext context, CancellationToken ct)
|
||||
{
|
||||
// Per-attempt row: kind discriminates channel; status is always
|
||||
// Attempted regardless of outcome (success vs. failure is captured
|
||||
// by the companion HttpStatus / ErrorMessage fields, NOT by flipping
|
||||
// the status — CachedResolve carries the terminal Status). Per the
|
||||
// M3 brief and alog.md §4.
|
||||
var kind = ChannelToAttemptKind(context.Channel);
|
||||
var status = AuditStatus.Attempted;
|
||||
|
||||
var packet = BuildPacket(
|
||||
context,
|
||||
kind: kind,
|
||||
status: status,
|
||||
// Operational status mirror — for the per-attempt row the
|
||||
// operational state is the running status; the bridge always
|
||||
// writes "Attempted" so reconciliation can't roll back.
|
||||
operationalStatus: "Attempted",
|
||||
terminalAtUtc: null,
|
||||
lastError: context.LastError,
|
||||
httpStatus: context.HttpStatus);
|
||||
|
||||
await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private async Task EmitResolveAsync(CachedCallAttemptContext context, CancellationToken ct)
|
||||
{
|
||||
var (auditStatus, operationalStatus) = TerminalOutcomeToStatuses(context.Outcome);
|
||||
|
||||
var packet = BuildPacket(
|
||||
context,
|
||||
kind: AuditKind.CachedResolve,
|
||||
status: auditStatus,
|
||||
operationalStatus: operationalStatus,
|
||||
terminalAtUtc: context.OccurredAtUtc,
|
||||
lastError: context.LastError,
|
||||
httpStatus: context.HttpStatus);
|
||||
|
||||
await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private CachedCallTelemetry BuildPacket(
|
||||
CachedCallAttemptContext context,
|
||||
AuditKind kind,
|
||||
AuditStatus status,
|
||||
string operationalStatus,
|
||||
DateTime? terminalAtUtc,
|
||||
string? lastError,
|
||||
int? httpStatus)
|
||||
{
|
||||
var channel = ChannelStringToEnum(context.Channel);
|
||||
|
||||
return new CachedCallTelemetry(
|
||||
Audit: new AuditEvent
|
||||
{
|
||||
EventId = Guid.NewGuid(),
|
||||
OccurredAtUtc = DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
|
||||
Channel = channel,
|
||||
Kind = kind,
|
||||
CorrelationId = context.TrackedOperationId.Value,
|
||||
// Audit Log #23 (ExecutionId Task 4): the originating script
|
||||
// execution's per-run correlation id, threaded through the S&F
|
||||
// buffer; null on rows buffered before Task 4 (back-compat).
|
||||
ExecutionId = context.ExecutionId,
|
||||
// Audit Log #23 (ParentExecutionId Task 6): the spawning
|
||||
// inbound-API request's ExecutionId, threaded through the S&F
|
||||
// buffer alongside ExecutionId so the retry-loop cached rows
|
||||
// correlate back to the cross-execution chain. Null for a
|
||||
// non-routed run and on rows buffered before Task 6.
|
||||
ParentExecutionId = context.ParentExecutionId,
|
||||
SourceSiteId = string.IsNullOrEmpty(context.SourceSite) ? null : context.SourceSite,
|
||||
SourceInstanceId = context.SourceInstanceId,
|
||||
// Audit Log #23 (ExecutionId Task 4): SourceScript is now
|
||||
// threaded through the S&F buffer alongside ExecutionId — the
|
||||
// retry-loop cached rows carry the same provenance the
|
||||
// script-side cached rows do. Null on pre-Task-4 buffered rows.
|
||||
SourceScript = context.SourceScript,
|
||||
Target = context.Target,
|
||||
Status = status,
|
||||
HttpStatus = httpStatus,
|
||||
DurationMs = context.DurationMs,
|
||||
ErrorMessage = lastError,
|
||||
ForwardState = AuditForwardState.Pending,
|
||||
},
|
||||
Operational: new SiteCallOperational(
|
||||
TrackedOperationId: context.TrackedOperationId,
|
||||
Channel: context.Channel,
|
||||
Target: context.Target,
|
||||
SourceSite: context.SourceSite,
|
||||
// SourceNode-stamping (Task 14): the local cluster node name
|
||||
// (node-a/node-b on a site). Stamped from the injected
|
||||
// INodeIdentityProvider; null when no provider was wired so
|
||||
// central persists SiteCalls.SourceNode as NULL.
|
||||
SourceNode: _nodeIdentity?.NodeName,
|
||||
Status: operationalStatus,
|
||||
RetryCount: context.RetryCount,
|
||||
LastError: lastError,
|
||||
HttpStatus: httpStatus,
|
||||
CreatedAtUtc: DateTime.SpecifyKind(context.CreatedAtUtc, DateTimeKind.Utc),
|
||||
UpdatedAtUtc: DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
|
||||
TerminalAtUtc: terminalAtUtc is null
|
||||
? null
|
||||
: DateTime.SpecifyKind(terminalAtUtc.Value, DateTimeKind.Utc)));
|
||||
}
|
||||
|
||||
private static AuditKind ChannelToAttemptKind(string channel) => channel switch
|
||||
{
|
||||
"ApiOutbound" => AuditKind.ApiCallCached,
|
||||
"DbOutbound" => AuditKind.DbWriteCached,
|
||||
// Defensive default — the S&F observer is filtered to cached-call
|
||||
// categories so this branch shouldn't fire in practice.
|
||||
_ => AuditKind.ApiCallCached,
|
||||
};
|
||||
|
||||
private static AuditChannel ChannelStringToEnum(string channel) => channel switch
|
||||
{
|
||||
"ApiOutbound" => AuditChannel.ApiOutbound,
|
||||
"DbOutbound" => AuditChannel.DbOutbound,
|
||||
_ => AuditChannel.ApiOutbound,
|
||||
};
|
||||
|
||||
private static (AuditStatus auditStatus, string operationalStatus) TerminalOutcomeToStatuses(
|
||||
CachedCallAttemptOutcome outcome) => outcome switch
|
||||
{
|
||||
CachedCallAttemptOutcome.Delivered =>
|
||||
(AuditStatus.Delivered, "Delivered"),
|
||||
CachedCallAttemptOutcome.PermanentFailure =>
|
||||
(AuditStatus.Parked, "Parked"),
|
||||
CachedCallAttemptOutcome.ParkedMaxRetries =>
|
||||
(AuditStatus.Parked, "Parked"),
|
||||
// TransientFailure isn't terminal — see IsTerminal — but the switch
|
||||
// is exhaustive so we route it through Failed for safety.
|
||||
CachedCallAttemptOutcome.TransientFailure =>
|
||||
(AuditStatus.Failed, "Failed"),
|
||||
_ => (AuditStatus.Failed, "Failed"),
|
||||
};
|
||||
|
||||
private static bool IsTerminal(CachedCallAttemptOutcome outcome) => outcome switch
|
||||
{
|
||||
CachedCallAttemptOutcome.Delivered => true,
|
||||
CachedCallAttemptOutcome.PermanentFailure => true,
|
||||
CachedCallAttemptOutcome.ParkedMaxRetries => true,
|
||||
CachedCallAttemptOutcome.TransientFailure => false,
|
||||
_ => false,
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,194 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Site-side dual emitter for cached-call lifecycle telemetry (Audit Log #23 /
|
||||
/// M3). Sister to <see cref="SiteAuditTelemetryActor"/>: where the M2 actor
|
||||
/// drains audit-only events, this forwarder takes a combined
|
||||
/// <see cref="CachedCallTelemetry"/> packet and fans it out to the two
|
||||
/// site-local stores in a single call:
|
||||
/// <list type="bullet">
|
||||
/// <item><description>The <see cref="AuditEvent"/> row is written via
|
||||
/// <see cref="IAuditWriter"/> (the site <c>FallbackAuditWriter</c> +
|
||||
/// <c>SqliteAuditWriter</c> chain established in M2).</description></item>
|
||||
/// <item><description>The operational <see cref="SiteCallOperational"/> half
|
||||
/// updates the site-local <c>OperationTracking</c> SQLite store via
|
||||
/// <see cref="IOperationTrackingStore"/>, with the per-lifecycle method
|
||||
/// (<c>Enqueue</c> / <c>Attempt</c> / <c>Terminal</c>) selected from the
|
||||
/// audit row's <see cref="AuditKind"/>.</description></item>
|
||||
/// </list>
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Best-effort contract (alog.md §7):</b> a thrown writer OR a thrown
|
||||
/// tracking store must never propagate to the calling script. Both emission
|
||||
/// halves are wrapped in independent try/catch blocks so a SQLite outage on
|
||||
/// one side cannot starve the other — the failure is logged and the call
|
||||
/// returns normally.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Local-write only — the wire push is the drain actor's job.</b> This
|
||||
/// forwarder is deliberately synchronous against the two site-local SQLite
|
||||
/// stores and never pushes to central itself. The site→central transport is
|
||||
/// now live: <c>ClusterClientSiteAuditClient</c> is the production binding of
|
||||
/// <see cref="ISiteStreamAuditClient"/> on site roles (with
|
||||
/// <c>NoOpSiteStreamAuditClient</c> retained only for central/test composition
|
||||
/// roots). The push happens out-of-band: <see cref="SiteAuditTelemetryActor"/>
|
||||
/// sweeps the <c>AuditEvent</c> rows this forwarder wrote — they live in SQLite
|
||||
/// tagged <see cref="AuditForwardState.Pending"/> — and drains them to central
|
||||
/// via that client. A single drain loop therefore covers both the audit-only
|
||||
/// emissions and the cached-call emissions this forwarder produces.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
|
||||
{
|
||||
private readonly IAuditWriter _auditWriter;
|
||||
private readonly IOperationTrackingStore? _trackingStore;
|
||||
private readonly ILogger<CachedCallTelemetryForwarder> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// SourceNode-stamping (Task 14): local node identity provider used to
|
||||
/// stamp the tracking-store row's <c>SourceNode</c> column on
|
||||
/// <c>RecordEnqueueAsync</c>. Optional — when null (legacy / test hosts)
|
||||
/// the column stays NULL on the tracking row.
|
||||
/// </summary>
|
||||
private readonly INodeIdentityProvider? _nodeIdentity;
|
||||
|
||||
/// <summary>
|
||||
/// Construct the forwarder. <paramref name="trackingStore"/> is optional —
|
||||
/// when null only the audit half of the packet is emitted, which matches
|
||||
/// the M3 Bundle F composition-root contract on Central nodes: the
|
||||
/// AuditLog DI surface registers the forwarder unconditionally (mirroring
|
||||
/// the IAuditWriter chain) but the site-only tracking store has no central
|
||||
/// registration. Production site nodes wire both — the central lazy
|
||||
/// resolution is a no-op path kept symmetric with the M2 writer chain.
|
||||
/// </summary>
|
||||
/// <param name="auditWriter">Writer used to persist audit events from the telemetry packet.</param>
|
||||
/// <param name="trackingStore">Optional store for updating operation tracking state; null on central nodes.</param>
|
||||
/// <param name="logger">Logger for this forwarder.</param>
|
||||
/// <param name="nodeIdentity">Optional provider of the current node name stamped on emitted rows.</param>
|
||||
public CachedCallTelemetryForwarder(
|
||||
IAuditWriter auditWriter,
|
||||
IOperationTrackingStore? trackingStore,
|
||||
ILogger<CachedCallTelemetryForwarder> logger,
|
||||
INodeIdentityProvider? nodeIdentity = null)
|
||||
{
|
||||
_auditWriter = auditWriter ?? throw new ArgumentNullException(nameof(auditWriter));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_trackingStore = trackingStore;
|
||||
_nodeIdentity = nodeIdentity;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task ForwardAsync(CachedCallTelemetry telemetry, CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(telemetry);
|
||||
|
||||
// Independent try/catch — a thrown audit writer must not prevent the
|
||||
// tracking-store update from running (and vice-versa). Both halves
|
||||
// are best-effort.
|
||||
await TryEmitAuditAsync(telemetry, ct).ConfigureAwait(false);
|
||||
await TryEmitTrackingAsync(telemetry, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private async Task TryEmitAuditAsync(CachedCallTelemetry telemetry, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _auditWriter.WriteAsync(telemetry.Audit, ct).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// alog.md §7 best-effort contract — log and swallow. The audit
|
||||
// pipeline's own retry/recovery (RingBufferFallback in the
|
||||
// FallbackAuditWriter) handles transient writer failures upstream;
|
||||
// a throw bubbling up here means the writer's own swallow contract
|
||||
// failed, which is itself best-effort-handled.
|
||||
_logger.LogWarning(ex,
|
||||
"CachedCallTelemetryForwarder: audit emission threw for EventId {EventId} (Kind {Kind}, Status {Status})",
|
||||
telemetry.Audit.EventId, telemetry.Audit.Kind, telemetry.Audit.Status);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task TryEmitTrackingAsync(CachedCallTelemetry telemetry, CancellationToken ct)
|
||||
{
|
||||
if (_trackingStore is null)
|
||||
{
|
||||
// No site-local tracking store wired — Central composition root or
|
||||
// an integration-test host that skipped AddSiteRuntime. Emitting
|
||||
// through the audit half is still meaningful; the tracking half
|
||||
// is a no-op rather than an error.
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
switch (telemetry.Audit.Kind)
|
||||
{
|
||||
case AuditKind.CachedSubmit:
|
||||
// Enqueue — insert-if-not-exists with the operational
|
||||
// channel as the kind discriminator. RetryCount is fixed
|
||||
// at 0 by the tracking store's INSERT contract.
|
||||
// SourceNode-stamping (Task 14): stamp the local node
|
||||
// name (node-a/node-b) from the injected
|
||||
// INodeIdentityProvider; null when no provider was wired
|
||||
// so the tracking row's SourceNode column stays NULL.
|
||||
await _trackingStore.RecordEnqueueAsync(
|
||||
telemetry.Operational.TrackedOperationId,
|
||||
telemetry.Operational.Channel,
|
||||
telemetry.Operational.Target,
|
||||
telemetry.Audit.SourceInstanceId,
|
||||
telemetry.Audit.SourceScript,
|
||||
sourceNode: _nodeIdentity?.NodeName,
|
||||
ct).ConfigureAwait(false);
|
||||
break;
|
||||
|
||||
case AuditKind.ApiCallCached:
|
||||
case AuditKind.DbWriteCached:
|
||||
// Attempt — advance retry counter + last-error/HTTP-status.
|
||||
// Terminal rows are guarded by the store's WHERE clause.
|
||||
await _trackingStore.RecordAttemptAsync(
|
||||
telemetry.Operational.TrackedOperationId,
|
||||
telemetry.Operational.Status,
|
||||
telemetry.Operational.RetryCount,
|
||||
telemetry.Operational.LastError,
|
||||
telemetry.Operational.HttpStatus,
|
||||
ct).ConfigureAwait(false);
|
||||
break;
|
||||
|
||||
case AuditKind.CachedResolve:
|
||||
// Terminal — first-write-wins on the resolve flip.
|
||||
await _trackingStore.RecordTerminalAsync(
|
||||
telemetry.Operational.TrackedOperationId,
|
||||
telemetry.Operational.Status,
|
||||
telemetry.Operational.LastError,
|
||||
telemetry.Operational.HttpStatus,
|
||||
ct).ConfigureAwait(false);
|
||||
break;
|
||||
|
||||
default:
|
||||
// Defensive — only the four cached-lifecycle kinds are
|
||||
// expected on this path. Anything else is logged so a
|
||||
// mis-routed packet is visible but never crashes the
|
||||
// forwarder.
|
||||
_logger.LogWarning(
|
||||
"CachedCallTelemetryForwarder: unexpected audit kind {Kind} on tracking emission for EventId {EventId}",
|
||||
telemetry.Audit.Kind, telemetry.Audit.EventId);
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"CachedCallTelemetryForwarder: tracking-store emission threw for TrackedOperationId {Id} (Status {Status})",
|
||||
telemetry.Operational.TrackedOperationId, telemetry.Operational.Status);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
using Akka.Actor;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Production <see cref="ISiteStreamAuditClient"/> binding for site composition
|
||||
/// roots: pushes audit telemetry to central over Akka <c>ClusterClient</c> via
|
||||
/// the site's <c>SiteCommunicationActor</c>. The actor forwards the command to
|
||||
/// <c>/user/central-communication</c> and the central
|
||||
/// <c>CentralCommunicationActor</c> Asks the <c>AuditLogIngestActor</c> proxy —
|
||||
/// the same command/control transport notifications already use. Wired by the
|
||||
/// Host for site roles; central and test composition roots keep the
|
||||
/// <see cref="NoOpSiteStreamAuditClient"/> DI default (they have no
|
||||
/// <c>SiteCommunicationActor</c>).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Throw-on-failure contract.</b> An Ask timeout or a faulted reply
|
||||
/// (<see cref="Status.Failure"/>) propagates as a thrown exception out of the
|
||||
/// <c>Ingest*Async</c> methods — it is NOT caught and turned into an empty ack.
|
||||
/// The <see cref="SiteAuditTelemetryActor"/> drain loop treats a thrown
|
||||
/// exception as transient and leaves the rows <c>Pending</c> for the next tick.
|
||||
/// Swallowing the fault into an empty ack would be indistinguishable from "zero
|
||||
/// rows accepted" and would silently lose the retry signal. Task 1 confirmed
|
||||
/// the central receiving end does not collapse an ingest fault into an empty
|
||||
/// ack either, so a site-side Ask through the whole path faults cleanly on a
|
||||
/// central-side timeout.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// The batches arrive as proto DTOs (<see cref="AuditEventBatch"/> /
|
||||
/// <see cref="CachedTelemetryBatch"/>) because the
|
||||
/// <see cref="SiteAuditTelemetryActor"/> builds them with
|
||||
/// <see cref="AuditEventDtoMapper.ToDto"/>. This client converts them back into
|
||||
/// the <see cref="AuditEvent"/> / <see cref="SiteCall"/> entities the Akka
|
||||
/// command messages carry — the same DTO→entity translation the
|
||||
/// <c>SiteStreamGrpcServer</c> performs for the gRPC reconciliation path.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class ClusterClientSiteAuditClient : ISiteStreamAuditClient
|
||||
{
|
||||
private readonly IActorRef _siteCommunicationActor;
|
||||
private readonly TimeSpan _askTimeout;
|
||||
|
||||
/// <param name="siteCommunicationActor">
|
||||
/// The site's <c>SiteCommunicationActor</c> — it forwards the ingest command
|
||||
/// over the registered central ClusterClient and routes the reply back to
|
||||
/// this client's Ask.
|
||||
/// </param>
|
||||
/// <param name="askTimeout">
|
||||
/// Ask timeout for the round-trip to central. On expiry the Ask throws
|
||||
/// <see cref="Akka.Actor.AskTimeoutException"/>, which the drain loop treats
|
||||
/// as transient (rows stay <c>Pending</c>).
|
||||
/// </param>
|
||||
public ClusterClientSiteAuditClient(IActorRef siteCommunicationActor, TimeSpan askTimeout)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(siteCommunicationActor);
|
||||
_siteCommunicationActor = siteCommunicationActor;
|
||||
_askTimeout = askTimeout;
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(batch);
|
||||
|
||||
var events = new List<AuditEvent>(batch.Events.Count);
|
||||
foreach (var dto in batch.Events)
|
||||
{
|
||||
events.Add(AuditEventDtoMapper.FromDto(dto));
|
||||
}
|
||||
|
||||
// Ask<T> throws AskTimeoutException on timeout and rethrows a
|
||||
// Status.Failure's inner cause — both surface as a thrown exception so
|
||||
// the drain loop keeps the rows Pending. We deliberately do NOT catch.
|
||||
var reply = await _siteCommunicationActor
|
||||
.Ask<IngestAuditEventsReply>(new IngestAuditEventsCommand(events), _askTimeout, ct)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return ToAck(reply.AcceptedEventIds);
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(batch);
|
||||
|
||||
var entries = new List<CachedTelemetryEntry>(batch.Packets.Count);
|
||||
foreach (var packet in batch.Packets)
|
||||
{
|
||||
var audit = AuditEventDtoMapper.FromDto(packet.AuditEvent);
|
||||
var siteCall = SiteCallDtoMapper.FromDto(packet.Operational);
|
||||
entries.Add(new CachedTelemetryEntry(audit, siteCall));
|
||||
}
|
||||
|
||||
// Same throw-on-failure contract as IngestAuditEventsAsync. The reply
|
||||
// type is IngestCachedTelemetryReply (the central dual-write reply),
|
||||
// distinct from IngestAuditEventsReply.
|
||||
var reply = await _siteCommunicationActor
|
||||
.Ask<IngestCachedTelemetryReply>(new IngestCachedTelemetryCommand(entries), _askTimeout, ct)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return ToAck(reply.AcceptedEventIds);
|
||||
}
|
||||
|
||||
private static IngestAck ToAck(IReadOnlyList<Guid> acceptedEventIds)
|
||||
{
|
||||
var ack = new IngestAck();
|
||||
foreach (var id in acceptedEventIds)
|
||||
{
|
||||
ack.AcceptedEventIds.Add(id.ToString());
|
||||
}
|
||||
return ack;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Mockable abstraction over the central site-audit push surface that
|
||||
/// <see cref="SiteAuditTelemetryActor"/> uses to forward <see cref="AuditEventBatch"/>
|
||||
/// payloads. The production implementation is
|
||||
/// <see cref="ClusterClientSiteAuditClient"/> — a ClusterClient-based client,
|
||||
/// wired in the Host for site roles, that forwards batches to central via the
|
||||
/// site's <c>SiteCommunicationActor</c>. Unit tests substitute via NSubstitute
|
||||
/// against this interface so the actor never needs a live transport.
|
||||
/// </summary>
|
||||
public interface ISiteStreamAuditClient
|
||||
{
|
||||
/// <summary>
|
||||
/// Forwards <paramref name="batch"/> to the central audit-ingest path. The
|
||||
/// returned <see cref="IngestAck"/> carries the <c>accepted_event_ids</c>
|
||||
/// the actor will flip to
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
|
||||
/// in the site SQLite queue.
|
||||
/// </summary>
|
||||
/// <param name="batch">The batch of audit events to forward.</param>
|
||||
/// <param name="ct">Cancellation token for the operation.</param>
|
||||
Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct);
|
||||
|
||||
/// <summary>
|
||||
/// Forwards the combined <see cref="CachedTelemetryBatch"/> (Audit Log #23)
|
||||
/// to the central cached-telemetry ingest path. Each packet carries both the
|
||||
/// audit row and the operational <c>SiteCalls</c> upsert; central writes both
|
||||
/// in a single MS SQL transaction. Returns the same <see cref="IngestAck"/>
|
||||
/// shape as <see cref="IngestAuditEventsAsync"/> so the site-side forwarder
|
||||
/// can flip the underlying audit rows to
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
|
||||
/// once central has acknowledged them.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The production <see cref="ClusterClientSiteAuditClient"/> forwards over
|
||||
/// the ClusterClient transport; the <see cref="NoOpSiteStreamAuditClient"/>
|
||||
/// DI default (used by central and test composition roots) returns an empty
|
||||
/// ack so no rows are flipped.
|
||||
/// </remarks>
|
||||
/// <param name="batch">The batch of cached-call telemetry packets to forward.</param>
|
||||
/// <param name="ct">Cancellation token for the operation.</param>
|
||||
Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct);
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="ISiteStreamAuditClient"/> registered by
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.ServiceCollectionExtensions.AddAuditLog"/>.
|
||||
/// It is a no-op binding for composition roots that have no
|
||||
/// <c>SiteCommunicationActor</c> — central and test roots. Site roles override
|
||||
/// it in the Host with the ClusterClient-based
|
||||
/// <see cref="ClusterClientSiteAuditClient"/>, which actually forwards audit
|
||||
/// telemetry to central.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Returns an empty <see cref="IngestAck"/> so the
|
||||
/// <see cref="SiteAuditTelemetryActor"/> doesn't flip any rows to
|
||||
/// <c>Forwarded</c> when this NoOp is in effect — rows stay <c>Pending</c>
|
||||
/// until a real client (or a test stub) takes over.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Audit-write paths are best-effort by contract: a NoOp client keeps the
|
||||
/// host running cleanly and is consistent with "audit-write failures never
|
||||
/// abort the user-facing action".
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class NoOpSiteStreamAuditClient : ISiteStreamAuditClient
|
||||
{
|
||||
private static readonly IngestAck EmptyAck = new();
|
||||
|
||||
/// <inheritdoc/>
|
||||
public Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(batch);
|
||||
// Empty ack — no EventIds will be flipped to Forwarded, so rows stay
|
||||
// Pending until the real ClusterClientSiteAuditClient (or a test stub)
|
||||
// takes over.
|
||||
return Task.FromResult(EmptyAck);
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(batch);
|
||||
// Empty ack — same rationale as IngestAuditEventsAsync. The site still
|
||||
// writes the audit + tracking rows to its SQLite stores authoritatively;
|
||||
// central-side state only materialises once the real
|
||||
// ClusterClientSiteAuditClient (or a test stub) is wired in.
|
||||
return Task.FromResult(EmptyAck);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,464 @@
|
||||
using Akka.Actor;
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Site-side actor that drains the local SQLite audit queue and pushes Pending
|
||||
/// rows to central via two parallel transports:
|
||||
/// <list type="bullet">
|
||||
/// <item><description><c>IngestAuditEvents</c> for the audit-only path —
|
||||
/// sync ApiCall/DbWrite, NotifySend, InboundRequest and similar single-row
|
||||
/// lifecycle events.</description></item>
|
||||
/// <item><description><c>IngestCachedTelemetry</c> for the combined-telemetry
|
||||
/// path — cached-call lifecycle rows (<c>CachedSubmit</c>,
|
||||
/// <c>ApiCallCached</c>/<c>DbWriteCached</c>, <c>CachedResolve</c>) joined
|
||||
/// with the matching <c>OperationTracking</c> row, written at central as a
|
||||
/// single dual-write transaction (AuditLog + SiteCalls).</description></item>
|
||||
/// </list>
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The drain self-ticks via two private messages — <c>Drain</c> for the
|
||||
/// audit-only path and <c>CachedDrain</c> for the combined path — each
|
||||
/// scheduled independently. Cadence is options-driven:
|
||||
/// <c>BusyIntervalSeconds</c> when the previous drain found rows (or faulted —
|
||||
/// we want quick recovery), <c>IdleIntervalSeconds</c> when the queue was empty.
|
||||
/// The two drains share the same cadence configuration but advance their own
|
||||
/// timers so a stall on one path does not block the other.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Collaborators are injected as interfaces (<see cref="ISiteAuditQueue"/>,
|
||||
/// <see cref="ISiteStreamAuditClient"/>, optional
|
||||
/// <see cref="IOperationTrackingStore"/>) so unit tests substitute with
|
||||
/// NSubstitute and never touch real SQLite or gRPC. The
|
||||
/// <see cref="IOperationTrackingStore"/> is optional — central composition
|
||||
/// roots and tests that don't exercise the cached path can leave it null, in
|
||||
/// which case the cached-drain scheduler is never armed.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per Bundle D's brief, audit-write paths must be fail-safe — a thrown
|
||||
/// exception inside the actor MUST NOT crash it. Both Drain handlers wrap
|
||||
/// their pipelines in a top-level try/catch that logs and re-schedules; the
|
||||
/// actor's <see cref="SupervisorStrategy"/> defaults to
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultStrategy"/>'s Restart for
|
||||
/// child actors — but this actor has no children, so the catch is what
|
||||
/// matters.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// AuditLog-001: wires the previously-unreachable combined-telemetry transport.
|
||||
/// Prior to this the cached audit rows flowed through the audit-only drain via
|
||||
/// <c>IngestAuditEventsAsync</c> and the central <c>OnCachedTelemetryAsync</c>
|
||||
/// dual-write handler was dead production code; the operational <c>SiteCalls</c>
|
||||
/// half was never sent to central.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class SiteAuditTelemetryActor : ReceiveActor
|
||||
{
|
||||
private readonly ISiteAuditQueue _queue;
|
||||
private readonly ISiteStreamAuditClient _client;
|
||||
private readonly IOperationTrackingStore? _trackingStore;
|
||||
private readonly SiteAuditTelemetryOptions _options;
|
||||
private readonly ILogger<SiteAuditTelemetryActor> _logger;
|
||||
private ICancelable? _pendingTick;
|
||||
private ICancelable? _pendingCachedTick;
|
||||
// AuditLog-010: per-actor lifecycle CTS so an in-flight drain (queue read,
|
||||
// gRPC push, mark-forwarded write) is actually cancelled when the actor is
|
||||
// stopped — without it, a stuck IngestAuditEventsAsync would hold the
|
||||
// continuation through CoordinatedShutdown's actor-system terminate window.
|
||||
// Cancelled in PostStop; never reset (the actor is single-lifetime).
|
||||
// The same CTS gates the cached-drain pipeline (queue read + tracking
|
||||
// lookup + gRPC push) so both paths observe shutdown cooperatively.
|
||||
private readonly CancellationTokenSource _lifecycleCts = new();
|
||||
|
||||
/// <summary>Initializes the actor with its drain queue, gRPC client, options, and logger.</summary>
|
||||
/// <param name="queue">The site-local SQLite audit queue to drain.</param>
|
||||
/// <param name="client">The gRPC client used to push audit events to central.</param>
|
||||
/// <param name="options">Telemetry options controlling drain intervals and batch size.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="trackingStore">
|
||||
/// Optional site-local operation tracking store. When supplied the actor
|
||||
/// runs the combined-telemetry cached-drain in parallel with the audit-only
|
||||
/// drain; when null (central composition roots, tests that don't exercise
|
||||
/// cached calls) the cached scheduler is never armed and only the
|
||||
/// audit-only drain runs.
|
||||
/// </param>
|
||||
public SiteAuditTelemetryActor(
|
||||
ISiteAuditQueue queue,
|
||||
ISiteStreamAuditClient client,
|
||||
IOptions<SiteAuditTelemetryOptions> options,
|
||||
ILogger<SiteAuditTelemetryActor> logger,
|
||||
IOperationTrackingStore? trackingStore = null)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(queue);
|
||||
ArgumentNullException.ThrowIfNull(client);
|
||||
ArgumentNullException.ThrowIfNull(options);
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
|
||||
_queue = queue;
|
||||
_client = client;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
_trackingStore = trackingStore;
|
||||
|
||||
ReceiveAsync<Drain>(_ => OnDrainAsync());
|
||||
ReceiveAsync<CachedDrain>(_ => OnCachedDrainAsync());
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PreStart()
|
||||
{
|
||||
base.PreStart();
|
||||
// Initial ticks fire on the busy interval so both drains start polling
|
||||
// soon after host startup. A subsequent empty drain will move to the
|
||||
// idle interval naturally.
|
||||
ScheduleNext(TimeSpan.FromSeconds(_options.BusyIntervalSeconds));
|
||||
if (_trackingStore is not null)
|
||||
{
|
||||
ScheduleNextCached(TimeSpan.FromSeconds(_options.BusyIntervalSeconds));
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void PostStop()
|
||||
{
|
||||
_pendingTick?.Cancel();
|
||||
_pendingCachedTick?.Cancel();
|
||||
// AuditLog-010: cancel any in-flight drain so a stuck queue read or
|
||||
// gRPC push does not hold the continuation past actor stop.
|
||||
try
|
||||
{
|
||||
_lifecycleCts.Cancel();
|
||||
}
|
||||
catch (ObjectDisposedException)
|
||||
{
|
||||
// PostStop may run after a prior Dispose path — benign.
|
||||
}
|
||||
_lifecycleCts.Dispose();
|
||||
base.PostStop();
|
||||
}
|
||||
|
||||
private async Task OnDrainAsync()
|
||||
{
|
||||
var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
|
||||
// AuditLog-010: route every async dependency call through the
|
||||
// per-actor lifecycle token so PostStop cancellation actually
|
||||
// propagates into the queue read, the gRPC push, and the
|
||||
// mark-forwarded write. OperationCanceledException is swallowed by
|
||||
// the catch-all below.
|
||||
var ct = _lifecycleCts.Token;
|
||||
try
|
||||
{
|
||||
var pending = await _queue.ReadPendingAsync(_options.BatchSize, ct)
|
||||
.ConfigureAwait(false);
|
||||
if (pending.Count == 0)
|
||||
{
|
||||
// No rows — settle into the idle cadence until the next write
|
||||
// bumps us back into the busy cadence.
|
||||
nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
|
||||
return;
|
||||
}
|
||||
|
||||
var batch = BuildBatch(pending);
|
||||
|
||||
IngestAck ack;
|
||||
try
|
||||
{
|
||||
ack = await _client.IngestAuditEventsAsync(batch, ct)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// gRPC fault — leave the rows in Pending so the next drain
|
||||
// retries. Bundle D's brief: "On gRPC exception (any), log
|
||||
// Warning, schedule next Drain in BusyIntervalSeconds."
|
||||
_logger.LogWarning(ex,
|
||||
"IngestAuditEvents push failed for {Count} pending events; will retry next drain.",
|
||||
pending.Count);
|
||||
return;
|
||||
}
|
||||
|
||||
var acceptedIds = ParseAcceptedIds(ack);
|
||||
if (acceptedIds.Count > 0)
|
||||
{
|
||||
await _queue.MarkForwardedAsync(acceptedIds, ct)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Catch-all so a SQLite hiccup or mapper bug never crashes the
|
||||
// actor. The next tick is still scheduled in the finally block.
|
||||
_logger.LogError(ex, "Unexpected error during audit-log telemetry drain.");
|
||||
}
|
||||
finally
|
||||
{
|
||||
// AuditLog-010: if the actor is already shutting down, do not
|
||||
// arm another tick — the scheduler would fire after PostStop and
|
||||
// the message would land in dead letters.
|
||||
if (!_lifecycleCts.IsCancellationRequested)
|
||||
{
|
||||
ScheduleNext(nextDelay);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// AuditLog-001: combined-telemetry drain. Reads cached-lifecycle audit
|
||||
/// rows, joins each with the matching <see cref="IOperationTrackingStore"/>
|
||||
/// snapshot, builds a <see cref="CachedTelemetryBatch"/>, and pushes via
|
||||
/// <see cref="ISiteStreamAuditClient.IngestCachedTelemetryAsync"/>. Rows
|
||||
/// whose tracking snapshot is missing (race with retention purge / late
|
||||
/// audit row) are logged + skipped — the operational half will be
|
||||
/// re-emitted on the next lifecycle event, and the audit row stays
|
||||
/// <see cref="Commons.Types.Enums.AuditForwardState.Pending"/> so a later
|
||||
/// drain (or reconciliation pull) can revisit it.
|
||||
/// </summary>
|
||||
private async Task OnCachedDrainAsync()
|
||||
{
|
||||
var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
|
||||
var ct = _lifecycleCts.Token;
|
||||
try
|
||||
{
|
||||
// _trackingStore is non-null by construction here — the cached
|
||||
// scheduler is only armed when it was supplied (see PreStart).
|
||||
// Defensive check kept for clarity and to silence the compiler's
|
||||
// null-flow analysis.
|
||||
if (_trackingStore is null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
var pending = await _queue
|
||||
.ReadPendingCachedTelemetryAsync(_options.BatchSize, ct)
|
||||
.ConfigureAwait(false);
|
||||
if (pending.Count == 0)
|
||||
{
|
||||
nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
|
||||
return;
|
||||
}
|
||||
|
||||
var batch = new CachedTelemetryBatch();
|
||||
var emittedEventIds = new List<Guid>(pending.Count);
|
||||
|
||||
foreach (var auditRow in pending)
|
||||
{
|
||||
if (auditRow.CorrelationId is null)
|
||||
{
|
||||
// CorrelationId carries the TrackedOperationId for cached
|
||||
// rows — see CachedCallLifecycleBridge.BuildPacket. Without
|
||||
// it we can't look up the tracking row; log + skip so the
|
||||
// bad row doesn't block the rest of the batch. The audit
|
||||
// row stays Pending (still not in emittedEventIds) and
|
||||
// central reconciliation will pick it up.
|
||||
_logger.LogWarning(
|
||||
"Cached-telemetry drain: audit row {EventId} ({Kind}) has no CorrelationId; skipping.",
|
||||
auditRow.EventId, auditRow.Kind);
|
||||
continue;
|
||||
}
|
||||
|
||||
TrackingStatusSnapshot? snapshot;
|
||||
try
|
||||
{
|
||||
snapshot = await _trackingStore
|
||||
.GetStatusAsync(new TrackedOperationId(auditRow.CorrelationId.Value), ct)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// A tracking-store throw must NOT abort the rest of the
|
||||
// batch — the audit half is best-effort. Log and skip
|
||||
// this row; it stays Pending for the next drain.
|
||||
_logger.LogWarning(ex,
|
||||
"Cached-telemetry drain: tracking lookup threw for {EventId} (TrackedOperationId {Tid}); skipping.",
|
||||
auditRow.EventId, auditRow.CorrelationId);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (snapshot is null)
|
||||
{
|
||||
// No tracking row — possible if the audit row is older
|
||||
// than the tracking retention window, or the tracking
|
||||
// store was reset. The audit half remains valid and will
|
||||
// be picked up by central reconciliation; skip the
|
||||
// combined push for this row.
|
||||
_logger.LogWarning(
|
||||
"Cached-telemetry drain: no tracking snapshot for {EventId} (TrackedOperationId {Tid}); skipping.",
|
||||
auditRow.EventId, auditRow.CorrelationId);
|
||||
continue;
|
||||
}
|
||||
|
||||
var packet = BuildCachedPacket(auditRow, snapshot);
|
||||
batch.Packets.Add(packet);
|
||||
emittedEventIds.Add(auditRow.EventId);
|
||||
}
|
||||
|
||||
if (batch.Packets.Count == 0)
|
||||
{
|
||||
// Every row in this read was skipped (no CorrelationId / no
|
||||
// tracking snapshot). Leave them Pending and try again next
|
||||
// drain — the underlying race normally resolves on its own.
|
||||
return;
|
||||
}
|
||||
|
||||
IngestAck ack;
|
||||
try
|
||||
{
|
||||
ack = await _client.IngestCachedTelemetryAsync(batch, ct)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"IngestCachedTelemetry push failed for {Count} cached events; will retry next drain.",
|
||||
batch.Packets.Count);
|
||||
return;
|
||||
}
|
||||
|
||||
var acceptedIds = ParseAcceptedIds(ack);
|
||||
if (acceptedIds.Count > 0)
|
||||
{
|
||||
await _queue.MarkForwardedAsync(acceptedIds, ct)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unexpected error during cached-telemetry drain.");
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (!_lifecycleCts.IsCancellationRequested && _trackingStore is not null)
|
||||
{
|
||||
ScheduleNextCached(nextDelay);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static AuditEventBatch BuildBatch(IReadOnlyList<AuditEvent> events)
|
||||
{
|
||||
var batch = new AuditEventBatch();
|
||||
foreach (var e in events)
|
||||
{
|
||||
batch.Events.Add(AuditEventDtoMapper.ToDto(e));
|
||||
}
|
||||
return batch;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// AuditLog-001: build the combined wire packet from one cached audit row
|
||||
/// + its matching operational tracking snapshot. The operational state
|
||||
/// reflects the latest tracking row at emission time (not the per-event
|
||||
/// status the audit row implies) because central's <c>SiteCalls</c>
|
||||
/// upsert is monotonic — it never rolls back. The audit row preserves
|
||||
/// per-event lifecycle granularity for the audit trail.
|
||||
/// </summary>
|
||||
private static CachedTelemetryPacket BuildCachedPacket(
|
||||
AuditEvent auditRow, TrackingStatusSnapshot snapshot)
|
||||
{
|
||||
var sourceSite = auditRow.SourceSiteId ?? string.Empty;
|
||||
// Channel string form mirrors the AuditChannel-to-string convention used
|
||||
// by SiteCallOperational + CachedCallLifecycleBridge.BuildPacket.
|
||||
var channelString = auditRow.Channel.ToString();
|
||||
var target = auditRow.Target ?? snapshot.TargetSummary ?? string.Empty;
|
||||
|
||||
var operationalDto = new SiteCallOperationalDto
|
||||
{
|
||||
TrackedOperationId = snapshot.Id.Value.ToString("D"),
|
||||
Channel = channelString,
|
||||
Target = target,
|
||||
SourceSite = sourceSite,
|
||||
SourceNode = snapshot.SourceNode ?? string.Empty,
|
||||
Status = snapshot.Status,
|
||||
RetryCount = snapshot.RetryCount,
|
||||
LastError = snapshot.LastError ?? string.Empty,
|
||||
CreatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.CreatedAtUtc)),
|
||||
UpdatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.UpdatedAtUtc)),
|
||||
};
|
||||
if (snapshot.HttpStatus.HasValue)
|
||||
{
|
||||
operationalDto.HttpStatus = snapshot.HttpStatus.Value;
|
||||
}
|
||||
if (snapshot.TerminalAtUtc.HasValue)
|
||||
{
|
||||
operationalDto.TerminalAtUtc =
|
||||
Timestamp.FromDateTime(EnsureUtc(snapshot.TerminalAtUtc.Value));
|
||||
}
|
||||
|
||||
return new CachedTelemetryPacket
|
||||
{
|
||||
AuditEvent = AuditEventDtoMapper.ToDto(auditRow),
|
||||
Operational = operationalDto,
|
||||
};
|
||||
}
|
||||
|
||||
private static DateTime EnsureUtc(DateTime value) =>
|
||||
value.Kind == DateTimeKind.Utc
|
||||
? value
|
||||
: DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc);
|
||||
|
||||
private static IReadOnlyList<Guid> ParseAcceptedIds(IngestAck ack)
|
||||
{
|
||||
if (ack.AcceptedEventIds.Count == 0)
|
||||
{
|
||||
return Array.Empty<Guid>();
|
||||
}
|
||||
|
||||
var list = new List<Guid>(ack.AcceptedEventIds.Count);
|
||||
foreach (var raw in ack.AcceptedEventIds)
|
||||
{
|
||||
if (Guid.TryParse(raw, out var id))
|
||||
{
|
||||
list.Add(id);
|
||||
}
|
||||
// Malformed ids are ignored — central should never emit them, but
|
||||
// we refuse to crash the actor over a bad string.
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
private void ScheduleNext(TimeSpan delay)
|
||||
{
|
||||
_pendingTick?.Cancel();
|
||||
_pendingTick = Context.System.Scheduler.ScheduleTellOnceCancelable(
|
||||
delay,
|
||||
Self,
|
||||
Drain.Instance,
|
||||
Self);
|
||||
}
|
||||
|
||||
private void ScheduleNextCached(TimeSpan delay)
|
||||
{
|
||||
_pendingCachedTick?.Cancel();
|
||||
_pendingCachedTick = Context.System.Scheduler.ScheduleTellOnceCancelable(
|
||||
delay,
|
||||
Self,
|
||||
CachedDrain.Instance,
|
||||
Self);
|
||||
}
|
||||
|
||||
/// <summary>Self-tick message that triggers an audit-only drain cycle.</summary>
|
||||
private sealed class Drain
|
||||
{
|
||||
public static readonly Drain Instance = new();
|
||||
private Drain() { }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Self-tick message that triggers a combined-telemetry drain cycle.
|
||||
/// AuditLog-001: introduced alongside the cached-drain to keep the two
|
||||
/// paths' cadences independent — a stall on one does not block the other.
|
||||
/// </summary>
|
||||
private sealed class CachedDrain
|
||||
{
|
||||
public static readonly CachedDrain Instance = new();
|
||||
private CachedDrain() { }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Tuning knobs for the site-side <see cref="SiteAuditTelemetryActor"/> drain
|
||||
/// loop. Defaults mirror Bundle D's plan: drain every 5 s while rows are
|
||||
/// flowing (busy), every 30 s when the queue is empty (idle).
|
||||
/// </summary>
|
||||
public sealed class SiteAuditTelemetryOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Maximum number of <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
|
||||
/// rows read from the site SQLite queue and pushed in a single gRPC batch.
|
||||
/// </summary>
|
||||
public int BatchSize { get; set; } = 256;
|
||||
|
||||
/// <summary>
|
||||
/// Delay between drains when the previous drain found at least one Pending
|
||||
/// row OR the previous push faulted. Re-drain quickly to keep telemetry
|
||||
/// flowing and to retry transient gRPC errors.
|
||||
/// </summary>
|
||||
public int BusyIntervalSeconds { get; set; } = 5;
|
||||
|
||||
/// <summary>
|
||||
/// Delay between drains when the previous drain found no Pending rows.
|
||||
/// Longer interval avoids hammering an idle SQLite + gRPC channel.
|
||||
/// </summary>
|
||||
public int IdleIntervalSeconds { get; set; } = 30;
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!-- Bundle D D1: SiteAuditTelemetryActor + (D2) AuditLogIngestActor live
|
||||
in this project, so Akka is an explicit dependency. -->
|
||||
<PackageReference Include="Akka" />
|
||||
<PackageReference Include="Microsoft.Data.Sqlite" />
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
|
||||
<!-- Audit Log (#23) sits alongside Notification Outbox (#21) and Site Call Audit (#22).
|
||||
IAuditLogRepository is registered by ZB.MOM.WW.ScadaBridge.ConfigurationDatabase; the project
|
||||
reference is documented here so M2 writers + telemetry actors can depend on it. -->
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.csproj" />
|
||||
<!-- Communication carries the IngestAuditEvents proto + DTOs (#23 M2 site sync). -->
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Communication/ZB.MOM.WW.ScadaBridge.Communication.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.AuditLog.Tests" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
Reference in New Issue
Block a user