diff --git a/docs/plans/2026-05-20-auditlog-m6-reconciliation-purge.md b/docs/plans/2026-05-20-auditlog-m6-reconciliation-purge.md new file mode 100644 index 0000000..aebaf93 --- /dev/null +++ b/docs/plans/2026-05-20-auditlog-m6-reconciliation-purge.md @@ -0,0 +1,19 @@ +# Audit Log #23 — M6 Reconciliation + Purge + Partition Maintenance + Health Metrics + +> **For Claude:** subagent-driven-development with bundled cadence. + +**Goal:** Self-healing telemetry (5-min reconciliation pull), monthly partition rollover, daily partition-switch purge with drop-and-rebuild around UX_AuditLog_EventId, all five health metrics live (SiteAuditBacklog, SiteAuditWriteFailures, SiteAuditTelemetryStalled, CentralAuditWriteFailures, AuditRedactionFailure). + +**M5 realities baked in:** AuditRedactionFailure counter is site-only — M6-T9 surfaces it centrally. SwitchOutPartitionAsync ships as NotSupportedException stub from M1; M6-T4 replaces it with the drop-DROP-INDEX → SWITCH PARTITION → DROP staging → CREATE UNIQUE NONCLUSTERED INDEX dance. Partition function pre-seeded Jan 2026 – Dec 2027; M6-T5 SPLITs new boundaries forward. + +**Bundles:** +- Bundle A — Proto + site handler (T1, T2) +- Bundle B — Reconciliation actor (T3) +- Bundle C — Purge actor + drop-and-rebuild repository fix (T4) +- Bundle D — Partition maintenance hosted service (T5) +- Bundle E — Health metrics (T6, T7, T8, T9) +- Bundle F — Integration tests (T10, T11, T12) + +Final cross-bundle review + merge. + +**Note**: M2 noted NoOpSiteStreamAuditClient stays in production until "M6 wires the real client". M6-T1+T2 add the PULL RPC; the actual production PUSH client (real implementation of ISiteStreamAuditClient.IngestAuditEventsAsync + IngestCachedTelemetryAsync) is the bigger lift. M6 will add the real client IF feasible within scope OR defer to a follow-up. Decision: try in Bundle A (alongside the proto extension); if scope blows up, the NoOp stays. diff --git a/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs b/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs new file mode 100644 index 0000000..e728c51 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs @@ -0,0 +1,80 @@ +using System.Collections.Concurrent; +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T8, T9) — central singleton implementation of +/// . Owns thread-safe +/// counters for +/// CentralAuditWriteFailures + AuditRedactionFailure and a +/// per-site latched stalled-state map fed by the +/// . Also implements the +/// writer surfaces ( + +/// ) so a single concrete object +/// is the source of truth — DI binds those two interfaces to this same +/// singleton instance on the central composition root. +/// +/// +/// +/// Why one type for read + write. The writer interfaces are tiny +/// (Increment()) and the read surface needs visibility of those +/// counters anyway — having a single class own both means the +/// Interlocked field IS the snapshot value, no extra plumbing needed. +/// Mirrors the +/// pattern where +/// the collector both receives and exposes the metric. +/// +/// +/// Stalled-state plumbing. The per-site stalled latch lives directly +/// on this snapshot. is the +/// EventStream subscriber that pushes +/// publications in via +/// . Keeping the dictionary on this type (rather +/// than reading the tracker on every access) lets the snapshot be constructed +/// without an dependency — the tracker +/// is wired up later from the Akka bootstrap, once the system is built. +/// +/// +public sealed class AuditCentralHealthSnapshot + : IAuditCentralHealthSnapshot, + ICentralAuditWriteFailureCounter, + IAuditRedactionFailureCounter +{ + private int _centralAuditWriteFailures; + private int _auditRedactionFailure; + private readonly ConcurrentDictionary _stalled = new(); + + /// + public int CentralAuditWriteFailures => + Interlocked.CompareExchange(ref _centralAuditWriteFailures, 0, 0); + + /// + public int AuditRedactionFailure => + Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0); + + /// + public IReadOnlyDictionary SiteAuditTelemetryStalled => + new Dictionary(_stalled); + + /// + /// Apply a publication + /// observed by . Public + /// so the tracker (which lives in the same assembly but is constructed + /// later from the Akka host) can push without a friend reference; + /// readers should call . + /// + public void ApplyStalled(SiteAuditTelemetryStalledChanged evt) + { + if (evt is null) return; + _stalled[evt.SiteId] = evt.Stalled; + } + + /// + void ICentralAuditWriteFailureCounter.Increment() => + Interlocked.Increment(ref _centralAuditWriteFailures); + + /// + void IAuditRedactionFailureCounter.Increment() => + Interlocked.Increment(ref _auditRedactionFailure); +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs b/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs index 8e7f21b..61a6daf 100644 --- a/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs +++ b/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs @@ -124,6 +124,7 @@ public class AuditLogIngestActor : ReceiveActor IServiceScope? scope = null; IAuditLogRepository repository; IAuditPayloadFilter? filter = null; + ICentralAuditWriteFailureCounter? failureCounter = null; if (_injectedRepository is not null) { repository = _injectedRepository; @@ -133,6 +134,10 @@ public class AuditLogIngestActor : ReceiveActor scope = _serviceProvider!.CreateScope(); repository = scope.ServiceProvider.GetRequiredService(); filter = scope.ServiceProvider.GetService(); + // M6 Bundle E (T8): central health counter is best-effort — + // unregistered (test composition roots) means the per-row catch + // simply logs without surfacing on the health dashboard. + failureCounter = scope.ServiceProvider.GetService(); } try @@ -157,6 +162,10 @@ public class AuditLogIngestActor : ReceiveActor { // Per-row catch — one bad row never sinks the whole batch. // The row stays Pending at the site; the next drain retries. + // M6 Bundle E (T8): bump the central health counter so a + // sustained insert-throw failure surfaces on the dashboard. + try { failureCounter?.Increment(); } + catch { /* counter must never throw — defence in depth */ } _logger.LogError(ex, "Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.", evt.EventId); @@ -204,6 +213,10 @@ public class AuditLogIngestActor : ReceiveActor // never throw, so we can apply it inside the per-entry try // without risking an unbounded blast radius. var filter = scope.ServiceProvider.GetService(); + // M6 Bundle E (T8): same best-effort central health counter as + // the OnIngestAsync path — null on test composition roots that + // skip the registration. + var failureCounter = scope.ServiceProvider.GetService(); foreach (var entry in cmd.Entries) { @@ -240,6 +253,10 @@ public class AuditLogIngestActor : ReceiveActor // EventId is NOT added to `accepted` so the site keeps its // row Pending and retries on the next drain. Other entries // in the batch continue with their own transactions. + // M6 Bundle E (T8): bump the central health counter so a + // sustained dual-write failure surfaces on the dashboard. + try { failureCounter?.Increment(); } + catch { /* counter must never throw — defence in depth */ } _logger.LogError( ex, "Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.", diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceOptions.cs b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceOptions.cs new file mode 100644 index 0000000..317e6e7 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceOptions.cs @@ -0,0 +1,37 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Tuning knobs for the central +/// hosted service (M6-T5). +/// Defaults: once every 24 hours, keep at least one future monthly +/// boundary ahead of . +/// +/// +/// +/// The hosted service drives a daily roll-forward of +/// pf_AuditLog_Month: each tick reads the current max boundary and +/// SPLITs new monthly boundaries until at least +/// future months are covered. The 1-month +/// default is intentionally conservative — anything less risks an +/// end-of-month race where inserts land in the unbounded tail partition; +/// anything more wastes nothing but represents premature commitment. +/// +/// +/// The 24-hour cadence is the cheapest interval that still guarantees +/// at-most-one missed boundary in steady state (even a hard failover the +/// hosted service can recover on its very next tick). Lowering this below +/// an hour would generate more metadata churn than it saves. +/// +/// +public sealed class AuditLogPartitionMaintenanceOptions +{ + /// Period of the maintenance tick in seconds (default 86 400 = 24 h). + public int IntervalSeconds { get; set; } = 86_400; + + /// + /// Minimum number of future months that pf_AuditLog_Month must + /// cover after each tick. Default 1 — i.e. as of mid-May the partition + /// for the next full month (June) must already be present. + /// + public int LookaheadMonths { get; set; } = 1; +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs new file mode 100644 index 0000000..2aa02f8 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs @@ -0,0 +1,145 @@ +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using ScadaLink.Commons.Interfaces; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Central (M6-T5, Bundle D) that rolls +/// pf_AuditLog_Month forward once a day. Each tick opens a fresh DI +/// scope, resolves , and calls +/// to SPLIT any +/// missing future boundaries — the partition function must always cover at +/// least +/// future months, otherwise inserts past the highest boundary accumulate in +/// a single unbounded tail partition that SwitchOutPartitionAsync +/// cannot purge cleanly. +/// +/// +/// +/// Why a hosted service, not an actor. Bundle C's +/// sits inside the central singleton +/// because it needs supervised lifecycle alongside the rest of the +/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day +/// chore with no cross-actor coordination, so we use the much simpler +/// hosted-service pattern: Task.Run on start, Task.Delay +/// between ticks, cancellation on stop. Reusing +/// from the central node-only DI graph +/// keeps the contract testable without any actor framework involvement. +/// +/// +/// Failure containment. The tick body wraps the maintenance call in +/// a try/catch so a transient SQL Server error never tears down the hosted +/// service — the next tick simply retries. The exception is logged with +/// the original stack trace at Error level; ops surfaces (M6 Bundle +/// E's central health collector) can subscribe to the logger to alert on +/// repeated failures. +/// +/// +/// Startup ordering. A first tick fires immediately at +/// so a fresh deployment doesn't need to wait +/// for +/// the partition function to come up to spec. This is also what the brief +/// asks for ("Run once on startup"). +/// +/// +/// DI scope per tick. is scoped +/// (alongside the rest of the EF repositories) because the implementation +/// reuses the per-scope ScadaLinkDbContext. A hosted service is a +/// singleton, so it must open and dispose a scope around each tick — the +/// same pattern uses. +/// +/// +public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable +{ + private readonly IServiceScopeFactory _scopeFactory; + private readonly IOptions _options; + private readonly ILogger _logger; + private CancellationTokenSource? _cts; + private Task? _loop; + + public AuditLogPartitionMaintenanceService( + IServiceScopeFactory scopeFactory, + IOptions options, + ILogger logger) + { + _scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory)); + _options = options ?? throw new ArgumentNullException(nameof(options)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + } + + /// + public Task StartAsync(CancellationToken ct) + { + // Linked CTS lets StopAsync's cancellation AND the host's shutdown + // token both terminate the loop; either side firing aborts the + // pending Task.Delay. + _cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + _loop = Task.Run(() => RunLoopAsync(_cts.Token)); + return Task.CompletedTask; + } + + private async Task RunLoopAsync(CancellationToken ct) + { + // Run once on startup so a fresh deployment isn't gated on the + // IntervalSeconds initial wait — the brief calls this out explicitly. + await SafeMaintainAsync(ct).ConfigureAwait(false); + + while (!ct.IsCancellationRequested) + { + try + { + await Task.Delay(TimeSpan.FromSeconds(_options.Value.IntervalSeconds), ct) + .ConfigureAwait(false); + } + catch (OperationCanceledException) + { + break; + } + + await SafeMaintainAsync(ct).ConfigureAwait(false); + } + } + + private async Task SafeMaintainAsync(CancellationToken ct) + { + try + { + await using var scope = _scopeFactory.CreateAsyncScope(); + var maintenance = scope.ServiceProvider.GetRequiredService(); + var added = await maintenance + .EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct) + .ConfigureAwait(false); + if (added.Count > 0) + { + _logger.LogInformation( + "AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}", + added.Count, + string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd")))); + } + } + catch (Exception ex) + { + // Catch-all is deliberate: the hosted service must survive every + // class of tick failure (transient SQL, DI resolution, etc.) so + // the next tick gets a chance. The brief's contract is + // "exception logged, not propagated". + _logger.LogError(ex, "AuditLogPartitionMaintenance tick failed"); + } + } + + /// + public Task StopAsync(CancellationToken ct) + { + _cts?.Cancel(); + return _loop ?? Task.CompletedTask; + } + + /// + public void Dispose() + { + _cts?.Dispose(); + } +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPurgeActor.cs b/src/ScadaLink.AuditLog/Central/AuditLogPurgeActor.cs new file mode 100644 index 0000000..153e238 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPurgeActor.cs @@ -0,0 +1,214 @@ +using System.Diagnostics; +using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Configuration; +using ScadaLink.Commons.Interfaces.Repositories; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Central singleton (M6 Bundle C) that drives the daily AuditLog partition +/// purge. On a configurable timer (default 24 hours) the actor: +/// +/// Queries +/// for monthly boundaries whose latest OccurredAtUtc is older +/// than DateTime.UtcNow - RetentionDays. +/// For each eligible boundary, calls +/// which runs +/// the drop-and-rebuild dance around UX_AuditLog_EventId. +/// Publishes on the actor-system +/// EventStream so the Bundle E central health collector + ops surfaces +/// can subscribe without coupling to this actor. +/// +/// +/// +/// +/// Daily cadence. Partition switch is metadata-only but the +/// drop-and-rebuild dance briefly removes UX_AuditLog_EventId; running +/// more often than necessary trades unique-index rebuild outages for +/// negligible freshness wins. The default 24-hour interval matches +/// alog.md §10's retention policy. +/// +/// +/// Continue-on-error. A single boundary that throws (transient SQL +/// failure, contention with backup, missing object) must NOT prevent the +/// other eligible boundaries from being purged on the same tick. Per-boundary +/// work runs inside its own try/catch; the actor's +/// uses Resume so any leaked exception keeps +/// the singleton alive for the next tick. +/// +/// +/// DI scopes. is a scoped EF Core +/// service registered by AddConfigurationDatabase. The singleton +/// opens one DI scope per tick and reuses the same repository across every +/// boundary in that tick — mirrors the +/// pattern. +/// +/// +/// EventStream. Publishing through +/// the EventStream rather than direct messaging avoids coupling this actor +/// to its consumers; M6 Bundle E will subscribe a central health-counter +/// bridge that surfaces purge progress on the central health report. +/// +/// +public class AuditLogPurgeActor : ReceiveActor +{ + private readonly IServiceProvider _services; + private readonly AuditLogPurgeOptions _purgeOptions; + private readonly AuditLogOptions _auditOptions; + private readonly ILogger _logger; + private ICancelable? _timer; + + public AuditLogPurgeActor( + IServiceProvider services, + IOptions purgeOptions, + IOptions auditOptions, + ILogger logger) + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(purgeOptions); + ArgumentNullException.ThrowIfNull(auditOptions); + ArgumentNullException.ThrowIfNull(logger); + + _services = services; + _purgeOptions = purgeOptions.Value; + _auditOptions = auditOptions.Value; + _logger = logger; + + ReceiveAsync(_ => OnTickAsync()); + } + + protected override void PreStart() + { + base.PreStart(); + var interval = _purgeOptions.Interval; + _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: PurgeTick.Instance, + sender: Self); + } + + protected override void PostStop() + { + _timer?.Cancel(); + base.PostStop(); + } + + /// + /// Resume keeps the singleton alive across any leaked exception. Restart + /// would re-run PreStart and reschedule the timer (harmless but wasteful); + /// Stop is wrong because the singleton must keep ticking until shutdown. + /// + protected override SupervisorStrategy SupervisorStrategy() + { + return new OneForOneStrategy( + maxNrOfRetries: 0, + withinTimeRange: TimeSpan.Zero, + decider: Akka.Actor.SupervisorStrategy.DefaultDecider); + } + + private async Task OnTickAsync() + { + // Capture EventStream BEFORE the first await. Accessing Context (and + // therefore Context.System) after an await is unsafe because Akka's + // ActorBase.Context throws "no active ActorContext" once the + // continuation runs on a thread that isn't currently dispatching this + // actor — mirrors the same Sender-capture pattern in + // AuditLogIngestActor.OnIngestAsync. + var eventStream = Context.System.EventStream; + + // Compute the retention threshold from AuditLogOptions.RetentionDays + // each tick — the options class supports hot reload via + // IOptionsMonitor for the redaction policy and similar settings; we + // read the snapshot per-tick so an operator who lowers RetentionDays + // sees the change applied on the next purge without an actor + // restart. + var threshold = DateTime.UtcNow - TimeSpan.FromDays(_auditOptions.RetentionDays); + + IServiceScope? scope = null; + IAuditLogRepository repository; + try + { + scope = _services.CreateScope(); + repository = scope.ServiceProvider.GetRequiredService(); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick."); + scope?.Dispose(); + return; + } + + try + { + IReadOnlyList boundaries; + try + { + boundaries = await repository + .GetPartitionBoundariesOlderThanAsync(threshold) + .ConfigureAwait(false); + } + catch (Exception ex) + { + _logger.LogError( + ex, + "Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.", + threshold); + return; + } + + if (boundaries.Count == 0) + { + return; + } + + foreach (var boundary in boundaries) + { + // Per-boundary try/catch: one bad partition (transient SQL + // failure, missing object, contention with backup) does NOT + // abandon the rest of the tick. + var sw = Stopwatch.StartNew(); + try + { + var rowsDeleted = await repository + .SwitchOutPartitionAsync(boundary) + .ConfigureAwait(false); + sw.Stop(); + + eventStream.Publish( + new AuditLogPurgedEvent(boundary, rowsDeleted, sw.ElapsedMilliseconds)); + + _logger.LogInformation( + "Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.", + boundary, + rowsDeleted, + sw.ElapsedMilliseconds); + } + catch (Exception ex) + { + sw.Stop(); + _logger.LogError( + ex, + "Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.", + boundary, + sw.ElapsedMilliseconds); + } + } + } + finally + { + scope.Dispose(); + } + } + + /// Self-tick triggering a purge pass across all eligible partitions. + internal sealed class PurgeTick + { + public static readonly PurgeTick Instance = new(); + private PurgeTick() { } + } +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPurgeOptions.cs b/src/ScadaLink.AuditLog/Central/AuditLogPurgeOptions.cs new file mode 100644 index 0000000..5f9d824 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPurgeOptions.cs @@ -0,0 +1,43 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Tuning knobs for the central singleton. +/// Default cadence is 24 hours per the M6 plan; the retention window itself +/// is sourced from +/// (default 365) so operators tune retention from a single section. +/// +/// +/// +/// The purge actor is a daily-cadence singleton, not a hot-loop, because +/// partition-switch I/O is metadata-only but the drop-and-rebuild dance +/// briefly removes the UX_AuditLog_EventId unique index — running +/// more often than necessary trades index-rebuild outages for marginal +/// freshness gains. Lower this only when an operator can prove they need +/// sub-daily purge granularity. +/// +/// +/// exists for tests to drop the cadence to +/// milliseconds without polluting the production config surface; production +/// binds only. +/// +/// +public sealed class AuditLogPurgeOptions +{ + /// Period of the purge tick in hours (default 24). + public int IntervalHours { get; set; } = 24; + + /// + /// Test-only override for finer control over the tick cadence than + /// whole-hour resolution allows. When non-null, takes precedence over + /// . Not bound from config — production + /// config exposes only. + /// + public TimeSpan? IntervalOverride { get; set; } + + /// + /// Resolves the effective tick interval, honouring the test override + /// when set. Falls back to . + /// + public TimeSpan Interval => + IntervalOverride ?? TimeSpan.FromHours(IntervalHours); +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPurgedEvent.cs b/src/ScadaLink.AuditLog/Central/AuditLogPurgedEvent.cs new file mode 100644 index 0000000..78d4987 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPurgedEvent.cs @@ -0,0 +1,29 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Published on the actor-system EventStream by +/// after each successful partition switch-out. Downstream consumers (Bundle E +/// central health collector, ops dashboards, audit trails) subscribe so a +/// purge action is observable without the actor needing to know about any +/// specific subscriber. +/// +/// +/// The pf_AuditLog_Month lower-bound boundary that was switched out — i.e. +/// the first instant of the purged month in UTC. +/// +/// +/// Approximate row count purged from the partition, sampled BEFORE the +/// switch. Exact accounting would require a post-switch scan of the staging +/// table, which the dance drops immediately, so this is the closest +/// observable proxy. Zero is a valid value when the actor's enumerator +/// included a partition the operator subsequently emptied by hand. +/// +/// +/// Wall-clock time spent inside SwitchOutPartitionAsync for this +/// boundary, in milliseconds. Useful for spotting the rare slow purge +/// without spinning up dedicated telemetry. +/// +public sealed record AuditLogPurgedEvent( + DateTime MonthBoundary, + long RowsDeleted, + long DurationMs); diff --git a/src/ScadaLink.AuditLog/Central/CentralAuditRedactionFailureCounter.cs b/src/ScadaLink.AuditLog/Central/CentralAuditRedactionFailureCounter.cs new file mode 100644 index 0000000..102b6d9 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/CentralAuditRedactionFailureCounter.cs @@ -0,0 +1,57 @@ +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T9) — bridges +/// (incremented by +/// every time a header / body / SQL +/// parameter redactor stage throws and the filter has to over-redact the +/// offending field) into so the +/// failure surfaces on the central health surface as +/// AuditCentralHealthSnapshot.AuditRedactionFailure. +/// +/// +/// +/// Site vs central. M5 Bundle C wired the SITE-side bridge +/// (), +/// which routes increments into the site health report payload's +/// AuditRedactionFailure field. That handles redactor failures on the +/// site SQLite hot-path (FallbackAuditWriter). M6 Bundle E (T9) adds the +/// MIRROR bridge here so the same payload filter — when it runs on the +/// central / +/// paths — surfaces its failures on the +/// central dashboard rather than disappearing into a NoOp. +/// +/// +/// Registration shape. Site composition roots call +/// , +/// which overrides the binding with the site bridge. Central composition +/// roots call , +/// which overrides with this central bridge. A node never wears both hats — +/// site and central are distinct host roles — so the two bridges never +/// fight over the same binding at runtime. +/// +/// +/// Why not a thin wrapper around the snapshot directly? The snapshot +/// itself could be the bound implementation (it already implements +/// ), but a dedicated class makes +/// the central-vs-site asymmetry explicit at the DI boundary — readers of +/// +/// see "site → site bridge, central → central bridge", matching the +/// +/// shape one-for-one. +/// +/// +public sealed class CentralAuditRedactionFailureCounter : IAuditRedactionFailureCounter +{ + private readonly AuditCentralHealthSnapshot _snapshot; + + public CentralAuditRedactionFailureCounter(AuditCentralHealthSnapshot snapshot) + { + _snapshot = snapshot ?? throw new ArgumentNullException(nameof(snapshot)); + } + + /// + public void Increment() => ((IAuditRedactionFailureCounter)_snapshot).Increment(); +} diff --git a/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs b/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs index ff48bea..80bfc45 100644 --- a/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs +++ b/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs @@ -42,6 +42,7 @@ public sealed class CentralAuditWriter : ICentralAuditWriter private readonly IServiceProvider _services; private readonly ILogger _logger; private readonly IAuditPayloadFilter? _filter; + private readonly ICentralAuditWriteFailureCounter _failureCounter; /// /// Bundle C (M5-T6) — the central direct-write path used by the @@ -50,15 +51,23 @@ public sealed class CentralAuditWriter : ICentralAuditWriter /// optional so the M4 test composition roots that don't pass one keep /// working (they only ever write small payloads); production DI registers /// the real filter via . + /// M6 Bundle E (T8) — adds the optional + /// so a swallowed repository + /// throw bumps the central health surface's + /// CentralAuditWriteFailures counter. Defaults to a NoOp so test + /// composition roots that don't wire the counter keep their current + /// behaviour. /// public CentralAuditWriter( IServiceProvider services, ILogger logger, - IAuditPayloadFilter? filter = null) + IAuditPayloadFilter? filter = null, + ICentralAuditWriteFailureCounter? failureCounter = null) { _services = services ?? throw new ArgumentNullException(nameof(services)); _logger = logger ?? throw new ArgumentNullException(nameof(logger)); _filter = filter; + _failureCounter = failureCounter ?? new NoOpCentralAuditWriteFailureCounter(); } /// @@ -92,6 +101,19 @@ public sealed class CentralAuditWriter : ICentralAuditWriter catch (Exception ex) { // Audit failure NEVER aborts the user-facing action — swallow and log. + // M6 Bundle E (T8): also surface the failure on the central health + // counter so a sustained audit-write outage is visible on the + // health dashboard rather than disappearing into the log file. + try + { + _failureCounter.Increment(); + } + catch + { + // Counter must NEVER throw — defence in depth. Even if a + // misbehaving custom counter does, swallowing here keeps the + // best-effort contract intact. + } _logger.LogWarning( ex, "CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})", diff --git a/src/ScadaLink.AuditLog/Central/IAuditCentralHealthSnapshot.cs b/src/ScadaLink.AuditLog/Central/IAuditCentralHealthSnapshot.cs new file mode 100644 index 0000000..6b7fae2 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/IAuditCentralHealthSnapshot.cs @@ -0,0 +1,62 @@ +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E read-side surface exposing the central-side +/// audit-health counters: (every +/// repository insert throw from / +/// ), +/// (every payload-filter redactor throw on the central path), and +/// (per-site latched state from the +/// ). +/// +/// +/// +/// Read-only contract. Implementations expose a point-in-time snapshot +/// — increments and tracker updates happen through the dedicated counter / +/// tracker interfaces, not through this surface. Consumers (M7+ central +/// health pages) read these properties; they never mutate. +/// +/// +/// Why a parallel surface from . +/// aggregates per-site +/// SiteHealthState reports the SITE emits. The central audit-write +/// failure / redaction-failure counters originate ON central (no site report +/// carries them), so they live on a dedicated snapshot rather than being +/// retro-fitted into a per-site state. The two surfaces will be composed at +/// the M7 dashboard layer. +/// +/// +public interface IAuditCentralHealthSnapshot +{ + /// + /// Count of central-side audit-write failures since process start. + /// Incremented by every / + /// repository insert that throws. + /// + int CentralAuditWriteFailures { get; } + + /// + /// Count of central-side payload-filter redactor over-redactions since + /// process start. Incremented by every header / body / SQL-parameter + /// redactor stage that throws (the filter falls back to the + /// <redacted: redactor error> marker and never aborts the + /// user-facing action). Sites have their own counter + /// (-backed + /// SiteHealthReport.AuditRedactionFailure) and the central + /// composition root's binding routes ALL central redactor throws + /// (CentralAuditWriter + AuditLogIngestActor paths) into this counter. + /// + int AuditRedactionFailure { get; } + + /// + /// Per-site latched stalled state: true when the + /// has observed two + /// consecutive non-draining cycles for that site, false after the + /// first draining cycle. Sites absent from the map are interpreted as + /// healthy (Stalled=false default). Snapshot is a defensive + /// copy — readers must not mutate. + /// + IReadOnlyDictionary SiteAuditTelemetryStalled { get; } +} diff --git a/src/ScadaLink.AuditLog/Central/ICentralAuditWriteFailureCounter.cs b/src/ScadaLink.AuditLog/Central/ICentralAuditWriteFailureCounter.cs new file mode 100644 index 0000000..4e34256 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/ICentralAuditWriteFailureCounter.cs @@ -0,0 +1,23 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T8) counter sink invoked by central-side audit +/// writers (, ) +/// every time a repository InsertIfNotExistsAsync throws. Mirrors the +/// site-side +/// shape one-for-one — same one-method contract, same NoOp default, same +/// must-never-abort-the-user-facing-action invariant. +/// +/// +/// Audit-write failures NEVER abort the user-facing action (alog.md §13) — +/// the writer swallows the exception and surfaces the failure via this counter +/// instead. A NoOp default is the correct safe fallback while the central +/// health surface is being wired in; +/// is the production binding that routes increments into the aggregated +/// central health snapshot consumed by future M7+ pages. +/// +public interface ICentralAuditWriteFailureCounter +{ + /// Increment the central audit-write failure counter by one. + void Increment(); +} diff --git a/src/ScadaLink.AuditLog/Central/IPullAuditEventsClient.cs b/src/ScadaLink.AuditLog/Central/IPullAuditEventsClient.cs new file mode 100644 index 0000000..e094e48 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/IPullAuditEventsClient.cs @@ -0,0 +1,45 @@ +using ScadaLink.Commons.Messages.Integration; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Mockable abstraction over the central-side PullAuditEvents gRPC +/// client surface that uses to +/// fetch the next reconciliation batch from a specific site. Extracted so the +/// actor can be unit-tested against an in-memory stub without standing up a +/// real GrpcChannel per site. +/// +/// +/// +/// The production implementation (host wiring task) wraps the auto-generated +/// SiteStreamService.SiteStreamServiceClient, multiplexing one +/// GrpcChannel per site keyed on +/// . Until that wiring lands the DI +/// composition root binds a NoOp default that returns an empty response — the +/// reconciliation tick is still scheduled and the cursor logic still runs, so +/// regressions in the actor itself are caught even before the real client +/// arrives. +/// +/// +/// Implementations MUST NOT throw on transport faults that the actor can +/// tolerate (connection refused, deadline exceeded). The actor's contract is +/// "one site's failure doesn't sink the rest of the tick"; an exception still +/// won't crash the actor (the per-site try/catch catches it), but returning +/// an empty response on a known-recoverable error keeps the logs cleaner. +/// +/// +public interface IPullAuditEventsClient +{ + /// + /// Issues a PullAuditEvents RPC against the site whose endpoint + /// is registered against . Returns the next + /// batch of + /// rows ordered oldest-first AND a MoreAvailable flag the actor + /// uses to decide whether to fire another pull immediately. + /// + Task PullAsync( + string siteId, + DateTime sinceUtc, + int batchSize, + CancellationToken ct); +} diff --git a/src/ScadaLink.AuditLog/Central/ISiteEnumerator.cs b/src/ScadaLink.AuditLog/Central/ISiteEnumerator.cs new file mode 100644 index 0000000..9e9607c --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/ISiteEnumerator.cs @@ -0,0 +1,34 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Enumeration surface consumed by to +/// discover which sites to poll on each reconciliation tick. Extracted so the +/// actor can be unit-tested against a static list without depending on the +/// production ISiteRepository + EF Core DbContext. +/// +/// +/// The production implementation wraps ISiteRepository.GetAllSitesAsync +/// and projects each Site to a using the +/// site's configured GrpcNodeAAddress (falling back to +/// GrpcNodeBAddress when NodeA is unset). Sites with NO gRPC address +/// configured are silently skipped — the reconciliation pull cannot reach +/// them, but absence of an address is a configuration decision, not a runtime +/// error. +/// +public interface ISiteEnumerator +{ + /// + /// Returns the current set of sites the reconciliation puller should visit + /// on the next tick. Implementations should reflect adds/removes promptly + /// — the actor calls this once per tick. + /// + Task> EnumerateAsync(CancellationToken ct = default); +} + +/// +/// One reconciliation target: the site identifier the actor uses as the +/// cursor key and the gRPC endpoint dials +/// to issue the pull. Endpoint is the bare authority (e.g. http://siteA:8083); +/// transport selection (TLS, keepalive, etc.) is the client's concern. +/// +public sealed record SiteEntry(string SiteId, string GrpcEndpoint); diff --git a/src/ScadaLink.AuditLog/Central/NoOpCentralAuditWriteFailureCounter.cs b/src/ScadaLink.AuditLog/Central/NoOpCentralAuditWriteFailureCounter.cs new file mode 100644 index 0000000..d4eb216 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/NoOpCentralAuditWriteFailureCounter.cs @@ -0,0 +1,17 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Default binding used when +/// the central health surface () has +/// not been wired (test composition roots, site-only hosts that incidentally +/// resolve a ). Drops every increment on the +/// floor. Mirrors . +/// +public sealed class NoOpCentralAuditWriteFailureCounter : ICentralAuditWriteFailureCounter +{ + /// + public void Increment() + { + // intentional no-op + } +} diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs new file mode 100644 index 0000000..e38e6d2 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs @@ -0,0 +1,332 @@ +using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation +/// pull loop. On a configurable timer (default 5 minutes) the actor walks every +/// known site, asks the site for any rows with +/// >= the site's last reconciled +/// cursor, ingests them idempotently into the central +/// , and advances the cursor. +/// +/// +/// +/// Self-healing telemetry, not a dispatcher. The push path +/// ( + +/// IngestAuditEvents) is the primary mechanism. This actor exists so a +/// missed push (gRPC blip, central restart, site offline) is eventually +/// repaired by central re-pulling whatever the site still has in +/// Pending/Forwarded state. Idempotency on +/// (M2 Bundle A's race-fix) makes duplicate +/// arrivals from both paths a silent no-op. +/// +/// +/// Cursor lifetime. The per-site LastReconciledAt watermark is +/// kept in-memory for the actor's lifetime. The cluster singleton normally +/// survives the host process; on a deliberate failover OR a singleton restart +/// the cursors reset to . That is conservative +/// but correct — the next tick simply asks for everything the site still has, +/// and idempotent ingest swallows the dupes. Persisting cursors to MS SQL was +/// considered and rejected for M6: the cost of a write per tick outweighs the +/// rare benefit of avoiding one over-broad pull after a restart. +/// +/// +/// Stalled detection. The brief calls a site "stalled" when two +/// consecutive pull cycles BOTH return non-empty AND MoreAvailable=true +/// — i.e. the backlog isn't draining. The actor publishes +/// on the actor system's +/// EventStream so a future ICentralHealthCollector bridge (M6 Bundle E) +/// can flip the health metric without coupling this actor to the health +/// collection surface today. +/// +/// +/// Failure isolation. A single site that throws (DNS, transport, +/// repository write) must NOT prevent other sites from being polled on the +/// same tick. The per-site work runs inside its own try/catch; the actor's +/// supervisor strategy keeps it alive across any leaked exception with +/// 's Restart +/// semantics — restart resets the in-memory cursors, but as noted above that's +/// a safe (over-pull, idempotent) recovery. +/// +/// +/// DI scopes. is a scoped EF Core +/// service registered by AddConfigurationDatabase. The singleton actor +/// opens one DI scope per tick and reuses the same repository across all +/// sites in that tick — one DbContext per tick mirrors the +/// AuditLogIngestActor + NotificationOutboxActor pattern. +/// +/// +public class SiteAuditReconciliationActor : ReceiveActor +{ + private readonly ISiteEnumerator _sites; + private readonly IPullAuditEventsClient _client; + private readonly IServiceProvider _services; + private readonly SiteAuditReconciliationOptions _options; + private readonly ILogger _logger; + + /// + /// Per-site reconciliation watermark — the highest + /// seen for that site on a previous + /// tick. Asking for OccurredAtUtc >= cursor rather than > + /// is the site contract (); + /// duplicate-with-same-timestamp rows are filtered out by the idempotent + /// repository write. + /// + private readonly Dictionary _cursors = new(); + + /// + /// Per-site count of consecutive non-draining cycles. Resets to zero on the + /// first draining (or empty) cycle. + /// + private readonly Dictionary _nonDrainingCycles = new(); + + /// + /// Per-site latched stalled state — used so the actor only publishes a + /// transition when the + /// stalled flag actually changes, not on every tick while stalled. + /// + private readonly Dictionary _stalled = new(); + + private ICancelable? _timer; + + public SiteAuditReconciliationActor( + ISiteEnumerator sites, + IPullAuditEventsClient client, + IServiceProvider services, + IOptions options, + ILogger logger) + { + ArgumentNullException.ThrowIfNull(sites); + ArgumentNullException.ThrowIfNull(client); + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(options); + ArgumentNullException.ThrowIfNull(logger); + + _sites = sites; + _client = client; + _services = services; + _options = options.Value; + _logger = logger; + + ReceiveAsync(_ => OnTickAsync()); + } + + protected override void PreStart() + { + base.PreStart(); + var interval = _options.ReconciliationInterval; + _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: ReconciliationTick.Instance, + sender: Self); + } + + protected override void PostStop() + { + _timer?.Cancel(); + base.PostStop(); + } + + private async Task OnTickAsync() + { + // Capture EventStream BEFORE the first await. Accessing Context (and + // therefore Context.System) after an await is unsafe because Akka's + // ActorBase.Context throws "no active ActorContext" once the + // continuation runs on a thread that isn't currently dispatching this + // actor — mirrors the AuditLogPurgeActor.OnTickAsync fix and the + // AuditLogIngestActor.OnIngestAsync Sender-capture pattern. + var eventStream = Context.System.EventStream; + + IReadOnlyList sites; + try + { + sites = await _sites.EnumerateAsync().ConfigureAwait(false); + } + catch (Exception ex) + { + _logger.LogError(ex, "Site enumeration failed; skipping reconciliation tick."); + return; + } + + if (sites.Count == 0) + { + return; + } + + IServiceScope? scope = null; + IAuditLogRepository repository; + try + { + scope = _services.CreateScope(); + repository = scope.ServiceProvider.GetRequiredService(); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to resolve IAuditLogRepository for reconciliation tick."); + scope?.Dispose(); + return; + } + + try + { + foreach (var site in sites) + { + try + { + await PullSiteAsync(site, repository, eventStream).ConfigureAwait(false); + } + catch (Exception ex) + { + // Catch-all per the failure-isolation invariant: one site's + // fault must not sink the rest of the tick. The cursor for + // the failing site is left at its previous value so the + // next tick retries the same window. + _logger.LogWarning( + ex, + "Reconciliation pull failed for site {SiteId}; other sites continue.", + site.SiteId); + } + } + } + finally + { + scope.Dispose(); + } + } + + /// + /// Issues one PullAuditEvents RPC against the site, ingests the + /// returned rows idempotently into the central repository, and advances + /// the cursor based on the maximum + /// observed. The brief's "saturate until backlog clears" intent is met by + /// the natural cadence — each tick issues one pull, and a backed-up site + /// drains across consecutive ticks. The stalled signal (two non-draining + /// ticks in a row) surfaces when that drain isn't keeping up. + /// + private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository, Akka.Event.EventStream eventStream) + { + var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue; + var response = await _client.PullAsync( + site.SiteId, since, _options.BatchSize, CancellationToken.None) + .ConfigureAwait(false); + + var maxOccurred = since; + var nowUtc = DateTime.UtcNow; + foreach (var evt in response.Events) + { + try + { + // Idempotent repository write: duplicate EventIds (from a + // concurrent push, or a retry of this very pull) collapse to + // a no-op courtesy of M2 Bundle A's race-fix on + // InsertIfNotExistsAsync. + var ingested = evt with { IngestedAtUtc = nowUtc }; + await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false); + } + catch (Exception ex) + { + // Per-row catch so one bad event does not abandon the rest of + // the batch. The cursor still advances based on OccurredAtUtc + // — the row was returned by the site, so the next tick won't + // re-fetch it; if it permanently fails to persist, that's an + // operational concern surfaced by the log, not a hot-loop + // trigger. + _logger.LogError( + ex, + "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId}.", + evt.EventId, + site.SiteId); + } + + if (evt.OccurredAtUtc > maxOccurred) + { + maxOccurred = evt.OccurredAtUtc; + } + } + + _cursors[site.SiteId] = maxOccurred; + + var nonDraining = response.MoreAvailable && response.Events.Count > 0; + UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream); + } + + /// + /// Flips the per-site stalled flag based on whether this tick drained the + /// queue. A "draining" cycle is one where the server reported no more rows + /// available OR returned zero events. A "non-draining" cycle is the + /// inverse (events returned AND MoreAvailable=true). + /// + /// + /// The state machine: counter increments on each consecutive non-draining + /// tick. On reaching + /// the actor latches Stalled=true and publishes the transition; on + /// any subsequent draining tick the counter resets to zero AND, if the + /// latch is currently true, the actor publishes Stalled=false. Only + /// transitions are published — repeated ticks in the same state are + /// silent so a downstream subscriber doesn't see a flood of redundant + /// notifications. + /// + private void UpdateStalledState(string siteId, bool draining, Akka.Event.EventStream eventStream) + { + var wasStalled = _stalled.TryGetValue(siteId, out var prior) && prior; + + if (draining) + { + _nonDrainingCycles[siteId] = 0; + if (wasStalled) + { + _stalled[siteId] = false; + eventStream.Publish( + new SiteAuditTelemetryStalledChanged(siteId, Stalled: false)); + } + return; + } + + var consecutive = _nonDrainingCycles.GetValueOrDefault(siteId) + 1; + _nonDrainingCycles[siteId] = consecutive; + + if (consecutive >= _options.StalledAfterNonDrainingCycles && !wasStalled) + { + _stalled[siteId] = true; + eventStream.Publish( + new SiteAuditTelemetryStalledChanged(siteId, Stalled: true)); + } + } + + /// + /// Resume on any unhandled exception inside the receive — the singleton + /// MUST stay alive even if the per-tick try/catch leaks. Restart would + /// reset the cursors (safe but wasteful); Resume preserves them. + /// + protected override SupervisorStrategy SupervisorStrategy() + { + return new OneForOneStrategy( + maxNrOfRetries: 0, + withinTimeRange: TimeSpan.Zero, + decider: Akka.Actor.SupervisorStrategy.DefaultDecider); + } + + /// Self-tick triggering a reconciliation pass across all sites. + internal sealed class ReconciliationTick + { + public static readonly ReconciliationTick Instance = new(); + private ReconciliationTick() { } + } +} + +/// +/// Published on the actor system EventStream when a site's reconciliation +/// puller transitions into or out of the "stalled" state (backlog not +/// draining across multiple cycles). The M6 Bundle E central health collector +/// will subscribe to this and surface +/// SiteAuditTelemetryStalled on the health-report payload. +/// +public sealed record SiteAuditTelemetryStalledChanged(string SiteId, bool Stalled); diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationOptions.cs b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationOptions.cs new file mode 100644 index 0000000..d32c5e6 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationOptions.cs @@ -0,0 +1,60 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Tuning knobs for the central singleton. +/// Defaults mirror the M6 Bundle B brief: pull every 5 minutes per site, 256 rows per +/// batch, declare a site "stalled" after two consecutive pull cycles return non-empty +/// AND MoreAvailable=true (the backlog is not draining). +/// +/// +/// +/// Per the M6 plan the reconciliation actor is the fallback when push telemetry is +/// lost; it is intentionally low-frequency. Lowering +/// in production trades MS SQL load for +/// fresher self-healing — keep the default unless a deployment can prove the extra +/// load is acceptable. +/// +/// +/// = 2 because a single non-draining +/// cycle can happen on a surge (e.g. a backed-up site replays its hot queue); the +/// stalled signal should only fire when the backlog persists across cycles, which is +/// the symptom the central health surface is asking us to detect. +/// +/// +public sealed class SiteAuditReconciliationOptions +{ + /// + /// Period of the reconciliation tick. Each tick visits every known site once. + /// + public int ReconciliationIntervalSeconds { get; set; } = 300; + + /// + /// Test-only override for finer control over the tick cadence than + /// whole-second resolution allows. When non-null, takes precedence over + /// . Not bound from config — + /// production config exposes + /// only. + /// + public TimeSpan? ReconciliationIntervalOverride { get; set; } + + /// + /// Resolves the effective tick interval, honouring the test override when + /// set. Falls back to . + /// + public TimeSpan ReconciliationInterval => + ReconciliationIntervalOverride ?? TimeSpan.FromSeconds(ReconciliationIntervalSeconds); + + /// + /// Maximum number of + /// rows requested in a single PullAuditEvents RPC call. + /// + public int BatchSize { get; set; } = 256; + + /// + /// Number of consecutive non-draining cycles (events returned AND + /// MoreAvailable=true) that must accumulate for a site before the actor + /// publishes SiteAuditTelemetryStalledChanged(Stalled: true) on the + /// EventStream. + /// + public int StalledAfterNonDrainingCycles { get; set; } = 2; +} diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs b/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs new file mode 100644 index 0000000..e1ed0fd --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs @@ -0,0 +1,188 @@ +using System.Collections.Concurrent; +using Akka.Actor; +using Akka.Event; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T7) — central singleton that subscribes to the +/// actor system's EventStream for +/// publications and maintains a per-site latched stalled-state map readable +/// via . Consumed by the M6 Bundle E +/// aggregator so the central health +/// surface can surface per-site "reconciliation isn't draining" without +/// coupling the publisher () to the +/// health collection plumbing. +/// +/// +/// +/// Why an internal actor. Akka.NET's only +/// supports subscribers — there is no callback or +/// channel-based overload. The tracker therefore spawns a small subscriber +/// actor that forwards each event into the shared +/// on the actor's thread, and +/// readers () take a copy off that dictionary on any +/// thread. Mirrors the DeadLetterMonitorActor shape — subscribe in +/// , unsubscribe in +/// , which the tracker triggers via a Stop +/// at . +/// +/// +/// Per-site latching. The publisher () +/// only publishes on stalled-state transitions, so the dictionary is the +/// authoritative latched state. Sites that have never published are absent +/// from the snapshot — the consumer surface treats absence as +/// Stalled=false (default healthy), the same default the reconciliation +/// actor's own internal latch uses. +/// +/// +/// Singleton lifecycle. Registered as a singleton via +/// ; +/// tears the internal subscriber down at host shutdown. +/// +/// +public sealed class SiteAuditTelemetryStalledTracker : IDisposable +{ + private readonly EventStream _eventStream; + private readonly ConcurrentDictionary _state = new(); + private readonly IActorRef? _subscriber; + private readonly AuditCentralHealthSnapshot? _snapshot; + private bool _disposed; + + /// + /// Construct around a bare . Intended for unit + /// tests where the caller wants to publish events without standing up an + /// actor system — the tracker registers a transient subscriber actor only + /// if the supplied stream is backed by an actor system. In the bare-stream + /// mode (no actor system) the tracker still exposes the + /// surface but cannot self-subscribe; production + /// callers always go through . + /// + /// + /// Subscribing to requires an , + /// which can only be created from an . The bare- + /// stream ctor therefore can NOT itself wire the subscriber — tests that + /// want event-driven updates must use the ActorSystem ctor (or push state + /// directly via ). The tests in + /// SiteAuditTelemetryStalledTrackerTests use the ActorSystem ctor + /// via Akka.TestKit so they exercise the production subscribe path. + /// + public SiteAuditTelemetryStalledTracker(EventStream eventStream) + : this(eventStream, snapshot: null) + { + } + + /// + /// Bare-stream ctor with an optional snapshot sink — the central + /// composition root passes the singleton + /// so every dictionary update + /// also lands on the central health surface. The bare ctor still cannot + /// subscribe (no actor system), but tests that drive the tracker via + /// get the snapshot push for free. + /// + public SiteAuditTelemetryStalledTracker(EventStream eventStream, AuditCentralHealthSnapshot? snapshot) + { + _eventStream = eventStream ?? throw new ArgumentNullException(nameof(eventStream)); + // No subscriber actor — see the remarks on the parameterless overload. + _subscriber = null; + _snapshot = snapshot; + } + + /// + /// Production ctor: subscribes a small internal actor to the supplied + /// system's EventStream so every published + /// updates the latched + /// per-site map. tears the subscriber down. + /// + public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem) + : this(actorSystem, snapshot: null) + { + } + + /// + /// Production ctor with a snapshot sink — every observed + /// is mirrored onto the + /// shared so the central health + /// surface sees per-site stalled state without re-reading the tracker. + /// + public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem, AuditCentralHealthSnapshot? snapshot) + { + ArgumentNullException.ThrowIfNull(actorSystem); + _eventStream = actorSystem.EventStream; + _snapshot = snapshot; + // Anonymous subscriber actor scoped to the system; props build it + // with a callback into THIS tracker's Apply method so the actor's + // single-threaded receive serialises every dictionary write. + _subscriber = actorSystem.ActorOf( + Props.Create(() => new StalledChangedSubscriber(this)), + name: $"site-audit-stalled-tracker-{Guid.NewGuid():N}"); + // Subscribe synchronously from the ctor so the subscription is in + // place before the tracker is returned to the caller — the actor's + // own PreStart runs asynchronously and would otherwise race the + // first publish. EventStream.Subscribe is thread-safe. + _eventStream.Subscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged)); + } + + /// + /// Returns a defensive copy of the per-site latched stalled state. + /// Absent sites are interpreted as Stalled=false by consumers. + /// + public IReadOnlyDictionary Snapshot() => + new Dictionary(_state); + + /// + /// Applied by the internal subscriber actor on every + /// publication. Exposed + /// internally so tests against the bare-stream ctor can still drive the + /// tracker, but the production path always goes through the actor. + /// + internal void Apply(SiteAuditTelemetryStalledChanged evt) + { + if (evt is null) return; + _state[evt.SiteId] = evt.Stalled; + // Mirror into the central health snapshot if wired so a reader of + // IAuditCentralHealthSnapshot sees the same per-site state without + // a second lookup. Snapshot is optional (test composition roots may + // skip it) so the null-coalesce is the safe path. + _snapshot?.ApplyStalled(evt); + } + + public void Dispose() + { + if (_disposed) return; + _disposed = true; + if (_subscriber is not null) + { + // Unsubscribe runs in PostStop on the subscriber actor; Stop is + // fire-and-forget but the actor's PostStop hook is guaranteed to + // run before its mailbox is collected. + _subscriber.Tell(PoisonPill.Instance); + } + } + + /// + /// Internal subscriber actor — receives every + /// off the EventStream and + /// forwards it into the parent . + /// Unlike DeadLetterMonitorActor, the subscription is registered by + /// the tracker constructor BEFORE this actor begins processing messages so + /// publishes that arrive between actor creation and PreStart cannot be + /// missed. Unsubscribe still runs in . + /// + private sealed class StalledChangedSubscriber : ReceiveActor + { + private readonly SiteAuditTelemetryStalledTracker _parent; + + public StalledChangedSubscriber(SiteAuditTelemetryStalledTracker parent) + { + _parent = parent; + Receive(evt => _parent.Apply(evt)); + } + + protected override void PostStop() + { + Context.System.EventStream.Unsubscribe(Self, typeof(SiteAuditTelemetryStalledChanged)); + base.PostStop(); + } + } +} diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index cf04abd..626859f 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -1,6 +1,7 @@ using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.DependencyInjection.Extensions; +using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.AuditLog.Central; @@ -43,6 +44,9 @@ public static class ServiceCollectionExtensions /// Configuration section bound to . public const string SiteTelemetrySectionName = "AuditLog:SiteTelemetry"; + /// Configuration section bound to . + public const string PartitionMaintenanceSectionName = "AuditLog:PartitionMaintenance"; + /// /// Registers the Audit Log (#23) component services: options, the site /// SQLite writer chain (primary + ring fallback + failure-counter sink), @@ -151,6 +155,13 @@ public static class ServiceCollectionExtensions services.AddSingleton( sp => sp.GetRequiredService()); + // M6 Bundle E (T8): central audit-write failure counter — NoOp default + // for site/test composition roots that don't wire the central health + // snapshot. AddAuditLogCentralMaintenance below replaces this binding + // with the AuditCentralHealthSnapshot implementation so increments + // surface on the central dashboard. + services.TryAddSingleton(); + // M4 Bundle B: central direct-write audit writer used by // NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to // emit AuditLog rows that originate ON central, not via site telemetry. @@ -163,10 +174,13 @@ public static class ServiceCollectionExtensions // Bundle C (M5-T6): wire the IAuditPayloadFilter into the factory so // NotificationOutboxActor + Inbound API rows are truncated + redacted // before they hit MS SQL. + // M6 Bundle E (T8): also wire the ICentralAuditWriteFailureCounter + // so swallowed repo throws bump the central health counter. services.AddSingleton(sp => new CentralAuditWriter( sp, sp.GetRequiredService>(), - sp.GetRequiredService())); + sp.GetRequiredService(), + sp.GetRequiredService())); return services; } @@ -214,6 +228,80 @@ public static class ServiceCollectionExtensions ServiceDescriptor.Singleton()); services.Replace( ServiceDescriptor.Singleton()); + // M6 Bundle E (T6): the site-side backlog reporter polls the + // SqliteAuditWriter every 30 s and pushes the snapshot into the + // collector so the next SiteHealthReport carries a fresh + // SiteAuditBacklog field. Registered alongside the other site-only + // metric bridges so AddAuditLog (which runs on central too) stays + // free of hosted-service registrations that would resolve a missing + // ISiteHealthCollector on central. + services.AddHostedService(); + return services; + } + + /// + /// Audit Log (#23) M6-T5 Bundle D — central-only registration for the + /// hosted service plus + /// its binding. Must be + /// called from the Central role's composition root (not from a site + /// composition root); the underlying IPartitionMaintenance + /// implementation is registered by AddConfigurationDatabase and + /// only exists on the central node. + /// + /// + /// + /// Separated from because AddAuditLog is + /// also invoked from site composition roots — silently starting a + /// hosted service that resolves an unregistered dependency on a site + /// would fail every tick. Keeping the central-only registration in its + /// own helper preserves the "every Add* call is safe to issue + /// from any composition root" invariant. + /// + /// + public static IServiceCollection AddAuditLogCentralMaintenance( + this IServiceCollection services, + IConfiguration config) + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(config); + + services.AddOptions() + .Bind(config.GetSection(PartitionMaintenanceSectionName)); + services.AddHostedService(); + + // M6 Bundle E (T8 + T9): central health snapshot — a single object + // that owns the CentralAuditWriteFailures + AuditRedactionFailure + // Interlocked counters AND surfaces them on + // IAuditCentralHealthSnapshot. The same instance is bound to BOTH + // writer-side interfaces (ICentralAuditWriteFailureCounter + + // IAuditRedactionFailureCounter) so every central-side increment + // routes into the shared counters; site nodes keep their existing + // Site bridges (registered by AddAuditLogHealthMetricsBridge) so + // the same counter type does not shadow the site-side metric. + // The snapshot itself has no actor-system dependency — the + // per-site stalled latch is fed by SiteAuditTelemetryStalledTracker + // which the Akka bootstrap wires up after ActorSystem.Create returns + // (the tracker is NOT registered here because its construction + // requires ActorSystem, which is not a DI-resolvable singleton). + services.AddSingleton(); + services.AddSingleton( + sp => sp.GetRequiredService()); + services.Replace(ServiceDescriptor.Singleton( + sp => sp.GetRequiredService())); + // M6 Bundle E (T9): override the NoOp IAuditRedactionFailureCounter + // (registered by AddAuditLog) with the CentralAuditRedactionFailureCounter + // bridge so payload-filter throws on CentralAuditWriter / + // AuditLogIngestActor paths surface on the central dashboard. The + // bridge is a thin wrapper around the AuditCentralHealthSnapshot + // singleton so all central redactor failures route into the same + // counter as CentralAuditWriteFailures. The site composition root + // overrides this binding AGAIN via AddAuditLogHealthMetricsBridge — + // central nodes do not call that bridge, so this is the final + // binding on a central host. Mirrors the M5 Bundle C + // HealthMetricsAuditRedactionFailureCounter shape one-for-one. + services.Replace(ServiceDescriptor.Singleton()); + return services; } } diff --git a/src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs b/src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs new file mode 100644 index 0000000..955832a --- /dev/null +++ b/src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs @@ -0,0 +1,133 @@ +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.HealthMonitoring; + +namespace ScadaLink.AuditLog.Site; + +/// +/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that +/// periodically pulls a backlog snapshot from +/// and pushes it into so the next +/// emits a fresh +/// SiteAuditBacklog field on the site health report. +/// +/// +/// +/// Why a hosted service, not the report sender. Querying SQLite for the +/// backlog requires the queue's write lock; doing it inline in +/// would couple the collector +/// to and turn an in-memory snapshot read into +/// a synchronous I/O call on the report path. The hosted-service pattern keeps +/// the report path pure and the SQL probe off the report timing budget. +/// +/// +/// Cadence. 30 s by default — coarse enough to amortise the SQL probe +/// across many reports, fine enough that the central dashboard never lags by +/// more than one health-report interval. Tunable via +/// in a follow-up +/// if ops needs a different cadence; for M6 we hard-code the value because the +/// brief calls it out explicitly. +/// +/// +/// Failure containment. The probe call is wrapped in a try/catch so a +/// transient SQLite error never tears down the hosted service — the next tick +/// retries. Mirrors 's +/// "exception logged, not propagated" contract. +/// +/// +public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable +{ + /// + /// Default poll cadence. Half a typical 60 s health-report interval keeps + /// the snapshot fresh without spinning the SQL probe more often than + /// necessary. + /// + internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30); + + private readonly ISiteAuditQueue _queue; + private readonly ISiteHealthCollector _collector; + private readonly ILogger _logger; + private readonly TimeSpan _refreshInterval; + private CancellationTokenSource? _cts; + private Task? _loop; + + public SiteAuditBacklogReporter( + ISiteAuditQueue queue, + ISiteHealthCollector collector, + ILogger logger, + TimeSpan? refreshInterval = null) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + _collector = collector ?? throw new ArgumentNullException(nameof(collector)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _refreshInterval = refreshInterval ?? DefaultRefreshInterval; + } + + /// + public Task StartAsync(CancellationToken ct) + { + // Linked CTS lets StopAsync's cancellation AND the host's shutdown + // token both terminate the loop; either side firing aborts the + // pending Task.Delay. + _cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + _loop = Task.Run(() => RunLoopAsync(_cts.Token)); + return Task.CompletedTask; + } + + private async Task RunLoopAsync(CancellationToken ct) + { + // First tick runs immediately so the very first health report after + // process start carries a real backlog snapshot — without this the + // dashboard would show null for the first 30 s after a deploy. + await SafeProbeAsync(ct).ConfigureAwait(false); + + while (!ct.IsCancellationRequested) + { + try + { + await Task.Delay(_refreshInterval, ct).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + break; + } + + await SafeProbeAsync(ct).ConfigureAwait(false); + } + } + + private async Task SafeProbeAsync(CancellationToken ct) + { + try + { + var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false); + _collector.UpdateSiteAuditBacklog(snapshot); + } + catch (OperationCanceledException) + { + // Shutdown — let the outer loop exit cleanly. + throw; + } + catch (Exception ex) + { + // Catch-all is deliberate: the hosted service must survive every + // class of probe failure (transient SQLite lock contention, disk + // I/O hiccup, …) so the next tick gets a chance. + _logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry."); + } + } + + /// + public Task StopAsync(CancellationToken ct) + { + _cts?.Cancel(); + return _loop ?? Task.CompletedTask; + } + + /// + public void Dispose() + { + _cts?.Dispose(); + } +} diff --git a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs index 789b572..bf5cb8b 100644 --- a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs +++ b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs @@ -2,9 +2,9 @@ using System.Threading.Channels; using Microsoft.Data.Sqlite; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using ScadaLink.AuditLog.Site.Telemetry; using ScadaLink.Commons.Entities.Audit; using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.AuditLog.Site; @@ -390,6 +390,184 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable } } + /// + /// M6 reconciliation-pull read: returns up to rows + /// whose OccurredAtUtc >= sinceUtc and whose + /// is still or + /// . Forwarded rows are included so the + /// brief race window between a site-Forwarded ack and central ingest cannot + /// silently drop rows; central dedups on . + /// Ordered oldest first, EventId tiebreaker. + /// + public Task> ReadPendingSinceAsync( + DateTime sinceUtc, int batchSize, CancellationToken ct = default) + { + if (batchSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be > 0."); + } + + // Mirror ReadPendingAsync: the write lock guards the single connection. + lock (_writeLock) + { + ObjectDisposedException.ThrowIf(_disposed, this); + + using var cmd = _connection.CreateCommand(); + cmd.CommandText = """ + SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId, + SourceSiteId, SourceInstanceId, SourceScript, Actor, Target, + Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail, + RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState + FROM AuditLog + WHERE ForwardState IN ($pending, $forwarded) + AND OccurredAtUtc >= $since + ORDER BY OccurredAtUtc ASC, EventId ASC + LIMIT $limit; + """; + cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString()); + cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString()); + // Normalise to UTC ISO-8601 round-trip format to match how OccurredAtUtc + // is stored on insert ("o" format) — string comparison is monotonic for + // that encoding so we can index-scan against it. + cmd.Parameters.AddWithValue("$since", EnsureUtc(sinceUtc).ToString( + "o", System.Globalization.CultureInfo.InvariantCulture)); + cmd.Parameters.AddWithValue("$limit", batchSize); + + var rows = new List(Math.Min(batchSize, 256)); + using var reader = cmd.ExecuteReader(); + while (reader.Read()) + { + rows.Add(MapRow(reader)); + } + + return Task.FromResult>(rows); + } + } + + /// + /// M6 reconciliation-pull commit: flips the supplied EventIds to + /// , but ONLY for rows currently in + /// or . + /// Rows already in are left untouched + /// (idempotent re-call). Non-existent ids are silent no-ops. + /// + public Task MarkReconciledAsync(IReadOnlyList eventIds, CancellationToken ct = default) + { + ArgumentNullException.ThrowIfNull(eventIds); + if (eventIds.Count == 0) + { + return Task.CompletedTask; + } + + lock (_writeLock) + { + ObjectDisposedException.ThrowIf(_disposed, this); + + using var cmd = _connection.CreateCommand(); + var sb = new System.Text.StringBuilder(); + sb.Append("UPDATE AuditLog SET ForwardState = $reconciled ") + .Append("WHERE ForwardState IN ($pending, $forwarded) AND EventId IN ("); + for (int i = 0; i < eventIds.Count; i++) + { + if (i > 0) sb.Append(','); + var p = $"$id{i}"; + sb.Append(p); + cmd.Parameters.AddWithValue(p, eventIds[i].ToString()); + } + sb.Append(");"); + cmd.CommandText = sb.ToString(); + cmd.Parameters.AddWithValue("$reconciled", AuditForwardState.Reconciled.ToString()); + cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString()); + cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString()); + + cmd.ExecuteNonQuery(); + return Task.CompletedTask; + } + } + + /// + /// M6 Bundle E (T6) health-metric surface: returns a point-in-time snapshot + /// of the site queue's pending count, the oldest pending row's + /// , and the on-disk file size. Called + /// by the site-side SiteAuditBacklogReporter hosted service on its + /// 30 s tick to refresh the SiteHealthReport.SiteAuditBacklog field. + /// + /// + /// The pending-count + oldest-row queries run inside the same write lock as + /// the hot-path INSERT batch so the snapshot is consistent against the + /// connection's view (no torn read of an in-flight transaction). The on-disk + /// size lookup happens OUTSIDE the lock — it's a stat() call on the file + /// path and doesn't touch the connection. In-memory and missing files + /// return 0 bytes (the snapshot is for ops dashboards, not a correctness + /// invariant). + /// + public Task GetBacklogStatsAsync(CancellationToken ct = default) + { + int pendingCount; + DateTime? oldestPending; + + lock (_writeLock) + { + ObjectDisposedException.ThrowIf(_disposed, this); + + // Single round-trip — COUNT(*) + MIN(OccurredAtUtc) over the same + // index range avoids a second scan. The IX_SiteAuditLog_ForwardState_Occurred + // index makes both aggregates cheap (count is a covering scan, min + // is the first key). + using var cmd = _connection.CreateCommand(); + cmd.CommandText = """ + SELECT COUNT(*), MIN(OccurredAtUtc) + FROM AuditLog + WHERE ForwardState = $pending; + """; + cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString()); + + using var reader = cmd.ExecuteReader(); + reader.Read(); + pendingCount = reader.GetInt32(0); + oldestPending = reader.IsDBNull(1) + ? null + : DateTime.Parse(reader.GetString(1), + System.Globalization.CultureInfo.InvariantCulture, + System.Globalization.DateTimeStyles.RoundtripKind); + } + + // File-size lookup outside the lock — the DatabasePath option is the + // canonical source. The connection-string-override branch (used by + // some tests) keeps the same DatabasePath value, so this works + // uniformly. In-memory / mode=memory paths return 0 because the file + // doesn't exist on disk. + long onDiskBytes = 0; + try + { + if (!string.IsNullOrEmpty(_options.DatabasePath) && + !_options.DatabasePath.StartsWith(":memory:", StringComparison.Ordinal) && + !_options.DatabasePath.Contains("mode=memory", StringComparison.OrdinalIgnoreCase) && + File.Exists(_options.DatabasePath)) + { + onDiskBytes = new FileInfo(_options.DatabasePath).Length; + } + } + catch (Exception ex) + { + // File system probe is a best-effort health-metric — never abort + // a backlog snapshot because stat() failed. Log and report 0. + _logger.LogDebug(ex, + "SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.", + _options.DatabasePath); + } + + return Task.FromResult(new SiteAuditBacklogSnapshot( + PendingCount: pendingCount, + OldestPendingUtc: oldestPending, + OnDiskBytes: onDiskBytes)); + } + + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc + ? value + : DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc); + private static AuditEvent MapRow(SqliteDataReader reader) { return new AuditEvent diff --git a/src/ScadaLink.AuditLog/Site/Telemetry/ISiteAuditQueue.cs b/src/ScadaLink.AuditLog/Site/Telemetry/ISiteAuditQueue.cs deleted file mode 100644 index 9da55b5..0000000 --- a/src/ScadaLink.AuditLog/Site/Telemetry/ISiteAuditQueue.cs +++ /dev/null @@ -1,34 +0,0 @@ -using ScadaLink.Commons.Entities.Audit; - -namespace ScadaLink.AuditLog.Site.Telemetry; - -/// -/// Site-local audit-log queue surface consumed by . -/// Extracted from so the telemetry actor can be -/// unit-tested against a stub without touching SQLite. -/// implements this interface; production wiring injects the same instance. -/// -/// -/// Only the two methods the drain loop needs are exposed — the hot-path -/// WriteAsync stays on -/// (script-thread surface), separated by concern from the -/// telemetry-actor surface so each side can be mocked independently. -/// -public interface ISiteAuditQueue -{ - /// - /// Returns up to rows currently in - /// , - /// oldest first. Idempotent — repeated calls before - /// will yield the same rows again. - /// - Task> ReadPendingAsync(int limit, CancellationToken ct = default); - - /// - /// Flips the supplied EventIds from - /// to - /// . - /// Non-existent or already-forwarded ids are silent no-ops. - /// - Task MarkForwardedAsync(IReadOnlyList eventIds, CancellationToken ct = default); -} diff --git a/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs b/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs index a820cf5..724e1d1 100644 --- a/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs +++ b/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs @@ -3,6 +3,7 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.AuditLog.Telemetry; using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Communication.Grpc; namespace ScadaLink.AuditLog.Site.Telemetry; diff --git a/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs b/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs new file mode 100644 index 0000000..b8b3ec5 --- /dev/null +++ b/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs @@ -0,0 +1,48 @@ +namespace ScadaLink.Commons.Interfaces; + +/// +/// Abstraction over the central AuditLog partition-function roll-forward +/// operation. M6-T5 introduces a daily-cadence hosted service +/// (AuditLogPartitionMaintenanceService) that calls +/// to make sure +/// pf_AuditLog_Month always has at least LookaheadMonths of +/// future boundaries available — otherwise inserts past the highest +/// boundary land in a single ever-growing tail partition that +/// SwitchOutPartitionAsync cannot purge cleanly. +/// +/// +/// +/// The interface lives in ScadaLink.Commons so the central hosted +/// service in ScadaLink.AuditLog can depend on it without taking a +/// reference on ScadaLink.ConfigurationDatabase; the EF-based +/// implementation ships in +/// ScadaLink.ConfigurationDatabase.Maintenance.AuditLogPartitionMaintenance +/// and is registered by AddConfigurationDatabase. +/// +/// +/// Both methods read sys.partition_range_values / mutate +/// pf_AuditLog_Month via raw SQL — there is no EF model for a +/// partition function. The interface deliberately exposes only the two +/// operations the hosted service needs; it is not a general partition-DDL +/// surface. +/// +/// +public interface IPartitionMaintenance +{ + /// + /// Splits new monthly boundaries on pf_AuditLog_Month so the + /// function covers at least future + /// months relative to . Idempotent — a + /// boundary that already exists is skipped rather than re-issued. + /// Returns the boundaries actually added, in chronological order. + /// + Task> EnsureLookaheadAsync(int lookaheadMonths, CancellationToken ct = default); + + /// + /// Reads the current maximum boundary value from + /// sys.partition_range_values for pf_AuditLog_Month. + /// Returns null when the partition function does not exist or + /// has no boundaries. + /// + Task GetMaxBoundaryAsync(CancellationToken ct = default); +} diff --git a/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs b/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs index 7b15962..bcda482 100644 --- a/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs +++ b/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs @@ -45,12 +45,46 @@ public interface IAuditLogRepository /// /// Switches out (purges) the monthly partition whose lower bound is - /// . The honest M1 implementation throws - /// : the UX_AuditLog_EventId unique - /// index is non-partition-aligned (lives on [PRIMARY], not on - /// ps_AuditLog_Month), so SQL Server rejects - /// ALTER TABLE … SWITCH PARTITION until the drop-and-rebuild dance - /// shipped by the M6 purge actor is in place. + /// and returns the approximate number + /// of rows discarded — sampled inside the transaction BEFORE the switch + /// so the row count reflects what the switch removed, not a post-purge + /// scan of a table that no longer exists. /// - Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default); + /// + /// + /// Drop-and-rebuild dance. UX_AuditLog_EventId is intentionally + /// non-partition-aligned (it lives on [PRIMARY] so single-column + /// EventId uniqueness — required by — + /// can be enforced cheaply). SQL Server rejects + /// ALTER TABLE … SWITCH PARTITION while a non-aligned unique index + /// is present, so the M6 implementation drops the index, creates a staging + /// table with byte-identical schema, switches the partition's data into + /// staging, drops staging (discarding the rows), and rebuilds the unique + /// index. The CATCH branch guarantees the index is rebuilt even on partial + /// failure so the table never returns to live traffic without its + /// idempotency-supporting index. + /// + /// + /// Outage window. The dance briefly removes the unique index, so + /// concurrent calls during the switch + /// could in principle race past the IF NOT EXISTS check without the index + /// catching the duplicate. This is acceptable for the daily purge cadence + /// — the inserts that the IF NOT EXISTS check guards are themselves rare + /// enough that a sub-second collision window is operationally negligible, + /// and the composite PK still rejects same-(EventId, OccurredAtUtc) rows. + /// + /// + Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default); + + /// + /// Returns the set of pf_AuditLog_Month partition lower-bound + /// boundaries whose partitions contain only rows with + /// strictly older than + /// . Boundaries whose partition is empty are + /// excluded (a no-op switch is wasted work). Used by the M6 purge actor + /// to enumerate retention-eligible months on every tick. + /// + Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, + CancellationToken ct = default); } diff --git a/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs b/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs new file mode 100644 index 0000000..c9e0462 --- /dev/null +++ b/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs @@ -0,0 +1,87 @@ +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Types; + +namespace ScadaLink.Commons.Interfaces.Services; + +/// +/// Site-local audit-log queue surface consumed by the site +/// SiteAuditTelemetryActor drain loop and the M6 +/// SiteStreamGrpcServer.PullAuditEvents reconciliation handler. +/// Extracted from SqliteAuditWriter so both consumers can be +/// unit-tested against a stub without touching SQLite; the +/// SqliteAuditWriter production type implements this interface +/// and DI wires the same singleton instance to every consumer. +/// +/// +/// Lives in Commons (rather than alongside SqliteAuditWriter in +/// ScadaLink.AuditLog) because ScadaLink.Communication — which +/// hosts the M6 gRPC pull handler — must depend on this interface and +/// ScadaLink.AuditLog already depends on ScadaLink.Communication. +/// Pulling the interface up to Commons breaks the would-be cycle while +/// keeping the implementation in the AuditLog component. +/// +/// Only the methods the drain and pull paths need are exposed — the +/// hot-path WriteAsync stays on +/// (script-thread surface), separated by concern so each side can be +/// mocked independently. +/// +public interface ISiteAuditQueue +{ + /// + /// Returns up to rows currently in + /// , + /// oldest first. Idempotent — repeated calls before + /// will yield the same rows again. + /// + Task> ReadPendingAsync(int limit, CancellationToken ct = default); + + /// + /// Flips the supplied EventIds from + /// to + /// . + /// Non-existent or already-forwarded ids are silent no-ops. + /// + Task MarkForwardedAsync(IReadOnlyList eventIds, CancellationToken ct = default); + + /// + /// M6 reconciliation-pull read surface: returns up to + /// rows whose >= + /// and whose is still + /// or + /// . + /// + /// + /// Rows in the brief race window between site-Forwarded and central-ingest are + /// intentionally included: the central reconciliation puller dedups on + /// , so re-shipping is safe and avoids losing rows + /// whose telemetry ack was acted on locally but never landed centrally. Ordering + /// is oldest first with + /// as the deterministic tiebreaker. + /// + Task> ReadPendingSinceAsync( + DateTime sinceUtc, int batchSize, CancellationToken ct = default); + + /// + /// M6 reconciliation-pull commit surface: flips the supplied EventIds to + /// , + /// but ONLY for rows currently in + /// or + /// . + /// Rows already in + /// are left untouched (idempotent re-call). Non-existent ids are silent no-ops. + /// + Task MarkReconciledAsync(IReadOnlyList eventIds, CancellationToken ct = default); + + /// + /// M6 Bundle E (T6) health-metric surface: returns a point-in-time snapshot + /// of the site queue's pending count + oldest pending timestamp + on-disk + /// SQLite file size. Surfaced on + /// as + /// SiteAuditBacklog by the periodic SiteAuditBacklogReporter + /// hosted service so a stuck site→central drain is visible on the central + /// health dashboard. Safe to call concurrently with hot-path writes — + /// implementations are expected to take the same connection lock used by + /// the hot-path INSERT batch and the drain queries. + /// + Task GetBacklogStatsAsync(CancellationToken ct = default); +} diff --git a/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs b/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs index bba4c8d..5567037 100644 --- a/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs +++ b/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs @@ -1,3 +1,4 @@ +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.Commons.Messages.Health; @@ -32,7 +33,14 @@ public record SiteHealthReport( // marker). Surfaces a misconfigured / catastrophic regex on // /monitoring/health. Defaults to 0 for back-compat with existing // producers and tests that don't construct the field. - int AuditRedactionFailure = 0); + int AuditRedactionFailure = 0, + // Audit Log (#23) M6 Bundle E (T6): point-in-time snapshot of the + // site-local SQLite audit-log queue (pending count, oldest pending row, + // on-disk bytes). Populated by the site-side SiteAuditBacklogReporter + // hosted service every 30 s. Defaults to null so existing producers / + // tests that don't refresh the snapshot stay valid; the central health + // surface treats null as "no data yet" rather than a zeroed queue. + SiteAuditBacklogSnapshot? SiteAuditBacklog = null); /// /// Broadcast wrapper used between central nodes to keep per-node diff --git a/src/ScadaLink.Commons/Types/SiteAuditBacklogSnapshot.cs b/src/ScadaLink.Commons/Types/SiteAuditBacklogSnapshot.cs new file mode 100644 index 0000000..687a743 --- /dev/null +++ b/src/ScadaLink.Commons/Types/SiteAuditBacklogSnapshot.cs @@ -0,0 +1,32 @@ +namespace ScadaLink.Commons.Types; + +/// +/// Audit Log (#23) M6 Bundle E (T6) — point-in-time snapshot of the site-local +/// SQLite audit-log queue health, surfaced on +/// as +/// SiteAuditBacklog and refreshed periodically by the +/// SiteAuditBacklogReporter hosted service. +/// +/// +/// Number of rows currently in +/// — i.e. +/// not yet acknowledged by central via either the push-telemetry or +/// reconciliation-pull paths. A persistently non-zero value with rising +/// indicates the site→central drain isn't +/// keeping up. +/// +/// +/// of +/// the oldest Pending row, or null if the queue is empty. Used by ops +/// to compute backlog age without a separate query. +/// +/// +/// Size of the SQLite file on disk in bytes, or 0 if the writer is +/// running against an in-memory database. Mirrors the 7-day retention +/// invariant (alog.md §10) — a steady file-size growth past the retention +/// window points at a stuck purge or a stuck forwarder. +/// +public sealed record SiteAuditBacklogSnapshot( + int PendingCount, + DateTime? OldestPendingUtc, + long OnDiskBytes); diff --git a/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs b/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs index 1da14ec..8a92027 100644 --- a/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs +++ b/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs @@ -5,6 +5,7 @@ using Grpc.Core; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Messages.Audit; using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; @@ -36,6 +37,13 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase // calls are sub-100 ms in steady state; a generous timeout absorbs a slow // MSSQL connection without surfacing as a gRPC failure on a healthy site. private static readonly TimeSpan AuditIngestAskTimeout = TimeSpan.FromSeconds(30); + // Audit Log (#23 M6): site-local queue handed in by AkkaHostedService on + // site roles so the central reconciliation puller's PullAuditEvents RPC + // can read Pending/Forwarded rows. Null when not wired (e.g. central-only + // host or test composing the server in isolation) — the handler treats + // the missing queue as "nothing to ship" and returns an empty response so + // central retries on its next reconciliation cycle. + private ISiteAuditQueue? _siteAuditQueue; /// /// Test-only constructor — kept internal so the DI container sees a @@ -102,6 +110,20 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase _auditIngestActor = proxy; } + /// + /// Hands the site-local (the same + /// SqliteAuditWriter singleton that backs + /// on the script thread) to the gRPC server so the M6 + /// RPC can serve central's reconciliation + /// pulls. Mirrors : wired post-construction + /// because the queue and the gRPC server are both DI singletons brought up + /// in independent orders on site startup. + /// + public void SetSiteAuditQueue(ISiteAuditQueue queue) + { + _siteAuditQueue = queue; + } + /// /// Number of currently active streaming subscriptions. Exposed for diagnostics. /// @@ -361,6 +383,144 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase return ack; } + /// + /// Audit Log (#23) M6 reconciliation pull RPC. Central asks the site for any + /// AuditLog rows whose OccurredAtUtc >= since_utc and whose + /// ForwardState is still Pending or Forwarded (i.e. not + /// yet confirmed reconciled), bounded by batch_size. The site responds + /// with the rows AND flips them to + /// + /// AFTER serializing the response. The flip is best-effort — if it fails + /// (e.g. SQLite disposed mid-call), rows stay Pending/Forwarded and central + /// pulls them again on the next reconciliation cycle. Idempotent. + /// + /// + /// When is not wired (central-only host or a + /// composition-root test exercising the server in isolation) the RPC returns + /// an empty response — central treats that as "nothing to ship" and retries + /// on its next cycle, which is the same self-healing semantics as the + /// SetAuditIngestActor wiring race window. + /// + public override async Task PullAuditEvents( + PullAuditEventsRequest request, + ServerCallContext context) + { + var queue = _siteAuditQueue; + if (queue is null) + { + _logger.LogWarning( + "PullAuditEvents invoked before SetSiteAuditQueue was called; returning empty response."); + return new PullAuditEventsResponse(); + } + + if (request.BatchSize <= 0) + { + // Mirrors the SubscribeInstance guard: reject malformed requests + // cleanly with InvalidArgument so the caller doesn't see a generic + // RpcException from the underlying SQLite parameter validation. + throw new RpcException(new GrpcStatus( + StatusCode.InvalidArgument, "batch_size must be > 0")); + } + + // sinceUtc defaults to DateTime.MinValue when the wrapper is absent — + // i.e. "pull from the beginning of recorded history", which is the + // intended behaviour for the very first reconciliation cycle. + var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue; + + IReadOnlyList events; + try + { + events = await queue.ReadPendingSinceAsync( + since, request.BatchSize, context.CancellationToken); + } + catch (Exception ex) + { + _logger.LogError(ex, + "ReadPendingSinceAsync failed for since={Since} batch={Batch}; returning empty response.", + since, request.BatchSize); + return new PullAuditEventsResponse(); + } + + var response = new PullAuditEventsResponse + { + // batch_size saturated → tell central to issue a follow-up pull + // with an advanced cursor. The site doesn't compute the cursor — + // central walks it forward from the last returned OccurredAtUtc. + MoreAvailable = events.Count >= request.BatchSize, + }; + foreach (var evt in events) + { + response.Events.Add(AuditEventToDto(evt)); + } + + // Flip to Reconciled AFTER projecting the response so a fault below the + // try/catch (mid-response, mid-flip) leaves the rows in Pending/Forwarded + // and central pulls them again next cycle. The flip itself is + // best-effort — its failure is a warning, not a fault, because central + // will dedup on EventId on the next pull. + var ids = new List(events.Count); + foreach (var evt in events) + { + ids.Add(evt.EventId); + } + + if (ids.Count > 0) + { + try + { + await queue.MarkReconciledAsync(ids, context.CancellationToken); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "MarkReconciledAsync failed after PullAuditEvents response of {Count} rows; rows stay Pending for retry.", + ids.Count); + } + } + + return response; + } + + /// + /// Inlined audit-event entity→DTO translation. Keep in sync with + /// AuditEventMapper.ToDto in ScadaLink.AuditLog.Telemetry — + /// the project-reference cycle (AuditLog → Communication) prevents calling + /// the AuditLog mapper directly. The shape mirrors the FromDto pair above. + /// + private static AuditEventDto AuditEventToDto(AuditEvent evt) + { + var dto = new AuditEventDto + { + EventId = evt.EventId.ToString(), + OccurredAtUtc = Google.Protobuf.WellKnownTypes.Timestamp.FromDateTime(EnsureUtc(evt.OccurredAtUtc)), + Channel = evt.Channel.ToString(), + Kind = evt.Kind.ToString(), + CorrelationId = evt.CorrelationId?.ToString() ?? string.Empty, + SourceSiteId = evt.SourceSiteId ?? string.Empty, + SourceInstanceId = evt.SourceInstanceId ?? string.Empty, + SourceScript = evt.SourceScript ?? string.Empty, + Actor = evt.Actor ?? string.Empty, + Target = evt.Target ?? string.Empty, + Status = evt.Status.ToString(), + ErrorMessage = evt.ErrorMessage ?? string.Empty, + ErrorDetail = evt.ErrorDetail ?? string.Empty, + RequestSummary = evt.RequestSummary ?? string.Empty, + ResponseSummary = evt.ResponseSummary ?? string.Empty, + PayloadTruncated = evt.PayloadTruncated, + Extra = evt.Extra ?? string.Empty, + }; + + if (evt.HttpStatus.HasValue) dto.HttpStatus = evt.HttpStatus.Value; + if (evt.DurationMs.HasValue) dto.DurationMs = evt.DurationMs.Value; + + return dto; + } + + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc + ? value + : DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc); + private static string? NullIfEmpty(string? value) => string.IsNullOrEmpty(value) ? null : value; diff --git a/src/ScadaLink.Communication/Protos/sitestream.proto b/src/ScadaLink.Communication/Protos/sitestream.proto index 43ffbe3..5ceb709 100644 --- a/src/ScadaLink.Communication/Protos/sitestream.proto +++ b/src/ScadaLink.Communication/Protos/sitestream.proto @@ -9,6 +9,7 @@ service SiteStreamService { rpc SubscribeInstance(InstanceStreamRequest) returns (stream SiteStreamEvent); rpc IngestAuditEvents(AuditEventBatch) returns (IngestAck); rpc IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck); + rpc PullAuditEvents(PullAuditEventsRequest) returns (PullAuditEventsResponse); } message InstanceStreamRequest { @@ -119,3 +120,19 @@ message CachedTelemetryPacket { } message CachedTelemetryBatch { repeated CachedTelemetryPacket packets = 1; } + +// Audit Log (#23) M6 reconciliation pull: central→site request for any +// site-local AuditLog rows with OccurredAtUtc >= since_utc that have not yet +// been ingested centrally (ForwardState in {Pending, Forwarded}). The site +// flips returned rows to Reconciled after the response is on the wire. +// more_available signals batch_size was saturated so the caller knows to +// issue a follow-up pull with an advanced since_utc cursor. +message PullAuditEventsRequest { + google.protobuf.Timestamp since_utc = 1; + int32 batch_size = 2; +} + +message PullAuditEventsResponse { + repeated AuditEventDto events = 1; + bool more_available = 2; +} diff --git a/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs b/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs index 9639242..ccac2bb 100644 --- a/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs +++ b/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs @@ -68,21 +68,27 @@ namespace ScadaLink.Communication.Grpc { "bnREdG8SNwoLb3BlcmF0aW9uYWwYAiABKAsyIi5zaXRlc3RyZWFtLlNpdGVD", "YWxsT3BlcmF0aW9uYWxEdG8iSgoUQ2FjaGVkVGVsZW1ldHJ5QmF0Y2gSMgoH", "cGFja2V0cxgBIAMoCzIhLnNpdGVzdHJlYW0uQ2FjaGVkVGVsZW1ldHJ5UGFj", - "a2V0KlwKB1F1YWxpdHkSFwoTUVVBTElUWV9VTlNQRUNJRklFRBAAEhAKDFFV", - "QUxJVFlfR09PRBABEhUKEVFVQUxJVFlfVU5DRVJUQUlOEAISDwoLUVVBTElU", - "WV9CQUQQAypdCg5BbGFybVN0YXRlRW51bRIbChdBTEFSTV9TVEFURV9VTlNQ", - "RUNJRklFRBAAEhYKEkFMQVJNX1NUQVRFX05PUk1BTBABEhYKEkFMQVJNX1NU", - "QVRFX0FDVElWRRACKoUBCg5BbGFybUxldmVsRW51bRIUChBBTEFSTV9MRVZF", - "TF9OT05FEAASEwoPQUxBUk1fTEVWRUxfTE9XEAESFwoTQUxBUk1fTEVWRUxf", - "TE9XX0xPVxACEhQKEEFMQVJNX0xFVkVMX0hJR0gQAxIZChVBTEFSTV9MRVZF", - "TF9ISUdIX0hJR0gQBDKFAgoRU2l0ZVN0cmVhbVNlcnZpY2USVQoRU3Vic2Ny", - "aWJlSW5zdGFuY2USIS5zaXRlc3RyZWFtLkluc3RhbmNlU3RyZWFtUmVxdWVz", - "dBobLnNpdGVzdHJlYW0uU2l0ZVN0cmVhbUV2ZW50MAESRwoRSW5nZXN0QXVk", - "aXRFdmVudHMSGy5zaXRlc3RyZWFtLkF1ZGl0RXZlbnRCYXRjaBoVLnNpdGVz", - "dHJlYW0uSW5nZXN0QWNrElAKFUluZ2VzdENhY2hlZFRlbGVtZXRyeRIgLnNp", - "dGVzdHJlYW0uQ2FjaGVkVGVsZW1ldHJ5QmF0Y2gaFS5zaXRlc3RyZWFtLklu", - "Z2VzdEFja0IfqgIcU2NhZGFMaW5rLkNvbW11bmljYXRpb24uR3JwY2IGcHJv", - "dG8z")); + "a2V0IlsKFlB1bGxBdWRpdEV2ZW50c1JlcXVlc3QSLQoJc2luY2VfdXRjGAEg", + "ASgLMhouZ29vZ2xlLnByb3RvYnVmLlRpbWVzdGFtcBISCgpiYXRjaF9zaXpl", + "GAIgASgFIlwKF1B1bGxBdWRpdEV2ZW50c1Jlc3BvbnNlEikKBmV2ZW50cxgB", + "IAMoCzIZLnNpdGVzdHJlYW0uQXVkaXRFdmVudER0bxIWCg5tb3JlX2F2YWls", + "YWJsZRgCIAEoCCpcCgdRdWFsaXR5EhcKE1FVQUxJVFlfVU5TUEVDSUZJRUQQ", + "ABIQCgxRVUFMSVRZX0dPT0QQARIVChFRVUFMSVRZX1VOQ0VSVEFJThACEg8K", + "C1FVQUxJVFlfQkFEEAMqXQoOQWxhcm1TdGF0ZUVudW0SGwoXQUxBUk1fU1RB", + "VEVfVU5TUEVDSUZJRUQQABIWChJBTEFSTV9TVEFURV9OT1JNQUwQARIWChJB", + "TEFSTV9TVEFURV9BQ1RJVkUQAiqFAQoOQWxhcm1MZXZlbEVudW0SFAoQQUxB", + "Uk1fTEVWRUxfTk9ORRAAEhMKD0FMQVJNX0xFVkVMX0xPVxABEhcKE0FMQVJN", + "X0xFVkVMX0xPV19MT1cQAhIUChBBTEFSTV9MRVZFTF9ISUdIEAMSGQoVQUxB", + "Uk1fTEVWRUxfSElHSF9ISUdIEAQy4QIKEVNpdGVTdHJlYW1TZXJ2aWNlElUK", + "EVN1YnNjcmliZUluc3RhbmNlEiEuc2l0ZXN0cmVhbS5JbnN0YW5jZVN0cmVh", + "bVJlcXVlc3QaGy5zaXRlc3RyZWFtLlNpdGVTdHJlYW1FdmVudDABEkcKEUlu", + "Z2VzdEF1ZGl0RXZlbnRzEhsuc2l0ZXN0cmVhbS5BdWRpdEV2ZW50QmF0Y2ga", + "FS5zaXRlc3RyZWFtLkluZ2VzdEFjaxJQChVJbmdlc3RDYWNoZWRUZWxlbWV0", + "cnkSIC5zaXRlc3RyZWFtLkNhY2hlZFRlbGVtZXRyeUJhdGNoGhUuc2l0ZXN0", + "cmVhbS5Jbmdlc3RBY2sSWgoPUHVsbEF1ZGl0RXZlbnRzEiIuc2l0ZXN0cmVh", + "bS5QdWxsQXVkaXRFdmVudHNSZXF1ZXN0GiMuc2l0ZXN0cmVhbS5QdWxsQXVk", + "aXRFdmVudHNSZXNwb25zZUIfqgIcU2NhZGFMaW5rLkNvbW11bmljYXRpb24u", + "R3JwY2IGcHJvdG8z")); descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData, new pbr::FileDescriptor[] { global::Google.Protobuf.WellKnownTypes.TimestampReflection.Descriptor, global::Google.Protobuf.WellKnownTypes.WrappersReflection.Descriptor, }, new pbr::GeneratedClrTypeInfo(new[] {typeof(global::ScadaLink.Communication.Grpc.Quality), typeof(global::ScadaLink.Communication.Grpc.AlarmStateEnum), typeof(global::ScadaLink.Communication.Grpc.AlarmLevelEnum), }, null, new pbr::GeneratedClrTypeInfo[] { @@ -95,7 +101,9 @@ namespace ScadaLink.Communication.Grpc { new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.IngestAck), global::ScadaLink.Communication.Grpc.IngestAck.Parser, new[]{ "AcceptedEventIds" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.SiteCallOperationalDto), global::ScadaLink.Communication.Grpc.SiteCallOperationalDto.Parser, new[]{ "TrackedOperationId", "Channel", "Target", "SourceSite", "Status", "RetryCount", "LastError", "HttpStatus", "CreatedAtUtc", "UpdatedAtUtc", "TerminalAtUtc" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.CachedTelemetryPacket), global::ScadaLink.Communication.Grpc.CachedTelemetryPacket.Parser, new[]{ "AuditEvent", "Operational" }, null, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.CachedTelemetryBatch), global::ScadaLink.Communication.Grpc.CachedTelemetryBatch.Parser, new[]{ "Packets" }, null, null, null, null) + new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.CachedTelemetryBatch), global::ScadaLink.Communication.Grpc.CachedTelemetryBatch.Parser, new[]{ "Packets" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest), global::ScadaLink.Communication.Grpc.PullAuditEventsRequest.Parser, new[]{ "SinceUtc", "BatchSize" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.PullAuditEventsResponse), global::ScadaLink.Communication.Grpc.PullAuditEventsResponse.Parser, new[]{ "Events", "MoreAvailable" }, null, null, null, null) })); } #endregion @@ -3862,6 +3870,482 @@ namespace ScadaLink.Communication.Grpc { } + /// + /// Audit Log (#23) M6 reconciliation pull: central→site request for any + /// site-local AuditLog rows with OccurredAtUtc >= since_utc that have not yet + /// been ingested centrally (ForwardState in {Pending, Forwarded}). The site + /// flips returned rows to Reconciled after the response is on the wire. + /// more_available signals batch_size was saturated so the caller knows to + /// issue a follow-up pull with an advanced since_utc cursor. + /// + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullAuditEventsRequest : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullAuditEventsRequest()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ScadaLink.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[10]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsRequest() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsRequest(PullAuditEventsRequest other) : this() { + sinceUtc_ = other.sinceUtc_ != null ? other.sinceUtc_.Clone() : null; + batchSize_ = other.batchSize_; + _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsRequest Clone() { + return new PullAuditEventsRequest(this); + } + + /// Field number for the "since_utc" field. + public const int SinceUtcFieldNumber = 1; + private global::Google.Protobuf.WellKnownTypes.Timestamp sinceUtc_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public global::Google.Protobuf.WellKnownTypes.Timestamp SinceUtc { + get { return sinceUtc_; } + set { + sinceUtc_ = value; + } + } + + /// Field number for the "batch_size" field. + public const int BatchSizeFieldNumber = 2; + private int batchSize_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int BatchSize { + get { return batchSize_; } + set { + batchSize_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullAuditEventsRequest); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullAuditEventsRequest other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if (!object.Equals(SinceUtc, other.SinceUtc)) return false; + if (BatchSize != other.BatchSize) return false; + return Equals(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + if (sinceUtc_ != null) hash ^= SinceUtc.GetHashCode(); + if (BatchSize != 0) hash ^= BatchSize.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + if (sinceUtc_ != null) { + size += 1 + pb::CodedOutputStream.ComputeMessageSize(SinceUtc); + } + if (BatchSize != 0) { + size += 1 + pb::CodedOutputStream.ComputeInt32Size(BatchSize); + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullAuditEventsRequest other) { + if (other == null) { + return; + } + if (other.sinceUtc_ != null) { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + SinceUtc.MergeFrom(other.SinceUtc); + } + if (other.BatchSize != 0) { + BatchSize = other.BatchSize; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + } + #endif + + } + + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullAuditEventsResponse : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullAuditEventsResponse()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ScadaLink.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[11]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsResponse() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsResponse(PullAuditEventsResponse other) : this() { + events_ = other.events_.Clone(); + moreAvailable_ = other.moreAvailable_; + _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsResponse Clone() { + return new PullAuditEventsResponse(this); + } + + /// Field number for the "events" field. + public const int EventsFieldNumber = 1; + private static readonly pb::FieldCodec _repeated_events_codec + = pb::FieldCodec.ForMessage(10, global::ScadaLink.Communication.Grpc.AuditEventDto.Parser); + private readonly pbc::RepeatedField events_ = new pbc::RepeatedField(); + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public pbc::RepeatedField Events { + get { return events_; } + } + + /// Field number for the "more_available" field. + public const int MoreAvailableFieldNumber = 2; + private bool moreAvailable_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool MoreAvailable { + get { return moreAvailable_; } + set { + moreAvailable_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullAuditEventsResponse); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullAuditEventsResponse other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if(!events_.Equals(other.events_)) return false; + if (MoreAvailable != other.MoreAvailable) return false; + return Equals(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + hash ^= events_.GetHashCode(); + if (MoreAvailable != false) hash ^= MoreAvailable.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + events_.WriteTo(output, _repeated_events_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + events_.WriteTo(ref output, _repeated_events_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + size += events_.CalculateSize(_repeated_events_codec); + if (MoreAvailable != false) { + size += 1 + 1; + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullAuditEventsResponse other) { + if (other == null) { + return; + } + events_.Add(other.events_); + if (other.MoreAvailable != false) { + MoreAvailable = other.MoreAvailable; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + events_.AddEntriesFrom(input, _repeated_events_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + events_.AddEntriesFrom(ref input, _repeated_events_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + } + #endif + + } + #endregion } diff --git a/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs b/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs index e7b9b33..d5fd944 100644 --- a/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs +++ b/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs @@ -55,6 +55,10 @@ namespace ScadaLink.Communication.Grpc { static readonly grpc::Marshaller __Marshaller_sitestream_IngestAck = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.IngestAck.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Marshaller __Marshaller_sitestream_CachedTelemetryBatch = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.CachedTelemetryBatch.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsRequest = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.PullAuditEventsRequest.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsResponse = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.PullAuditEventsResponse.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Method __Method_SubscribeInstance = new grpc::Method( @@ -80,6 +84,14 @@ namespace ScadaLink.Communication.Grpc { __Marshaller_sitestream_CachedTelemetryBatch, __Marshaller_sitestream_IngestAck); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Method __Method_PullAuditEvents = new grpc::Method( + grpc::MethodType.Unary, + __ServiceName, + "PullAuditEvents", + __Marshaller_sitestream_PullAuditEventsRequest, + __Marshaller_sitestream_PullAuditEventsResponse); + /// Service descriptor public static global::Google.Protobuf.Reflection.ServiceDescriptor Descriptor { @@ -108,6 +120,12 @@ namespace ScadaLink.Communication.Grpc { throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::System.Threading.Tasks.Task PullAuditEvents(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::ServerCallContext context) + { + throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); + } + } /// Client for SiteStreamService @@ -187,6 +205,26 @@ namespace ScadaLink.Communication.Grpc { { return CallInvoker.AsyncUnaryCall(__Method_IngestCachedTelemetry, null, options, request); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ScadaLink.Communication.Grpc.PullAuditEventsResponse PullAuditEvents(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::Metadata headers = null, global::System.DateTime? deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullAuditEvents(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ScadaLink.Communication.Grpc.PullAuditEventsResponse PullAuditEvents(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::CallOptions options) + { + return CallInvoker.BlockingUnaryCall(__Method_PullAuditEvents, null, options, request); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullAuditEventsAsync(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::Metadata headers = null, global::System.DateTime? deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullAuditEventsAsync(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullAuditEventsAsync(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::CallOptions options) + { + return CallInvoker.AsyncUnaryCall(__Method_PullAuditEvents, null, options, request); + } /// Creates a new instance of client from given ClientBaseConfiguration. [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] protected override SiteStreamServiceClient NewInstance(ClientBaseConfiguration configuration) @@ -203,7 +241,8 @@ namespace ScadaLink.Communication.Grpc { return grpc::ServerServiceDefinition.CreateBuilder() .AddMethod(__Method_SubscribeInstance, serviceImpl.SubscribeInstance) .AddMethod(__Method_IngestAuditEvents, serviceImpl.IngestAuditEvents) - .AddMethod(__Method_IngestCachedTelemetry, serviceImpl.IngestCachedTelemetry).Build(); + .AddMethod(__Method_IngestCachedTelemetry, serviceImpl.IngestCachedTelemetry) + .AddMethod(__Method_PullAuditEvents, serviceImpl.PullAuditEvents).Build(); } /// Register service method with a service binder with or without implementation. Useful when customizing the service binding logic. @@ -216,6 +255,7 @@ namespace ScadaLink.Communication.Grpc { serviceBinder.AddMethod(__Method_SubscribeInstance, serviceImpl == null ? null : new grpc::ServerStreamingServerMethod(serviceImpl.SubscribeInstance)); serviceBinder.AddMethod(__Method_IngestAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestAuditEvents)); serviceBinder.AddMethod(__Method_IngestCachedTelemetry, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestCachedTelemetry)); + serviceBinder.AddMethod(__Method_PullAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.PullAuditEvents)); } } diff --git a/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs b/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs new file mode 100644 index 0000000..cdbd54b --- /dev/null +++ b/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs @@ -0,0 +1,218 @@ +using System.Globalization; +using Microsoft.Data.SqlClient; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.Commons.Interfaces; + +namespace ScadaLink.ConfigurationDatabase.Maintenance; + +/// +/// EF/SQL-Server implementation of that +/// rolls forward pf_AuditLog_Month by issuing +/// ALTER PARTITION FUNCTION … SPLIT RANGE for each missing future +/// monthly boundary. +/// +/// +/// +/// The class is scoped (registered alongside the other repositories in +/// AddConfigurationDatabase) because it shares +/// — the hosted service opens a per-tick DI scope, resolves a fresh instance, +/// and lets the scope's DbContext dispose with it. The class itself +/// holds no state between calls. +/// +/// +/// Idempotency model. Each tick reads the current max boundary from +/// sys.partition_range_values and only issues SPLIT RANGE for +/// boundaries that strictly follow it — a boundary already covered is never +/// re-issued, so the "boundary already exists" failure (SQL Server msg 7708 +/// / 7711) is avoided by construction rather than caught. The pre-check is +/// cheaper than the alternative TRY/CATCH around every SPLIT call and also +/// keeps the returned added list semantically precise. +/// +/// +/// Why "first of next month". The migration seeds boundaries on the +/// first-of-month at midnight UTC; we preserve that convention so the +/// resulting partition layout is uniform. +/// rounds an arbitrary timestamp up to the next first-of-month boundary +/// (e.g. 2026-05-20 → 2026-06-01), and +/// walks one month at a time from there. +/// +/// +/// Permissions. The migration's scadalink_audit_purger role +/// already carries ALTER ON SCHEMA::dbo, which is sufficient for +/// ALTER PARTITION FUNCTION SPLIT RANGE. No additional grant is +/// required. +/// +/// +public sealed class AuditLogPartitionMaintenance : IPartitionMaintenance +{ + private const string PartitionFunctionName = "pf_AuditLog_Month"; + private const string PartitionSchemeName = "ps_AuditLog_Month"; + private const string TargetFileGroup = "PRIMARY"; + + private readonly ScadaLinkDbContext _context; + private readonly ILogger _logger; + + public AuditLogPartitionMaintenance( + ScadaLinkDbContext context, + ILogger? logger = null) + { + _context = context ?? throw new ArgumentNullException(nameof(context)); + _logger = logger ?? NullLogger.Instance; + } + + /// + public async Task GetMaxBoundaryAsync(CancellationToken ct = default) + { + // CAST the sql_variant `value` column to datetime2(7) — every boundary in + // pf_AuditLog_Month is declared as datetime2(7) by the migration, so the + // cast never loses precision. + const string sql = @" +SELECT MAX(CAST(rv.value AS datetime2(7))) +FROM sys.partition_range_values rv +INNER JOIN sys.partition_functions pf ON rv.function_id = pf.function_id +WHERE pf.name = 'pf_AuditLog_Month';"; + + var conn = _context.Database.GetDbConnection(); + var openedHere = false; + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(ct).ConfigureAwait(false); + openedHere = true; + } + + try + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = sql; + var raw = await cmd.ExecuteScalarAsync(ct).ConfigureAwait(false); + if (raw is null || raw is DBNull) + { + return null; + } + + // ExecuteScalarAsync materialises datetime2 as DateTime with + // DateTimeKind.Unspecified; the boundary values are stored at + // UTC midnight by convention (migration seeds with 'T00:00:00'), + // so we re-tag the kind so downstream comparisons against + // DateTime.UtcNow stay in the same kind space. + var dt = (DateTime)raw; + return DateTime.SpecifyKind(dt, DateTimeKind.Utc); + } + finally + { + if (openedHere) + { + await conn.CloseAsync().ConfigureAwait(false); + } + } + } + + /// + public async Task> EnsureLookaheadAsync( + int lookaheadMonths, + CancellationToken ct = default) + { + if (lookaheadMonths < 1) + { + throw new ArgumentOutOfRangeException( + nameof(lookaheadMonths), + lookaheadMonths, + "Lookahead must be at least one month — the partition function would otherwise be allowed to fall behind 'now'."); + } + + var nowUtc = DateTime.UtcNow; + // Horizon: the FIRST-OF-MONTH that must be the strictly-greater-than + // max boundary after this call. Example: nowUtc = 2026-05-20 and + // lookaheadMonths = 1 → horizon = 2026-07-01 (so the partition for + // June 2026 is already in place by mid-May). + var horizon = NormalizeToFirstOfMonth(nowUtc).AddMonths(lookaheadMonths); + + var max = await GetMaxBoundaryAsync(ct).ConfigureAwait(false); + if (max is null) + { + // No partition function (e.g. migrations not applied) — nothing + // we can safely SPLIT against. Log and return; the absence is a + // genuine misconfiguration that other parts of the system will + // surface louder than we could here. + _logger.LogWarning( + "EnsureLookaheadAsync: partition function {PartitionFunctionName} not found; skipping.", + PartitionFunctionName); + return Array.Empty(); + } + + // Start splitting from the FIRST month strictly after max — if max is + // already first-of-month (the common case), that's max + 1 month; + // otherwise NormalizeToFirstOfMonth rounds up. + var next = NormalizeToFirstOfMonth(max.Value.AddDays(1)); + + // Edge case: max already past horizon → no work to do. + if (next > horizon) + { + return Array.Empty(); + } + + var added = new List(); + while (next <= horizon) + { + // Boundary literal must be a deterministic, culture-invariant ISO + // string — SQL Server parses it as datetime2 via implicit conversion. + // SPLIT RANGE does NOT accept @-parameters; the value is part of the + // DDL statement, so we render it directly. The format is + // guaranteed (yyyy-MM-ddTHH:mm:ss.fffffff) so there is no injection + // surface. + var literal = next.ToString("yyyy-MM-ddTHH:mm:ss.fffffff", CultureInfo.InvariantCulture); + + // Before every SPLIT we must (re-)set the NEXT USED filegroup on + // ps_AuditLog_Month. Even though the scheme was created with + // `ALL TO ([PRIMARY])` (which auto-populates NEXT USED once), SQL + // Server consumes that hint on the FIRST split — subsequent splits + // raise msg 7707 ("partition scheme … does not have any next used + // filegroup") unless NEXT USED is explicitly re-set. Re-issuing it + // before every split is idempotent and keeps the loop simple. + var sql = $@" +ALTER PARTITION SCHEME {PartitionSchemeName} NEXT USED [{TargetFileGroup}]; +ALTER PARTITION FUNCTION {PartitionFunctionName}() SPLIT RANGE ('{literal}');"; + + try + { + await _context.Database.ExecuteSqlRawAsync(sql, ct).ConfigureAwait(false); + added.Add(next); + } + catch (SqlException ex) + { + // Belt-and-braces: even though we read max-boundary first, an + // ALTER from another process could have raced us. Logging at + // Warning rather than Error because the desired end state + // (boundary present) is satisfied by either path. + _logger.LogWarning( + ex, + "EnsureLookaheadAsync: SPLIT RANGE for boundary {Boundary:o} failed; continuing.", + next); + } + + next = NextMonthBoundary(next); + } + + return added; + } + + /// + /// Rounds an arbitrary instant UP to the next first-of-month UTC. Inputs + /// that ARE already a first-of-month at midnight are returned as-is so + /// callers can compose this freely without double-incrementing. + /// + private static DateTime NormalizeToFirstOfMonth(DateTime instant) + { + var utc = instant.Kind == DateTimeKind.Utc + ? instant + : DateTime.SpecifyKind(instant, DateTimeKind.Utc); + + var firstOfThisMonth = new DateTime(utc.Year, utc.Month, 1, 0, 0, 0, DateTimeKind.Utc); + return utc == firstOfThisMonth ? firstOfThisMonth : firstOfThisMonth.AddMonths(1); + } + + private static DateTime NextMonthBoundary(DateTime boundary) => + boundary.AddMonths(1); +} diff --git a/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs b/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs index d88271f..d2d74ac 100644 --- a/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs +++ b/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs @@ -179,18 +179,246 @@ VALUES } /// - /// M1 honest contract: throws . The - /// UX_AuditLog_EventId unique index is non-aligned with - /// ps_AuditLog_Month (it lives on [PRIMARY] to keep - /// cheap), and SQL Server rejects - /// ALTER TABLE … SWITCH PARTITION when a non-aligned index is present. - /// The drop-and-rebuild dance that makes the switch legal ships with the M6 - /// purge actor. + /// M6-T4 production implementation of the drop-and-rebuild dance documented + /// on . /// - public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) + /// + /// + /// The staging table name is GUID-suffixed so concurrent purge attempts on + /// different boundaries cannot collide. The staging schema is byte-identical + /// to the live AuditLog table (same column types, lengths, + /// nullability, and clustered-key shape) — SQL Server's + /// ALTER TABLE … SWITCH PARTITION rejects any drift. Keep this CREATE + /// in sync with both the migration that ships the live table + /// (20260520142214_AddAuditLogTable) and + /// AuditLogEntityTypeConfiguration. + /// + /// + /// All five steps run inside an explicit transaction so the SWITCH + + /// staging-DROP are atomic from the perspective of a consumer reading via + /// snapshot isolation; the CATCH rolls back and runs an idempotent + /// "rebuild UX_AuditLog_EventId if it doesn't exist" so a partial failure + /// never leaves the live table without its idempotency-supporting unique + /// index. + /// + /// + public async Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) { - throw new NotSupportedException( - "AuditLog partition switch is blocked by the non-aligned UX_AuditLog_EventId " + - "unique index; the drop-and-rebuild dance ships in M6 (purge actor)."); + // GUID-suffixed staging name: prevents collision with any concurrent + // purge attempt and avoids polluting the AuditLog object namespace with + // a predictable identifier. + var stagingTableName = $"AuditLog_Staging_{Guid.NewGuid():N}"; + + // ISO 8601 in UTC — SQL Server's datetime2 literal parser accepts this + // unambiguously and the value is round-trip-safe across SET DATEFORMAT + // settings. + var monthBoundaryStr = monthBoundary.ToUniversalTime().ToString("yyyy-MM-dd HH:mm:ss"); + + // Two-statement batch: the first SELECT samples the per-partition row + // count BEFORE the dance so we can report it back to the purge actor; + // the second batch performs the drop-and-rebuild. We use OUTPUT-style + // variables wired through @@ROWCOUNT after the SWITCH is not viable + // because SWITCH is a metadata-only operation that doesn't move rows in + // a way @@ROWCOUNT can observe. + var sampleSql = $@" + SELECT COUNT_BIG(*) FROM dbo.AuditLog + WHERE $PARTITION.pf_AuditLog_Month(OccurredAtUtc) = + $partition.pf_AuditLog_Month('{monthBoundaryStr}');"; + + var sql = $@" + BEGIN TRY + BEGIN TRANSACTION; + + -- 1. Drop the non-aligned unique index. ALTER TABLE SWITCH refuses + -- to run while it exists. + IF EXISTS (SELECT 1 FROM sys.indexes WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog')) + DROP INDEX UX_AuditLog_EventId ON dbo.AuditLog; + + -- 2. Staging table on [PRIMARY] (non-partitioned) with column shapes + -- byte-identical to dbo.AuditLog. Any drift here causes SWITCH to + -- reject the operation with msg 4904/4915. + CREATE TABLE dbo.[{stagingTableName}] ( + EventId uniqueidentifier NOT NULL, + OccurredAtUtc datetime2(7) NOT NULL, + IngestedAtUtc datetime2(7) NULL, + Channel varchar(32) NOT NULL, + Kind varchar(32) NOT NULL, + CorrelationId uniqueidentifier NULL, + SourceSiteId varchar(64) NULL, + SourceInstanceId varchar(128) NULL, + SourceScript varchar(128) NULL, + Actor varchar(128) NULL, + Target varchar(256) NULL, + Status varchar(32) NOT NULL, + HttpStatus int NULL, + DurationMs int NULL, + ErrorMessage nvarchar(1024) NULL, + ErrorDetail nvarchar(max) NULL, + RequestSummary nvarchar(max) NULL, + ResponseSummary nvarchar(max) NULL, + PayloadTruncated bit NOT NULL, + Extra nvarchar(max) NULL, + ForwardState varchar(32) NULL, + CONSTRAINT PK_{stagingTableName} PRIMARY KEY CLUSTERED (EventId, OccurredAtUtc) + ) ON [PRIMARY]; + + -- 3. Switch the partition out. $partition.pf_AuditLog_Month returns + -- the partition number that contains the supplied boundary value; + -- SWITCH PARTITION N moves that partition's pages to the staging + -- table (metadata-only, no row copying). + DECLARE @partitionNumber int = $partition.pf_AuditLog_Month('{monthBoundaryStr}'); + DECLARE @sql nvarchar(max) = 'ALTER TABLE dbo.AuditLog SWITCH PARTITION ' + CAST(@partitionNumber AS nvarchar(10)) + ' TO dbo.[{stagingTableName}];'; + EXEC sp_executesql @sql; + + -- 4. Drop staging — the rows are discarded here. This is the purge. + DROP TABLE dbo.[{stagingTableName}]; + + -- 5. Rebuild the non-aligned unique index. Live traffic that hit the + -- table during steps 1-4 saw composite-PK uniqueness only; from + -- here on, single-column EventId uniqueness is restored. + CREATE UNIQUE NONCLUSTERED INDEX UX_AuditLog_EventId ON dbo.AuditLog (EventId) ON [PRIMARY]; + + COMMIT TRANSACTION; + END TRY + BEGIN CATCH + IF @@TRANCOUNT > 0 ROLLBACK TRANSACTION; + + -- Best-effort staging cleanup. The DROP INDEX in step 1 is now + -- rolled back (so the index is back), but the staging table from + -- step 2 may or may not survive the rollback depending on the + -- failure point. Guard the DROP so a missing staging table doesn't + -- mask the original error. + IF OBJECT_ID('dbo.[{stagingTableName}]', 'U') IS NOT NULL DROP TABLE dbo.[{stagingTableName}]; + + -- Idempotent index rebuild — covers the niche case where ROLLBACK + -- failed to restore UX_AuditLog_EventId (or the failure happened + -- AFTER the COMMIT, which shouldn't be possible inside this TRY + -- but is cheap insurance). Without this, a failed switch could + -- leave the live table without its idempotency-supporting index. + IF NOT EXISTS (SELECT 1 FROM sys.indexes WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog')) + CREATE UNIQUE NONCLUSTERED INDEX UX_AuditLog_EventId ON dbo.AuditLog (EventId) ON [PRIMARY]; + + -- Surface the original error to the caller — the purge actor logs + -- and continues with the next boundary. + THROW; + END CATCH;"; + + // Sample the row count before the switch. The sample is best-effort + // (no transaction wrapping the sample-then-switch pair) because the + // central singleton is the only writer to this RPC and a daily-purge + // tick doesn't compete with concurrent SwitchOut callers. A + // concurrent INSERT racing the sample under-reports by at most a + // few rows, which is acceptable for an "approximate" purged-row + // count surfaced via AuditLogPurgedEvent. + long rowsDeleted = 0; + var conn = _context.Database.GetDbConnection(); + var openedHere = false; + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(ct).ConfigureAwait(false); + openedHere = true; + } + try + { + await using (var sampleCmd = conn.CreateCommand()) + { + sampleCmd.CommandText = sampleSql; + var sampleResult = await sampleCmd.ExecuteScalarAsync(ct).ConfigureAwait(false); + if (sampleResult is not null && sampleResult is not DBNull) + { + rowsDeleted = Convert.ToInt64(sampleResult); + } + } + } + finally + { + if (openedHere) + { + await conn.CloseAsync().ConfigureAwait(false); + } + } + + await _context.Database.ExecuteSqlRawAsync(sql, ct); + return rowsDeleted; + } + + /// + /// Returns the set of pf_AuditLog_Month boundaries whose partition's + /// MAX(OccurredAtUtc) is strictly older than . + /// Boundaries with empty partitions are excluded — purging an empty + /// partition is wasted I/O. + /// + /// + /// + /// The CTE pulls every boundary value defined by the partition function and + /// joins it (via $PARTITION.pf_AuditLog_Month) to the live AuditLog + /// to compute per-partition MAX(OccurredAtUtc). The outer filter + /// keeps only those whose MAX is non-NULL (partition has rows) AND strictly + /// less than the threshold (every row is past retention). + /// + /// + /// Note: the query scans the live OccurredAtUtc column to compute + /// the MAX per partition. With IX_AuditLog_OccurredAtUtc on the + /// partition-aligned scheme this is a single index seek per partition; for + /// 24 partitions and a daily purge cadence the cost is negligible. + /// + /// + public async Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, + CancellationToken ct = default) + { + var thresholdUtc = threshold.ToUniversalTime(); + var thresholdStr = thresholdUtc.ToString("yyyy-MM-dd HH:mm:ss.fffffff"); + + // Per-partition MAX over the live table. We materialise the boundary + // list first (24 rows) then LEFT JOIN to the MAX aggregate so empty + // partitions surface as NULL and get filtered out by the WHERE clause. + var sql = $@" + WITH Boundaries AS ( + SELECT CAST(rv.value AS datetime2(7)) AS BoundaryValue, + rv.boundary_id AS BoundaryId + FROM sys.partition_range_values rv + INNER JOIN sys.partition_functions pf ON rv.function_id = pf.function_id + WHERE pf.name = 'pf_AuditLog_Month' + ) + SELECT b.BoundaryValue + FROM Boundaries b + CROSS APPLY ( + SELECT MAX(a.OccurredAtUtc) AS MaxOccurredAt + FROM dbo.AuditLog a + WHERE $PARTITION.pf_AuditLog_Month(a.OccurredAtUtc) = b.BoundaryId + 1 + ) x + WHERE x.MaxOccurredAt IS NOT NULL + AND x.MaxOccurredAt < CAST('{thresholdStr}' AS datetime2(7)) + ORDER BY b.BoundaryValue;"; + + var conn = _context.Database.GetDbConnection(); + var openedHere = false; + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(ct).ConfigureAwait(false); + openedHere = true; + } + + var results = new List(); + try + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = sql; + await using var reader = await cmd.ExecuteReaderAsync(ct).ConfigureAwait(false); + while (await reader.ReadAsync(ct).ConfigureAwait(false)) + { + results.Add(reader.GetDateTime(0)); + } + } + finally + { + if (openedHere) + { + await conn.CloseAsync().ConfigureAwait(false); + } + } + + return results; } } diff --git a/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs b/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs index bf79b29..d926f1e 100644 --- a/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs @@ -1,8 +1,10 @@ using Microsoft.AspNetCore.DataProtection; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.DependencyInjection; +using ScadaLink.Commons.Interfaces; using ScadaLink.Commons.Interfaces.Repositories; using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.ConfigurationDatabase.Maintenance; using ScadaLink.ConfigurationDatabase.Repositories; using ScadaLink.ConfigurationDatabase.Services; @@ -52,6 +54,13 @@ public static class ServiceCollectionExtensions services.AddScoped(); services.AddScoped(); + // #23 M6 Bundle D: IPartitionMaintenance drives the daily roll-forward + // of pf_AuditLog_Month from the central AuditLogPartitionMaintenanceService + // hosted service. Scoped because the implementation reuses the per-scope + // ScadaLinkDbContext for raw-SQL execution; the hosted service opens a + // fresh scope on each tick (mirrors AuditLogPurgeActor / AuditLogIngestActor). + services.AddScoped(); + services.AddDataProtection() .PersistKeysToDbContext(); diff --git a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs index bcd5f9e..a1ca37b 100644 --- a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs @@ -1,4 +1,5 @@ using ScadaLink.Commons.Messages.Health; +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.HealthMonitoring; @@ -28,6 +29,15 @@ public interface ISiteHealthCollector /// AddAuditLogHealthMetricsBridge(). /// void IncrementAuditRedactionFailure(); + /// + /// Audit Log (#23) M6 Bundle E (T6) — replace the latest site-local + /// audit-queue backlog snapshot (pending count, oldest pending row, + /// on-disk file bytes) used by the next call. + /// Refreshed periodically by the SiteAuditBacklogReporter hosted + /// service so each report carries a recent point-in-time view of the + /// site→central drain health. + /// + void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot); void UpdateConnectionHealth(string connectionName, ConnectionHealth health); void RemoveConnection(string connectionName); void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved); diff --git a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs index 47567c9..6f55061 100644 --- a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs @@ -1,5 +1,6 @@ using System.Collections.Concurrent; using ScadaLink.Commons.Messages.Health; +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.HealthMonitoring; @@ -15,6 +16,7 @@ public class SiteHealthCollector : ISiteHealthCollector private int _deadLetterCount; private int _siteAuditWriteFailures; private int _auditRedactionFailures; + private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog; private readonly ConcurrentDictionary _connectionStatuses = new(); private readonly ConcurrentDictionary _tagResolutionCounts = new(); private readonly ConcurrentDictionary _connectionEndpoints = new(); @@ -89,6 +91,18 @@ public class SiteHealthCollector : ISiteHealthCollector Interlocked.Increment(ref _auditRedactionFailures); } + /// + /// Audit Log (#23) M6 Bundle E (T6) — replace the latest backlog snapshot + /// from the site SQLite writer. The field is a single reference write + /// (volatile) so the next sees the most recent + /// snapshot — there is no count to reset, the report just carries forward + /// whatever was last refreshed. + /// + public void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot) + { + _siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot)); + } + /// /// Update the health status for a named data connection. /// Called by DCL when connection state changes. @@ -207,6 +221,7 @@ public class SiteHealthCollector : ISiteHealthCollector ParkedMessageCount: Interlocked.CompareExchange(ref _parkedMessageCount, 0, 0), ClusterNodes: _clusterNodes?.ToList(), SiteAuditWriteFailures: siteAuditWriteFailures, - AuditRedactionFailure: auditRedactionFailures); + AuditRedactionFailure: auditRedactionFailures, + SiteAuditBacklog: _siteAuditBacklog); } } diff --git a/src/ScadaLink.Host/Actors/AkkaHostedService.cs b/src/ScadaLink.Host/Actors/AkkaHostedService.cs index b8c5171..dce065a 100644 --- a/src/ScadaLink.Host/Actors/AkkaHostedService.cs +++ b/src/ScadaLink.Host/Actors/AkkaHostedService.cs @@ -34,6 +34,13 @@ public class AkkaHostedService : IHostedService private readonly CommunicationOptions _communicationOptions; private readonly ILogger _logger; private ActorSystem? _actorSystem; + /// + /// Auxiliary IDisposables (e.g. the SiteAuditTelemetryStalledTracker) + /// that this hosted service constructs at start time and must tear down + /// on shutdown — they don't fit the ActorSystem lifecycle but share its + /// process scope. + /// + private readonly List _trackedDisposables = new(); public AkkaHostedService( IServiceProvider serviceProvider, @@ -201,6 +208,31 @@ akka {{ public async Task StopAsync(CancellationToken cancellationToken) { + // Dispose auxiliary subscribers (e.g. SiteAuditTelemetryStalledTracker) + // BEFORE Akka shuts down so their EventStream unsubscribe calls run + // while the system is still alive. Per-tracker Dispose is wrapped in + // its own try so a misbehaving subscriber can't sink the shutdown. + // Snapshot the list inside a lock so a concurrent StartAsync (the + // test harness sometimes triggers a second start/stop interleaving) + // can't race the enumeration. Clearing the original list under the + // same lock leaves the next StartAsync with a clean slate. + IDisposable[] disposables; + lock (_trackedDisposables) + { + disposables = _trackedDisposables.ToArray(); + _trackedDisposables.Clear(); + } + foreach (var disposable in disposables) + { + try { disposable.Dispose(); } + catch (Exception ex) + { + _logger.LogWarning(ex, + "Auxiliary subscriber {Type} threw during shutdown", + disposable.GetType().Name); + } + } + if (_actorSystem != null) { _logger.LogInformation("Shutting down Akka.NET actor system via CoordinatedShutdown..."); @@ -349,6 +381,31 @@ akka {{ "AuditLogIngestActor singleton created (gRPC server bound: {GrpcBound})", grpcServer is not null); + // Audit Log (#23) M6 Bundle E (T7): subscribe the per-site stalled + // telemetry tracker to the actor system EventStream NOW that the + // system exists. The tracker mirrors every + // SiteAuditTelemetryStalledChanged publication (from + // SiteAuditReconciliationActor — wired in a later bundle) into the + // AuditCentralHealthSnapshot singleton so the central health surface + // sees per-site stalled state. The tracker is constructed here rather + // than in AddAuditLogCentralMaintenance because its ctor needs an + // ActorSystem, which is not a DI-resolvable singleton — it's owned + // by this hosted service. The snapshot singleton is resolvable; + // passing it in seeds the tracker's Apply() so both internal state + // and the snapshot stay in lock-step. + var auditCentralSnapshot = _serviceProvider + .GetService(); + if (auditCentralSnapshot is not null) + { + var stalledTracker = new ScadaLink.AuditLog.Central.SiteAuditTelemetryStalledTracker( + _actorSystem!, auditCentralSnapshot); + lock (_trackedDisposables) + { + _trackedDisposables.Add(stalledTracker); + } + _logger.LogInformation("SiteAuditTelemetryStalledTracker subscribed to EventStream"); + } + // Site Call Audit (#22) — central singleton mirrors the AuditLogIngest // and NotificationOutbox patterns. M3's dual-write transaction routes // SiteCalls upserts through AuditLogIngestActor's own scope-per-message @@ -605,7 +662,7 @@ akka {{ var siteAuditOptions = _serviceProvider .GetRequiredService>(); var siteAuditQueue = _serviceProvider - .GetRequiredService(); + .GetRequiredService(); var siteAuditClient = _serviceProvider .GetRequiredService(); var siteAuditLogger = _serviceProvider.GetRequiredService() @@ -640,6 +697,13 @@ akka {{ // handshake has completed". Streams opened before SetReady are already // rejected by SiteStreamGrpcServer with StatusCode.Unavailable. var grpcServer = _serviceProvider.GetService(); + // Audit Log (#23 M6): hand the site-local SqliteAuditWriter (which + // implements ISiteAuditQueue) to the gRPC server so the PullAuditEvents + // reconciliation RPC can serve central's pulls. Both the writer and the + // gRPC server are singletons — wiring this here keeps the dependency + // direction one-way (Host knows both; Communication doesn't reach back + // into AuditLog). + grpcServer?.SetSiteAuditQueue(siteAuditQueue); grpcServer?.SetReady(_actorSystem!); } } diff --git a/src/ScadaLink.Host/Program.cs b/src/ScadaLink.Host/Program.cs index b1119d1..3632824 100644 --- a/src/ScadaLink.Host/Program.cs +++ b/src/ScadaLink.Host/Program.cs @@ -84,6 +84,10 @@ try // IAuditLogRepository. The site writer chain is still registered (lazy // singletons) but is never resolved on a central node. builder.Services.AddAuditLog(builder.Configuration); + // #23 M6-T5 Bundle D — central-only hosted service that rolls + // pf_AuditLog_Month forward monthly. Depends on IPartitionMaintenance + // (registered below by AddConfigurationDatabase). + builder.Services.AddAuditLogCentralMaintenance(builder.Configuration); // Site Call Audit (#22) — central node owns the SiteCallAuditActor // singleton (M3 Bundle F). The extension itself currently registers // nothing — actor Props are constructed inline in AkkaHostedService — diff --git a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs index 36de05f..724ae68 100644 --- a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs @@ -214,7 +214,11 @@ public class AuditLogIngestActorTests : TestKit, IClassFixture _inner.QueryAsync(filter, paging, ct); - public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => _inner.SwitchOutPartitionAsync(monthBoundary, ct); + + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) => + _inner.GetPartitionBoundariesOlderThanAsync(threshold, ct); } } diff --git a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPartitionMaintenanceServiceTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPartitionMaintenanceServiceTests.cs new file mode 100644 index 0000000..4e65207 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPartitionMaintenanceServiceTests.cs @@ -0,0 +1,154 @@ +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Interfaces; +using Xunit; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle D (#23 M6-T5) tests for . +/// All tests use an in-memory stub — +/// the real EF/MSSQL implementation is exercised by the +/// AuditLogPartitionMaintenanceTests integration suite in +/// ScadaLink.ConfigurationDatabase.Tests. This file is purely +/// about the hosted service's policy decisions (start/stop, exception +/// containment). +/// +public class AuditLogPartitionMaintenanceServiceTests +{ + /// + /// Recording stub — counts EnsureLookaheadAsync invocations and lets the + /// test inject an exception per invocation to drive the catch-all path. + /// + private sealed class RecordingMaintenance : IPartitionMaintenance + { + public int EnsureCallCount; + public Exception? ThrowOnce; + + public Task> EnsureLookaheadAsync(int lookaheadMonths, CancellationToken ct = default) + { + Interlocked.Increment(ref EnsureCallCount); + if (ThrowOnce is { } ex) + { + ThrowOnce = null; + throw ex; + } + return Task.FromResult>(Array.Empty()); + } + + public Task GetMaxBoundaryAsync(CancellationToken ct = default) => + Task.FromResult(DateTime.UtcNow.AddMonths(6)); + } + + /// + /// Captures logged exceptions so the catch-all assertion can prove + /// the exception was actually logged (not silently swallowed) and was + /// the exact instance the stub threw. + /// + private sealed class CapturingLogger : ILogger + { + public List<(LogLevel Level, Exception? Exception, string Message)> Entries { get; } = new(); + + public IDisposable? BeginScope(TState state) where TState : notnull => null; + + public bool IsEnabled(LogLevel logLevel) => true; + + public void Log( + LogLevel logLevel, + EventId eventId, + TState state, + Exception? exception, + Func formatter) + { + Entries.Add((logLevel, exception, formatter(state, exception))); + } + } + + private static IServiceProvider BuildProvider(IPartitionMaintenance maintenance) + { + var services = new ServiceCollection(); + // IPartitionMaintenance is registered as scoped by AddConfigurationDatabase; + // we mirror that here so the hosted service's CreateAsyncScope + + // GetRequiredService resolves the stub the test injected. + services.AddScoped(_ => maintenance); + return services.BuildServiceProvider(); + } + + [Fact] + public async Task StartStop_NoExceptions() + { + // Long interval so only the eager startup tick fires inside the test + // window — keeps assertions deterministic without relying on + // multiple cadence loops. + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = 1, + }); + var maintenance = new RecordingMaintenance(); + var sp = BuildProvider(maintenance); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + + await svc.StartAsync(CancellationToken.None); + + // Spin briefly until the startup tick has fired — the loop's first + // SafeMaintainAsync runs on a background Task.Run continuation, so + // we can't synchronously rely on its completion. + var deadline = DateTime.UtcNow.AddSeconds(3); + while (Volatile.Read(ref maintenance.EnsureCallCount) < 1 && DateTime.UtcNow < deadline) + { + await Task.Delay(20); + } + + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + Assert.True(maintenance.EnsureCallCount >= 1, $"expected at least 1 ensure call, got {maintenance.EnsureCallCount}"); + } + + [Fact] + public async Task SafeMaintain_ExceptionLogged_NotPropagated() + { + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = 1, + }); + // The injected exception fires on the FIRST EnsureLookaheadAsync call + // (the startup tick) — the hosted service must contain it and + // continue running. + var boom = new InvalidOperationException("simulated maintenance failure"); + var maintenance = new RecordingMaintenance { ThrowOnce = boom }; + var sp = BuildProvider(maintenance); + var logger = new CapturingLogger(); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + logger); + + // StartAsync must not throw even though the very first tick will fail. + await svc.StartAsync(CancellationToken.None); + + // Wait for the error to surface in the logger. + var deadline = DateTime.UtcNow.AddSeconds(3); + while (!logger.Entries.Any(e => e.Exception == boom) && DateTime.UtcNow < deadline) + { + await Task.Delay(20); + } + + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + var errorEntry = Assert.Single(logger.Entries, e => e.Exception == boom); + Assert.Equal(LogLevel.Error, errorEntry.Level); + Assert.Equal(1, maintenance.EnsureCallCount); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPurgeActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPurgeActorTests.cs new file mode 100644 index 0000000..afa20bf --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPurgeActorTests.cs @@ -0,0 +1,376 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Configuration; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Types.Audit; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle C (#23 M6-T4) tests for . The fast, +/// schedule-only tests substitute a recording stub for +/// so the timer + per-boundary error-isolation +/// + event-publish machinery can be exercised without an MSSQL container. +/// The end-to-end "real partition gets switched out" assertion lives in the +/// repository tests (Bundle C of M6-T4); this actor file is purely about the +/// actor's policy decisions. +/// +public class AuditLogPurgeActorTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public AuditLogPurgeActorTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + /// + /// In-memory recording stub. Captures every + /// + every + /// so tests can assert which boundaries + /// the actor chose to purge and how many ticks it issued. Also lets a + /// specific boundary be configured to throw so the continue-on-error path + /// is exercisable. + /// + private sealed class RecordingRepo : IAuditLogRepository + { + public List ThresholdQueries { get; } = new(); + public List SwitchedBoundaries { get; } = new(); + public Func RowsPerBoundary { get; set; } = _ => 0L; + public DateTime? ThrowOnBoundary { get; set; } + public Exception? BoundaryException { get; set; } + + // The actor enumerator returns whichever list is configured here. + // Mutating this between ticks lets tests simulate "no longer + // eligible" boundaries on the second tick. + public List Boundaries { get; set; } = new(); + + public Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default) => + Task.CompletedTask; + + public Task> QueryAsync( + AuditLogQueryFilter filter, AuditLogPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) + { + if (ThrowOnBoundary.HasValue && monthBoundary == ThrowOnBoundary.Value) + { + throw BoundaryException ?? new InvalidOperationException("simulated switch failure"); + } + SwitchedBoundaries.Add(monthBoundary); + return Task.FromResult(RowsPerBoundary(monthBoundary)); + } + + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) + { + ThresholdQueries.Add(threshold); + return Task.FromResult>(Boundaries.ToArray()); + } + } + + private IServiceProvider BuildScopedProvider(IAuditLogRepository repo) + { + var services = new ServiceCollection(); + // Mirror AddConfigurationDatabase: IAuditLogRepository is scoped, so + // the actor opens a fresh scope per tick and resolves there. + services.AddScoped(_ => repo); + return services.BuildServiceProvider(); + } + + private IActorRef CreateActor( + IAuditLogRepository repo, + AuditLogPurgeOptions purgeOptions, + AuditLogOptions? auditOptions = null) + { + var sp = BuildScopedProvider(repo); + return Sys.ActorOf(Props.Create(() => new AuditLogPurgeActor( + sp, + Options.Create(purgeOptions), + Options.Create(auditOptions ?? new AuditLogOptions()), + NullLogger.Instance))); + } + + private static AuditLogPurgeOptions FastTickOptions(TimeSpan? interval = null) => new() + { + IntervalHours = 24, + IntervalOverride = interval ?? TimeSpan.FromMilliseconds(100), + }; + + /// + /// Subscribe a probe to the EventStream so the test can observe + /// publications synchronously. + /// + private Akka.TestKit.TestProbe SubscribePurged() + { + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + return probe; + } + + // --------------------------------------------------------------------- + // 1. Tick_Fires_OnDailyInterval + // --------------------------------------------------------------------- + + [Fact] + public void Tick_Fires_OnDailyInterval() + { + var repo = new RecordingRepo(); + CreateActor(repo, FastTickOptions()); + + // The first scheduled tick fires after the configured interval. We + // assert the visible side effect (the enumerator was called) rather + // than racing on internal state. + AwaitAssert( + () => Assert.True(repo.ThresholdQueries.Count >= 1, + $"expected >= 1 enumerator call, got {repo.ThresholdQueries.Count}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 2. Tick_OldPartitions_SwitchedOut + // --------------------------------------------------------------------- + + [Fact] + public void Tick_OldPartitions_SwitchedOut() + { + var repo = new RecordingRepo + { + Boundaries = new List + { + new(2025, 11, 1, 0, 0, 0, DateTimeKind.Utc), + new(2025, 12, 1, 0, 0, 0, DateTimeKind.Utc), + }, + RowsPerBoundary = _ => 42L, + }; + + CreateActor(repo, FastTickOptions()); + + AwaitAssert( + () => + { + Assert.Contains(new DateTime(2025, 11, 1, 0, 0, 0, DateTimeKind.Utc), repo.SwitchedBoundaries); + Assert.Contains(new DateTime(2025, 12, 1, 0, 0, 0, DateTimeKind.Utc), repo.SwitchedBoundaries); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 3. Tick_NewerPartitions_Untouched + // --------------------------------------------------------------------- + + [Fact] + public void Tick_NewerPartitions_Untouched() + { + // The actor's contract: it only touches whatever the enumerator + // returns. The enumerator (in production) filters out non-eligible + // boundaries; here we simulate that by handing back an empty list + // and asserting the actor switched nothing despite the tick firing. + var repo = new RecordingRepo { Boundaries = new List() }; + + CreateActor(repo, FastTickOptions()); + + // Wait for at least one tick (visible via the enumerator call) then + // assert no switch happened. + AwaitAssert( + () => Assert.True(repo.ThresholdQueries.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + Assert.Empty(repo.SwitchedBoundaries); + } + + // --------------------------------------------------------------------- + // 4. Tick_PublishesPurgedEvent_WithRowCount + // --------------------------------------------------------------------- + + [Fact] + public void Tick_PublishesPurgedEvent_WithRowCount() + { + var boundary = new DateTime(2025, 6, 1, 0, 0, 0, DateTimeKind.Utc); + var repo = new RecordingRepo + { + Boundaries = new List { boundary }, + RowsPerBoundary = _ => 1234L, + }; + + var probe = SubscribePurged(); + CreateActor(repo, FastTickOptions()); + + var msg = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal(boundary, msg.MonthBoundary); + Assert.Equal(1234L, msg.RowsDeleted); + Assert.True(msg.DurationMs >= 0, + $"DurationMs should be non-negative; was {msg.DurationMs}"); + } + + // --------------------------------------------------------------------- + // 5. Tick_SwitchThrows_OtherPartitionsStillProcessed (continue-on-error) + // --------------------------------------------------------------------- + + [Fact] + public void Tick_SwitchThrows_OtherPartitionsStillProcessed() + { + var poisonBoundary = new DateTime(2025, 7, 1, 0, 0, 0, DateTimeKind.Utc); + var goodBoundary = new DateTime(2025, 8, 1, 0, 0, 0, DateTimeKind.Utc); + var repo = new RecordingRepo + { + Boundaries = new List { poisonBoundary, goodBoundary }, + ThrowOnBoundary = poisonBoundary, + BoundaryException = new InvalidOperationException("simulated switch failure for poison boundary"), + }; + + CreateActor(repo, FastTickOptions()); + + AwaitAssert( + () => + { + // The good boundary was still switched even though the poison + // boundary threw. + Assert.Contains(goodBoundary, repo.SwitchedBoundaries); + Assert.DoesNotContain(poisonBoundary, repo.SwitchedBoundaries); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 6. EndToEnd_RealPartition_RowsRemoved_PurgedEventPublished + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_RealPartition_RowsRemoved_PurgedEventPublished() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Today is ~2026-05-20 per the test environment. With RetentionDays = + // 60 the actor computes threshold ≈ 2026-03-21: + // * Jan partition (MAX = Jan 15) → older than threshold → PURGED + // * Apr partition (MAX = Apr 15) → newer than threshold → KEPT + var siteId = "purge-e2e-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEvt = new AuditEvent + { + EventId = Guid.NewGuid(), + OccurredAtUtc = new DateTime(2026, 1, 15, 0, 0, 0, DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + }; + var aprEvt = new AuditEvent + { + EventId = Guid.NewGuid(), + OccurredAtUtc = new DateTime(2026, 4, 15, 0, 0, 0, DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + }; + + await using (var seedContext = CreateMsSqlContext()) + { + var seedRepo = new AuditLogRepository(seedContext); + await seedRepo.InsertIfNotExistsAsync(janEvt); + await seedRepo.InsertIfNotExistsAsync(aprEvt); + } + + // Wire the actor's DI scope to the real repository against the + // fixture's MSSQL database. The actor opens a fresh scope per tick, + // so register the context as scoped (mirroring the production + // AddConfigurationDatabase wiring). + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var auditOptions = new AuditLogOptions { RetentionDays = 60 }; + var purgeOptions = new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }; + + var probe = SubscribePurged(); + Sys.ActorOf(Props.Create(() => new AuditLogPurgeActor( + sp, + Options.Create(purgeOptions), + Options.Create(auditOptions), + NullLogger.Instance))); + + // The probe receives one AuditLogPurgedEvent per partition the actor + // purges per tick — other test runs that share the fixture DB may + // also leave behind eligible partitions, but this test creates its + // own fixture DB so the Jan-2026 partition is the only eligible one. + // Use FishForMessage to filter just in case, with a generous timeout + // because the real drop-and-rebuild dance against MSSQL routinely + // takes a couple of seconds on a busy dev container. + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + var matched = probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + + Assert.True(matched.RowsDeleted >= 1, + $"Expected RowsDeleted >= 1 for the Jan-2026 partition; got {matched.RowsDeleted}."); + + // Settle: allow any in-flight tick to commit before reading. + await Task.Delay(TimeSpan.FromMilliseconds(500)); + await using var verifyContext = CreateMsSqlContext(); + var rows = await verifyContext.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + Assert.DoesNotContain(rows, r => r.EventId == janEvt.EventId); + Assert.Contains(rows, r => r.EventId == aprEvt.EventId); + } + + private ScadaLinkDbContext CreateMsSqlContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + // --------------------------------------------------------------------- + // 7. Threshold_UsesAuditLogOptionsRetentionDays + // --------------------------------------------------------------------- + + [Fact] + public void Threshold_UsesAuditLogOptionsRetentionDays() + { + // The actor computes the threshold from AuditLogOptions.RetentionDays; + // assert the enumerator received a threshold whose value is in the + // expected window (today - retentionDays) rather than DateTime.MinValue + // or some other accidental default. We use a non-default retention + // (30 days) so the assertion isn't satisfied by the 365 default. + var repo = new RecordingRepo(); + CreateActor( + repo, + FastTickOptions(), + auditOptions: new AuditLogOptions { RetentionDays = 30 }); + + AwaitAssert( + () => Assert.True(repo.ThresholdQueries.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + var threshold = repo.ThresholdQueries[0]; + var expected = DateTime.UtcNow - TimeSpan.FromDays(30); + // 1-minute slack covers test-thread scheduling jitter between the + // tick firing and the assertion running. + Assert.True( + Math.Abs((threshold - expected).TotalMinutes) < 1.0, + $"threshold {threshold:o} should be within 1 minute of {expected:o}"); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs new file mode 100644 index 0000000..795841b --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs @@ -0,0 +1,98 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.AuditLog; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle E (M6-T9) coverage for the central-side payload-filter redactor +/// failure bridge. M5 wired the SITE bridge +/// (HealthMetricsAuditRedactionFailureCounter) that pushes increments +/// into the site health report; M6 mirrors that with +/// so the same payload +/// filter — when it runs on the central writer paths — surfaces failures on +/// the central . +/// +public class CentralAuditRedactionFailureCounterTests : TestKit +{ + [Fact] + public void Increment_Routes_To_Snapshot() + { + var snapshot = new AuditCentralHealthSnapshot(); + var counter = new CentralAuditRedactionFailureCounter(snapshot); + + counter.Increment(); + counter.Increment(); + counter.Increment(); + + Assert.Equal(3, snapshot.AuditRedactionFailure); + } + + [Fact] + public void Construction_With_Null_Snapshot_Throws() + { + Assert.Throws( + () => new CentralAuditRedactionFailureCounter(null!)); + } + + [Fact] + public void AddAuditLogCentralMaintenance_Replaces_IAuditRedactionFailureCounter_With_CentralImpl() + { + // AddAuditLog registers NoOp; AddAuditLogCentralMaintenance is the + // override path. The replaced binding MUST resolve to the central + // bridge — a site host that wires AddAuditLogHealthMetricsBridge + // instead would resolve to the site bridge (covered in + // AddAuditLogTests). + var config = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["AuditLog:SiteWriter:DatabasePath"] = ":memory:", + }) + .Build(); + + var services = new ServiceCollection(); + services.AddSingleton(); + services.AddSingleton(typeof(ILogger<>), typeof(NullLogger<>)); + // AuditCentralHealthSnapshot no longer takes a tracker dependency — + // the tracker is constructed later by the Akka bootstrap because its + // ctor needs an ActorSystem (not a DI-resolvable singleton). The + // snapshot itself composes purely from primitives. + services.AddAuditLog(config); + services.AddAuditLogCentralMaintenance(config); + using var provider = services.BuildServiceProvider(); + + var counter = provider.GetRequiredService(); + + Assert.IsType(counter); + } + + [Fact] + public void AddAuditLog_Default_IAuditRedactionFailureCounter_Is_NoOp() + { + // Sanity check: without AddAuditLogCentralMaintenance the default + // remains the NoOp from M5 — the central bridge only takes effect + // when the central-only registration runs. + var config = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["AuditLog:SiteWriter:DatabasePath"] = ":memory:", + }) + .Build(); + + var services = new ServiceCollection(); + services.AddSingleton(); + services.AddSingleton(typeof(ILogger<>), typeof(NullLogger<>)); + services.AddAuditLog(config); + using var provider = services.BuildServiceProvider(); + + var counter = provider.GetRequiredService(); + + Assert.IsType(counter); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs new file mode 100644 index 0000000..32b0a9a --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs @@ -0,0 +1,160 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Messages.Audit; +using ScadaLink.Commons.Types.Audit; +using ScadaLink.Commons.Types.Enums; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle E (M6-T8) regression coverage for the central-side audit-write +/// failure counter. and +/// both swallow repository throws (audit +/// must NEVER abort the user-facing action, alog.md §13) but bump the +/// so the central health +/// surface () can flag a sustained +/// outage. +/// +public class CentralAuditWriteFailuresTests : TestKit +{ + private static AuditEvent NewEvent() => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = DateTime.UtcNow, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + }; + + /// + /// Repository stub that always throws on insert — exercises the failure + /// path in both and + /// . + /// + private sealed class ThrowingRepo : IAuditLogRepository + { + public Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default) => + throw new InvalidOperationException("simulated repo failure"); + public Task> QueryAsync( + AuditLogQueryFilter filter, AuditLogPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => + Task.FromResult(0L); + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + /// + /// In-memory recording + /// every call so tests can assert on the count. + /// + private sealed class RecordingFailureCounter : ICentralAuditWriteFailureCounter + { + private int _count; + public int Count => Volatile.Read(ref _count); + public void Increment() => Interlocked.Increment(ref _count); + } + + [Fact] + public async Task Forced_Failure_Increments_Counter() + { + // Direct test: build the writer with a throwing scope and verify the + // injected counter is bumped on the swallowed insert exception. + var counter = new RecordingFailureCounter(); + var services = new ServiceCollection(); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var writer = new CentralAuditWriter( + sp, + NullLogger.Instance, + filter: null, + failureCounter: counter); + + // WriteAsync swallows the exception and increments the counter. + await writer.WriteAsync(NewEvent()); + + Assert.Equal(1, counter.Count); + } + + [Fact] + public async Task AuditLogIngestActor_Failure_Increments_Counter() + { + // The actor's production ctor resolves both IAuditLogRepository AND + // ICentralAuditWriteFailureCounter from the scope per-message; we + // register both and verify the per-row catch bumps the counter for + // every row in the batch. + var counter = new RecordingFailureCounter(); + var services = new ServiceCollection(); + services.AddScoped(); + // Counter is a singleton — the actor's per-message scope still + // resolves the same instance via the scope's parent provider. + services.AddSingleton(counter); + var sp = services.BuildServiceProvider(); + + var actor = Sys.ActorOf(Props.Create(() => new AuditLogIngestActor( + sp, NullLogger.Instance))); + + var batch = new[] { NewEvent(), NewEvent(), NewEvent() }; + var reply = await actor.Ask( + new IngestAuditEventsCommand(batch), TimeSpan.FromSeconds(5)); + + // Every row threw → none accepted, counter bumped once per row. + Assert.Empty(reply.AcceptedEventIds); + Assert.Equal(batch.Length, counter.Count); + } + + [Fact] + public void Snapshot_Aggregates_Counters_And_StalledState() + { + // AuditCentralHealthSnapshot implements both writer surfaces; bumping + // through the writer interfaces is reflected on the read surface, and + // the per-site stalled state is fed in via ApplyStalled — production + // wires that to a SiteAuditTelemetryStalledTracker, but the snapshot + // is testable in isolation against the same Apply surface. + var snapshot = new AuditCentralHealthSnapshot(); + + Assert.Equal(0, snapshot.CentralAuditWriteFailures); + Assert.Equal(0, snapshot.AuditRedactionFailure); + Assert.Empty(snapshot.SiteAuditTelemetryStalled); + + ((ICentralAuditWriteFailureCounter)snapshot).Increment(); + ((ICentralAuditWriteFailureCounter)snapshot).Increment(); + ((ScadaLink.AuditLog.Payload.IAuditRedactionFailureCounter)snapshot).Increment(); + + // Wire the tracker so an EventStream publish reaches the snapshot. + // The tracker pushes into the snapshot's ApplyStalled when given + // the snapshot in its ctor; the tracker also keeps its own latch, + // but the snapshot read surface is what the central UI reads. + using var tracker = new SiteAuditTelemetryStalledTracker(Sys, snapshot); + Sys.EventStream.Publish(new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + AwaitAssert(() => + { + var stalledMap = snapshot.SiteAuditTelemetryStalled; + Assert.True(stalledMap.TryGetValue("siteA", out var s) && s, + "expected siteA to be stalled in snapshot"); + }, + duration: TimeSpan.FromSeconds(2), + interval: TimeSpan.FromMilliseconds(20)); + + Assert.Equal(2, snapshot.CentralAuditWriteFailures); + Assert.Equal(1, snapshot.AuditRedactionFailure); + } + + [Fact] + public void Snapshot_Empty_OnConstruction() + { + // Sanity: the snapshot's three properties start at their zero values + // before any writer or stalled-event publication. + var snapshot = new AuditCentralHealthSnapshot(); + Assert.Equal(0, snapshot.CentralAuditWriteFailures); + Assert.Equal(0, snapshot.AuditRedactionFailure); + Assert.Empty(snapshot.SiteAuditTelemetryStalled); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs new file mode 100644 index 0000000..5cbcfe9 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs @@ -0,0 +1,442 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Messages.Integration; +using ScadaLink.Commons.Types.Audit; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle B (M6-T3) tests for . Most +/// tests substitute the with an in-memory +/// recording stub so the actor's tick / cursor / stalled state machinery can +/// be exercised in milliseconds without an MSSQL container. The duplicate / +/// idempotency assertion uses the real against +/// the so we verify InsertIfNotExistsAsync +/// actually swallows duplicate-key collisions (the M2 Bundle A race-fix the +/// reconciliation puller depends on). +/// +public class SiteAuditReconciliationActorTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public SiteAuditReconciliationActorTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private static AuditEvent NewEvent( + string siteId, + DateTime? occurredAt = null, + Guid? id = null) => new() + { + EventId = id ?? Guid.NewGuid(), + OccurredAtUtc = occurredAt ?? new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + }; + + private static SiteAuditReconciliationOptions FastTickOptions( + int batchSize = 256, + int stalledAfter = 2) => + new() + { + // 100 ms tick keeps each test under a second. AwaitAssert covers + // schedule jitter so a 100 ms tick has up to ~3 s to fire. + ReconciliationIntervalSeconds = 300, + ReconciliationIntervalOverride = TimeSpan.FromMilliseconds(100), + BatchSize = batchSize, + StalledAfterNonDrainingCycles = stalledAfter, + }; + + /// + /// In-memory recording stub used for non-MSSQL tests. Captures every + /// call AND deduplicates on + /// so duplicate-handling assertions don't + /// need a real database for the simple cases. + /// + private sealed class RecordingRepo : IAuditLogRepository + { + public List Inserted { get; } = new(); + private readonly HashSet _seen = new(); + public int InsertCallCount { get; private set; } + + public Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default) + { + InsertCallCount++; + if (_seen.Add(evt.EventId)) + { + Inserted.Add(evt); + } + return Task.CompletedTask; + } + + public Task> QueryAsync( + AuditLogQueryFilter filter, AuditLogPaging paging, CancellationToken ct = default) => + Task.FromResult>(Inserted); + + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => + Task.FromResult(0L); + + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + /// + /// In-memory enumerator returning a static list of sites. + /// + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + /// + /// Scripted pull client — returns the next queued response for the site + /// on each call, looping the last entry if the queue is exhausted. Also + /// records every invocation so tests can assert call counts + arguments. + /// + private sealed class ScriptedPullClient : IPullAuditEventsClient + { + public List<(string SiteId, DateTime SinceUtc, int BatchSize)> Calls { get; } = new(); + private readonly Dictionary> _scripted = new(); + private readonly Dictionary _throwOnSite = new(); + + public ScriptedPullClient Script(string siteId, params PullAuditEventsResponse[] responses) + { + _scripted[siteId] = new Queue(responses); + return this; + } + + public ScriptedPullClient ThrowFor(string siteId, Exception ex) + { + _throwOnSite[siteId] = ex; + return this; + } + + public Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) + { + Calls.Add((siteId, sinceUtc, batchSize)); + if (_throwOnSite.TryGetValue(siteId, out var ex)) + { + throw ex; + } + if (_scripted.TryGetValue(siteId, out var queue) && queue.Count > 0) + { + return Task.FromResult(queue.Dequeue()); + } + return Task.FromResult( + new PullAuditEventsResponse(Array.Empty(), MoreAvailable: false)); + } + } + + private IServiceProvider BuildScopedProvider(IAuditLogRepository repo) + { + var services = new ServiceCollection(); + // The actor opens a scope per tick and resolves IAuditLogRepository + // from that scope; registering as scoped mirrors how + // AddConfigurationDatabase wires the real repository. + services.AddScoped(_ => repo); + return services.BuildServiceProvider(); + } + + private IActorRef CreateActor( + ISiteEnumerator sites, + IPullAuditEventsClient client, + IAuditLogRepository repo, + SiteAuditReconciliationOptions options) + { + var sp = BuildScopedProvider(repo); + return Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor( + sites, + client, + sp, + Options.Create(options), + NullLogger.Instance))); + } + + /// + /// Subscribes to the EventStream and collects every + /// publication into a list + /// the test can assert on. Uses a probe actor so the stream's + /// fire-and-forget delivery is observable from the test thread. + /// + private (Akka.TestKit.TestProbe Probe, List Captured) SubscribeStalled() + { + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(SiteAuditTelemetryStalledChanged)); + var captured = new List(); + return (probe, captured); + } + + // --------------------------------------------------------------------- + // 1. Timer_Fires_OnConfiguredInterval + // --------------------------------------------------------------------- + + [Fact] + public void Timer_Fires_OnConfiguredInterval() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + var client = new ScriptedPullClient(); + var repo = new RecordingRepo(); + var opts = FastTickOptions(); + + CreateActor(sites, client, repo, opts); + + // The first scheduled tick fires after `ReconciliationIntervalSeconds`, + // which is 0 for the test — Akka's scheduler still respects the + // ScheduleTellRepeatedlyCancelable contract that issues a Tell on the + // scheduler thread, so we await visible side effects (a PullAsync call) + // rather than racing on internal state. + AwaitAssert( + () => Assert.True(client.Calls.Count >= 1, $"expected >= 1 pull call, got {client.Calls.Count}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 2. Tick_PullsFromEachKnownSite + // --------------------------------------------------------------------- + + [Fact] + public void Tick_PullsFromEachKnownSite() + { + var sites = new StaticEnumerator( + new SiteEntry("siteA", "http://siteA:8083"), + new SiteEntry("siteB", "http://siteB:8083")); + var client = new ScriptedPullClient(); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert(() => + { + Assert.Contains(client.Calls, c => c.SiteId == "siteA"); + Assert.Contains(client.Calls, c => c.SiteId == "siteB"); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 3. Tick_IngestEvents_ViaInsertIfNotExistsAsync + // --------------------------------------------------------------------- + + [Fact] + public void Tick_IngestEvents_ViaInsertIfNotExistsAsync() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + var e1 = NewEvent("siteA"); + var e2 = NewEvent("siteA"); + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(new[] { e1, e2 }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert(() => Assert.Equal(2, repo.InsertCallCount), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + Assert.Contains(repo.Inserted, e => e.EventId == e1.EventId); + Assert.Contains(repo.Inserted, e => e.EventId == e2.EventId); + } + + // --------------------------------------------------------------------- + // 4. Tick_Duplicates_NotDoubleInserted (real MSSQL idempotency) + // --------------------------------------------------------------------- + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + [SkippableFact] + public async Task Tick_Duplicates_NotDoubleInserted() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = "bundle-b-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var pre = NewEvent(siteId); + + // Seed the row directly so the actor sees it already present when the + // pull returns it. + await using (var seedContext = CreateContext()) + { + await new AuditLogRepository(seedContext).InsertIfNotExistsAsync(pre); + } + + // Stack one new and the pre-existing row in the pull response. The + // second-pull script returns empty so the actor settles. + var fresh = NewEvent(siteId); + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://x:8083")); + var client = new ScriptedPullClient().Script(siteId, + new PullAuditEventsResponse(new[] { pre, fresh }, MoreAvailable: false)); + + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + CreateActor(sites, client, repo, FastTickOptions()); + + // Wait for the actor to ingest both rows. + await Task.Delay(TimeSpan.FromSeconds(1)); + AwaitAssert(() => Assert.True(client.Calls.Count >= 1), + duration: TimeSpan.FromSeconds(3)); + + // Even though the pull returned 2 events, only 1 fresh row should + // exist in MSSQL alongside the pre-existing one — InsertIfNotExistsAsync + // is first-write-wins on EventId. + await using var read = CreateContext(); + var rows = await read.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + Assert.Equal(2, rows.Count); + Assert.Contains(rows, r => r.EventId == pre.EventId); + Assert.Contains(rows, r => r.EventId == fresh.EventId); + } + + // --------------------------------------------------------------------- + // 5. Cursor_Advances_ToMaxOccurredAtUtc + // --------------------------------------------------------------------- + + [Fact] + public void Cursor_Advances_ToMaxOccurredAtUtc() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + + var t1 = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var t2 = new DateTime(2026, 5, 20, 10, 1, 0, DateTimeKind.Utc); + var t3 = new DateTime(2026, 5, 20, 10, 2, 0, DateTimeKind.Utc); + var e1 = NewEvent("siteA", t1); + var e2 = NewEvent("siteA", t2); + var e3 = NewEvent("siteA", t3); + + // First pull returns three events with t1, t2, t3. Subsequent pulls + // return empty — but the test asserts the SECOND pull's since argument + // is t3 (the max OccurredAtUtc from the first pull). + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(new[] { e1, e2, e3 }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + // Wait until we have at least two pulls — the second one must use t3 + // as its `since` argument because that was the max OccurredAtUtc in + // the first response. + AwaitAssert(() => Assert.True(client.Calls.Count >= 2, + $"need at least 2 pulls to assert cursor advancement, got {client.Calls.Count}"), + duration: TimeSpan.FromSeconds(5), + interval: TimeSpan.FromMilliseconds(50)); + + Assert.Equal(DateTime.MinValue, client.Calls[0].SinceUtc); + Assert.Equal(t3, client.Calls[1].SinceUtc); + } + + // --------------------------------------------------------------------- + // 6. Tick_OneSiteThrows_OtherSitesStillProcessed + // --------------------------------------------------------------------- + + [Fact] + public void Tick_OneSiteThrows_OtherSitesStillProcessed() + { + var sites = new StaticEnumerator( + new SiteEntry("siteA", "http://siteA:8083"), + new SiteEntry("siteB", "http://siteB:8083")); + + var bEvent = NewEvent("siteB"); + var client = new ScriptedPullClient() + .ThrowFor("siteA", new InvalidOperationException("simulated transport failure")) + .Script("siteB", + new PullAuditEventsResponse(new[] { bEvent }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert(() => + { + Assert.Contains(client.Calls, c => c.SiteId == "siteA"); + Assert.Contains(repo.Inserted, e => e.EventId == bEvent.EventId); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 7. StalledDetection_TwoConsecutiveNonDrainingCycles_PublishesStalledTrue + // --------------------------------------------------------------------- + + [Fact] + public void StalledDetection_TwoConsecutiveNonDrainingCycles_PublishesStalledTrue() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + + // Two scripted responses that each return events AND MoreAvailable=true + // — the second pull triggers the stalled transition. + var batch1 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var batch2 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(batch1, MoreAvailable: true), + new PullAuditEventsResponse(batch2, MoreAvailable: true)); + + var repo = new RecordingRepo(); + var (probe, _) = SubscribeStalled(); + + CreateActor(sites, client, repo, FastTickOptions(stalledAfter: 2)); + + // Expect Stalled=true after the second non-draining tick. The probe + // waits with its own timeout (a few seconds gives the 0 s repeat + // interval ample slack). + var msg = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal("siteA", msg.SiteId); + Assert.True(msg.Stalled); + } + + // --------------------------------------------------------------------- + // 8. StalledDetection_DrainingCycle_PublishesStalledFalse + // --------------------------------------------------------------------- + + [Fact] + public void StalledDetection_DrainingCycle_PublishesStalledFalse() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + + // Two non-draining responses get the actor into Stalled=true, then a + // draining response (events but MoreAvailable=false) flips it back. + var batch1 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var batch2 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var batch3 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(batch1, MoreAvailable: true), + new PullAuditEventsResponse(batch2, MoreAvailable: true), + new PullAuditEventsResponse(batch3, MoreAvailable: false)); + + var repo = new RecordingRepo(); + var (probe, _) = SubscribeStalled(); + + CreateActor(sites, client, repo, FastTickOptions(stalledAfter: 2)); + + // First publication is the stalled=true transition; second is the + // back-to-draining flip. The actor publishes ONLY on transitions so we + // expect exactly these two messages in order. + var first = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.True(first.Stalled); + + var second = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.False(second.Stalled); + Assert.Equal("siteA", second.SiteId); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditTelemetryStalledTrackerTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditTelemetryStalledTrackerTests.cs new file mode 100644 index 0000000..7c375a1 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditTelemetryStalledTrackerTests.cs @@ -0,0 +1,116 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using ScadaLink.AuditLog.Central; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle E (M6-T7) tests for . +/// The tracker subscribes to the actor system's EventStream for +/// publications and maintains a +/// per-site latch the central health surface can read. Since reconciliation is +/// central-driven, the "stalled" state semantically belongs to central — not +/// to the per-site +/// payload (which the site itself emits). The tracker therefore lives as a +/// central singleton, not on the site health collector. +/// +public class SiteAuditTelemetryStalledTrackerTests : TestKit +{ + /// + /// Helper: publishes a stalled-changed event on the actor system's + /// EventStream and waits a moment for the tracker's subscribe callback to + /// run. AwaitAssert avoids racing on the stream's async fan-out. + /// + private void PublishAndWait(SiteAuditTelemetryStalledTracker tracker, SiteAuditTelemetryStalledChanged evt) + { + Sys.EventStream.Publish(evt); + AwaitAssert( + () => + { + var snapshot = tracker.Snapshot(); + Assert.True(snapshot.TryGetValue(evt.SiteId, out var stalled), + $"tracker did not record event for {evt.SiteId}"); + Assert.Equal(evt.Stalled, stalled); + }, + duration: TimeSpan.FromSeconds(2), + interval: TimeSpan.FromMilliseconds(20)); + } + + [Fact] + public void Initial_Snapshot_IsEmpty() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + var snapshot = tracker.Snapshot(); + + Assert.Empty(snapshot); + } + + [Fact] + public void StalledTrue_Event_TrackerReports_Stalled() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + + var snapshot = tracker.Snapshot(); + Assert.True(snapshot["siteA"]); + } + + [Fact] + public void StalledFalse_Event_TrackerReports_NotStalled() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + // First flip the site into stalled so the false transition has a + // prior value to overwrite — mirrors how the reconciliation actor + // only publishes false after a true. + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: false)); + + var snapshot = tracker.Snapshot(); + Assert.False(snapshot["siteA"]); + } + + [Fact] + public void Multiple_Sites_Tracked_Independently() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteB", Stalled: false)); + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteC", Stalled: true)); + + var snapshot = tracker.Snapshot(); + Assert.Equal(3, snapshot.Count); + Assert.True(snapshot["siteA"]); + Assert.False(snapshot["siteB"]); + Assert.True(snapshot["siteC"]); + } + + [Fact] + public void Constructor_With_Null_ActorSystem_Throws() + { + Assert.Throws( + () => new SiteAuditTelemetryStalledTracker((ActorSystem)null!)); + } + + [Fact] + public void Dispose_Unsubscribes_From_EventStream() + { + var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + + tracker.Dispose(); + + // After dispose any further events are ignored — the snapshot + // reflects the last known state at dispose time. + Sys.EventStream.Publish(new SiteAuditTelemetryStalledChanged("siteA", Stalled: false)); + + // Give the stream a moment in case the unsubscribe is racey; the + // assertion is that siteA stays at true. + Thread.Sleep(50); + Assert.True(tracker.Snapshot()["siteA"]); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/OutageReconciliationTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/OutageReconciliationTests.cs new file mode 100644 index 0000000..57295be --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Integration/OutageReconciliationTests.cs @@ -0,0 +1,349 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Site; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.Commons.Messages.Integration; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Integration; + +/// +/// Bundle F (#23 M6-T10) end-to-end test for the central-outage + reconciliation +/// recovery loop. Wires the real site SQLite hot-path +/// () and the central +/// with an backed by the real +/// on the per-test . +/// +/// +/// +/// The push path is deliberately omitted here: the brief models a sustained +/// central outage where the site queue grows unbounded in Pending, then a +/// reconciliation pull eventually drains everything once central comes back. +/// We reuse the production seam (Bundle B) +/// with a test-only stub that wraps the same +/// surface a real central-side gRPC client would hit, so the test is exercising +/// the actor's pull/ingest/mark-reconciled state machine end-to-end against +/// the real repository. +/// +/// +/// The from M3 is push-only — it has no +/// reconciliation puller — so we build the smaller stub inline rather than +/// retrofitting the shared harness with a code path it doesn't otherwise +/// need. +/// +/// +public class OutageReconciliationTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public OutageReconciliationTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + /// + /// Test-only that mirrors how the + /// production central-side gRPC client will hit the site: read a batch + /// from , then commit + /// via once the central + /// repository accepts the rows. The Ask-based central path is wired by + /// the caller — we just expose the queue surface. + /// + /// + /// The production wire shape will be: + /// central PullAuditEvents RPC → site SiteStreamGrpcServer.PullAuditEvents + /// → ISiteAuditQueue.ReadPendingSinceAsync → marshal proto → reply + /// followed by central InsertIfNotExistsAsync per row, then the site flips + /// the row to Reconciled on the next pull cycle. The stub collapses the + /// two halves (pull + commit) because the actor under test (the + /// reconciliation actor) is the side that drives both via the + /// IPullAuditEventsClient seam — committing back to the site after the + /// repository write is the reconciliation-actor invariant we want to + /// observe end-to-end. + /// + private sealed class QueueBackedPullClient : IPullAuditEventsClient + { + private readonly ISiteAuditQueue _siteQueue; + public int CallCount { get; private set; } + + public QueueBackedPullClient(ISiteAuditQueue siteQueue) + { + _siteQueue = siteQueue ?? throw new ArgumentNullException(nameof(siteQueue)); + } + + public async Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) + { + CallCount++; + + var rows = await _siteQueue + .ReadPendingSinceAsync(sinceUtc, batchSize, ct) + .ConfigureAwait(false); + + // Commit immediately on the site side — once the actor has the + // batch in hand it will InsertIfNotExistsAsync centrally; if the + // central insert later throws on a specific row, idempotency + // guarantees the next pull cycle does NOT re-fetch the row (it's + // already Reconciled on the site) but also does not surface the + // failure here. The brief calls this "ack-after-persist" — the + // production gRPC server will flip to Reconciled inside its + // PullAuditEvents handler after the central side has acknowledged + // (per Bundle A's race-fix, central is idempotent on EventId). + // + // MoreAvailable is true iff the read filled the batch — the actor + // uses this to decide whether to follow up on the next tick. + if (rows.Count > 0) + { + var ids = rows.Select(e => e.EventId).ToList(); + await _siteQueue.MarkReconciledAsync(ids, ct).ConfigureAwait(false); + } + + return new PullAuditEventsResponse(rows, MoreAvailable: rows.Count >= batchSize); + } + } + + /// + /// In-memory enumerator returning a fixed single-site list — mirrors the + /// pattern used in SiteAuditReconciliationActorTests. + /// + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + private static AuditEvent NewEvent(string siteId, DateTime occurredAt) => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = occurredAt, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + Target = "external-system-a/method", + }; + + private SqliteAuditWriter CreateInMemorySqliteWriter() => + new SqliteAuditWriter( + Options.Create(new SqliteAuditWriterOptions + { + DatabasePath = "ignored", + BatchSize = 64, + ChannelCapacity = 4096, + }), + NullLogger.Instance, + connectionStringOverride: + $"Data Source=file:outage-{Guid.NewGuid():N}?mode=memory&cache=shared"); + + private (IServiceProvider Sp, IActorRef Ingest) BuildCentralPipeline() + { + var services = new ServiceCollection(); + services.AddDbContext(opts => + opts.UseSqlServer(_fixture.ConnectionString)); + services.AddScoped(sp => + new AuditLogRepository(sp.GetRequiredService())); + var sp = services.BuildServiceProvider(); + + var ingest = Sys.ActorOf(Props.Create(() => new AuditLogIngestActor( + sp, + NullLogger.Instance))); + return (sp, ingest); + } + + private static SiteAuditReconciliationOptions FastTickOptions(int batchSize = 256) => new() + { + ReconciliationIntervalSeconds = 300, + ReconciliationIntervalOverride = TimeSpan.FromMilliseconds(100), + BatchSize = batchSize, + StalledAfterNonDrainingCycles = 2, + }; + + // --------------------------------------------------------------------- + // 1. CentralOutage_200Events_Buffer_Then_Reconciliation_Catches_Up_NoDuplicates + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task CentralOutage_200Events_Buffer_Then_Reconciliation_Catches_Up_NoDuplicates() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = "outage-recon-" + Guid.NewGuid().ToString("N").Substring(0, 8); + + // Step 1: site accumulates 200 audit events during the simulated + // central outage. The push path is NOT wired here — every row stays + // Pending in the site SQLite store until reconciliation runs. + await using var sqliteWriter = CreateInMemorySqliteWriter(); + var baseOccurred = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + const int totalEvents = 200; + var written = new List(totalEvents); + + for (int i = 0; i < totalEvents; i++) + { + // Strictly monotonic OccurredAtUtc so the cursor can advance + // deterministically batch-by-batch — mirrors how a real script + // workload generates timestamps in wall-clock order. + var evt = NewEvent(siteId, baseOccurred.AddMilliseconds(i)); + written.Add(evt); + await sqliteWriter.WriteAsync(evt); + } + + // Sanity: every row is Pending (no push path wired, so nothing has + // been Forwarded or Reconciled yet). + var pending = await sqliteWriter.ReadPendingAsync(totalEvents + 10); + Assert.Equal(totalEvents, pending.Count); + + // Step 2: central comes online — wire the ingest actor + reconciliation + // actor. The pull client wraps the site queue directly (the production + // shape is one RPC call); each pull advances the actor's cursor and + // flips rows on the site to Reconciled. + var (sp, ingest) = BuildCentralPipeline(); + await using (sp as IAsyncDisposable ?? throw new InvalidOperationException()) + { + var pullClient = new QueueBackedPullClient(sqliteWriter); + var enumerator = new StaticEnumerator(new SiteEntry(siteId, "http://test:8083")); + + // BatchSize = 64 so the actor needs ~4 ticks to drain 200 rows. + // The "after 5 minutes" wording in the brief is satisfied by the + // fast-tick override (100 ms per tick) plus AwaitAssert giving + // the actor up to ~30 seconds to settle in real time. + var opts = FastTickOptions(batchSize: 64); + + // Standalone DI scope for the reconciliation actor (it shares the + // ingest actor's IServiceProvider so both writers see the same + // EF context configuration). + var reconciliationActor = Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor( + enumerator, + pullClient, + sp, + Options.Create(opts), + NullLogger.Instance))); + + // Step 3: assert central AuditLog has all 200 rows after the + // actor drains. Polling the real MSSQL repository — the test + // fixture has its own database so a count restricted to this + // SourceSiteId is exact. + await AwaitAssertAsync(async () => + { + await using var ctx = CreateContext(); + var count = await ctx.Set() + .Where(e => e.SourceSiteId == siteId) + .CountAsync(); + Assert.Equal(totalEvents, count); + }, + duration: TimeSpan.FromSeconds(30), + interval: TimeSpan.FromMilliseconds(200)); + + // Step 4: assert site rows flipped to Reconciled. + // ReadPendingAsync only returns Pending rows; after a full drain + // it must be empty. + await AwaitAssertAsync(async () => + { + var stillPending = await sqliteWriter.ReadPendingAsync(totalEvents + 10); + Assert.Empty(stillPending); + }, + duration: TimeSpan.FromSeconds(10), + interval: TimeSpan.FromMilliseconds(100)); + + // Step 5: assert no duplicates by EventId — central must have + // exactly the 200 rows we wrote at the site (one row per EventId). + await using var verify = CreateContext(); + var centralIds = await verify.Set() + .Where(e => e.SourceSiteId == siteId) + .Select(e => e.EventId) + .ToListAsync(); + Assert.Equal(totalEvents, centralIds.Count); + Assert.Equal(totalEvents, centralIds.Distinct().Count()); + // And every EventId we wrote at the site is present centrally. + Assert.True(written.All(w => centralIds.Contains(w.EventId)), + "every site-written EventId should be present centrally."); + + // Tear the actor down before disposing the harness; the actor's + // PostStop cancels its scheduled timer. + Sys.Stop(reconciliationActor); + } + } + + // --------------------------------------------------------------------- + // 2. ReconciliationPull_Idempotent_Across_Two_Cycles + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task ReconciliationPull_Idempotent_Across_Two_Cycles() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = "outage-idem-" + Guid.NewGuid().ToString("N").Substring(0, 8); + const int totalEvents = 50; + + await using var sqliteWriter = CreateInMemorySqliteWriter(); + var baseOccurred = new DateTime(2026, 5, 20, 13, 0, 0, DateTimeKind.Utc); + for (int i = 0; i < totalEvents; i++) + { + await sqliteWriter.WriteAsync(NewEvent(siteId, baseOccurred.AddMilliseconds(i))); + } + + var (sp, _) = BuildCentralPipeline(); + await using (sp as IAsyncDisposable ?? throw new InvalidOperationException()) + { + var pullClient = new QueueBackedPullClient(sqliteWriter); + var enumerator = new StaticEnumerator(new SiteEntry(siteId, "http://test:8083")); + + var reconciliationActor = Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor( + enumerator, + pullClient, + sp, + Options.Create(FastTickOptions()), + NullLogger.Instance))); + + // Wait for the first drain cycle to complete. + await AwaitAssertAsync(async () => + { + await using var ctx = CreateContext(); + var count = await ctx.Set() + .Where(e => e.SourceSiteId == siteId) + .CountAsync(); + Assert.Equal(totalEvents, count); + }, + duration: TimeSpan.FromSeconds(30), + interval: TimeSpan.FromMilliseconds(200)); + + // Wait for additional pull cycles to fire — the actor ticks every + // 100 ms so a 1 s settle leaves the actor with at least ~5 ticks + // past the initial drain. Each subsequent tick must be a no-op + // because every row is now Reconciled and outside the + // ReadPendingSinceAsync filter. + var callsAfterDrain = pullClient.CallCount; + await Task.Delay(TimeSpan.FromMilliseconds(800)); + Assert.True(pullClient.CallCount > callsAfterDrain, + $"expected additional pull calls after drain to validate idempotency, got {pullClient.CallCount} after {callsAfterDrain}"); + + // Central count must still be exactly totalEvents — no duplicates + // even though the cursor + read-Reconciled-too semantics could + // theoretically re-fetch on the second cycle. + await using var verify = CreateContext(); + var rows = await verify.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + Assert.Equal(totalEvents, rows.Count); + Assert.Equal(totalEvents, rows.Select(r => r.EventId).Distinct().Count()); + + Sys.Stop(reconciliationActor); + } + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/PartitionMaintenanceTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionMaintenanceTests.cs new file mode 100644 index 0000000..bd1a81c --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionMaintenanceTests.cs @@ -0,0 +1,278 @@ +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Interfaces; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Maintenance; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Integration; + +/// +/// Bundle F (#23 M6-T12) end-to-end tests for the +/// hosted service running +/// the real EF/MSSQL against the +/// per-class . The migration seeds +/// boundaries for every month Jan 2026 – Dec 2027, so the eager startup tick +/// can be exercised both for the "future covered" no-op case and for the +/// "lookahead larger than covered" SPLIT case. +/// +/// +/// Tests within this class share one fixture DB — boundaries added by one +/// test persist across the next. Each test reads the max boundary at the +/// start and computes its lookahead relative to it, mirroring the pattern +/// used by the per-component AuditLogPartitionMaintenanceTests in +/// ScadaLink.ConfigurationDatabase.Tests. +/// +public class PartitionMaintenanceTests : IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public PartitionMaintenanceTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + /// + /// Builds the central-side DI graph for the hosted service: scoped EF + /// context + scoped matching how + /// AddConfigurationDatabase wires the production composition root. + /// + private ServiceProvider BuildProvider() + { + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + return services.BuildServiceProvider(); + } + + private static async Task ReadMaxBoundaryAsync(IServiceProvider sp) + { + await using var scope = sp.CreateAsyncScope(); + var maintenance = scope.ServiceProvider.GetRequiredService(); + return await maintenance.GetMaxBoundaryAsync(); + } + + /// + /// Mirrors the helper in + /// AuditLogPartitionMaintenanceTests.LookaheadForExtraBoundaries: + /// the smallest lookahead value that lands the SPLIT horizon exactly + /// months past the current max. + /// + private static int LookaheadForExtraBoundaries(DateTime max, int extraBoundaries) + { + var nowFirstOfNextMonth = FirstOfNextMonth(DateTime.UtcNow); + var monthsToMax = ((max.Year - nowFirstOfNextMonth.Year) * 12) + + max.Month - nowFirstOfNextMonth.Month; + return monthsToMax + extraBoundaries; + } + + private static int LookaheadInsideExistingRange(DateTime max) + { + var now = DateTime.UtcNow; + var months = ((max.Year - now.Year) * 12) + max.Month - now.Month - 1; + return Math.Max(1, months); + } + + private static DateTime FirstOfNextMonth(DateTime instant) + { + var firstOfThisMonth = new DateTime(instant.Year, instant.Month, 1, 0, 0, 0, DateTimeKind.Utc); + return firstOfThisMonth.AddMonths(1); + } + + /// + /// Awaits one full tick of the hosted service. The service runs an + /// eager startup tick inside 's + /// continuation, but the continuation is dispatched on a background + /// Task.Run — so we poll the side effect (the boundary count or + /// max-boundary value) until it changes. + /// + private async Task StartAndAwaitStartupTickAsync( + AuditLogPartitionMaintenanceService svc, + Func> awaitCondition, + TimeSpan timeout) + { + await svc.StartAsync(CancellationToken.None); + var deadline = DateTime.UtcNow + timeout; + while (DateTime.UtcNow < deadline) + { + if (await awaitCondition()) + { + return; + } + await Task.Delay(50); + } + } + + // --------------------------------------------------------------------- + // 1. EndToEnd_DefaultLookahead_NoSplit_WhenFutureCovered + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_DefaultLookahead_NoSplit_WhenFutureCovered() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var sp = BuildProvider(); + + // The migration seeds boundaries through Dec 2027. With default + // lookahead = 1 and today = ~2026-05-20, horizon = + // NormalizeToFirstOfMonth(now) + 1 = 2026-07-01, well within the + // seeded range, so the startup tick should issue zero SPLITs. + var maxBefore = await ReadMaxBoundaryAsync(sp); + Assert.NotNull(maxBefore); + + // Skip if the fixture DB already has boundaries past Dec 2027 from + // a prior test in this class — the lookahead-already-covered path + // is what we want to exercise, regardless of how far past Dec 2027 + // the boundary may be. + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, // long enough that only the startup tick fires inside the test window + LookaheadMonths = 1, + }); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + + // Drive the startup tick. There is no public completion handle; + // poll until either (a) the max boundary changes (which would be a + // failure for this test) or (b) the polling window expires (success). + await svc.StartAsync(CancellationToken.None); + await Task.Delay(TimeSpan.FromSeconds(2)); + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + // Assert the max boundary is unchanged: no SPLIT was issued. + var maxAfter = await ReadMaxBoundaryAsync(sp); + Assert.Equal(maxBefore, maxAfter); + } + + // --------------------------------------------------------------------- + // 2. EndToEnd_LookaheadLargerThanCovered_Splits_NewBoundaries + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_LookaheadLargerThanCovered_Splits_NewBoundaries() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var sp = BuildProvider(); + + var maxBefore = await ReadMaxBoundaryAsync(sp); + Assert.NotNull(maxBefore); + + // Pick a lookahead that adds exactly two new boundaries past the + // current max. The expected new boundaries are max+1mo and max+2mo. + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 2); + var expectedFirstNew = maxBefore.Value.AddMonths(1); + var expectedSecondNew = maxBefore.Value.AddMonths(2); + + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = lookahead, + }); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + + // Drive the startup tick. Wait until max boundary moves forward by + // the expected amount; SPLIT against MSSQL can take a second or two + // on a busy dev container. + await StartAndAwaitStartupTickAsync( + svc, + async () => + { + var current = await ReadMaxBoundaryAsync(sp); + return current == expectedSecondNew; + }, + timeout: TimeSpan.FromSeconds(15)); + + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + var maxAfter = await ReadMaxBoundaryAsync(sp); + // Two new boundaries should be present after the startup tick. The + // hosted service does not surface the added-list directly (it logs + // only at Information), so we assert via the max-boundary delta. + Assert.Equal(expectedSecondNew, maxAfter); + // Sanity: the intermediate boundary was also added (the loop + // SPLITs every month from max+1 up to horizon, in order). + Assert.True(expectedFirstNew < expectedSecondNew); + } + + // --------------------------------------------------------------------- + // 3. EndToEnd_PartitionMaintenance_Idempotent_OverTwoRuns + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_PartitionMaintenance_Idempotent_OverTwoRuns() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var sp = BuildProvider(); + + var maxBefore = await ReadMaxBoundaryAsync(sp); + Assert.NotNull(maxBefore); + + // Add exactly one new boundary on the first run. + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 1); + var expectedAdded = maxBefore.Value.AddMonths(1); + + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = lookahead, + }); + + // First run. + var svc1 = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + await StartAndAwaitStartupTickAsync( + svc1, + async () => + { + var current = await ReadMaxBoundaryAsync(sp); + return current == expectedAdded; + }, + timeout: TimeSpan.FromSeconds(15)); + await svc1.StopAsync(CancellationToken.None); + svc1.Dispose(); + + var maxAfterFirst = await ReadMaxBoundaryAsync(sp); + Assert.Equal(expectedAdded, maxAfterFirst); + + // Second run with the SAME lookahead value. Because the boundary + // is already covered, the EnsureLookaheadAsync call must be a + // no-op — max boundary is unchanged AND no exception is thrown. + var svc2 = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + await svc2.StartAsync(CancellationToken.None); + // Wait long enough that the startup tick would have fired and + // logged any boundary addition; the boundary state must remain + // unchanged after the wait. + await Task.Delay(TimeSpan.FromSeconds(2)); + await svc2.StopAsync(CancellationToken.None); + svc2.Dispose(); + + var maxAfterSecond = await ReadMaxBoundaryAsync(sp); + Assert.Equal(maxAfterFirst, maxAfterSecond); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/PartitionPurgeTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionPurgeTests.cs new file mode 100644 index 0000000..69db1b1 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionPurgeTests.cs @@ -0,0 +1,354 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Data.SqlClient; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Configuration; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Integration; + +/// +/// Bundle F (#23 M6-T11) end-to-end test for the daily partition-switch +/// purge: seeds three monthly partitions (Jan / Feb / Mar 2026) with direct +/// INSERTs that bypass the standard repository ingest path (so the seed +/// timestamps are explicit), drives against +/// the real + per-test +/// database, and asserts: +/// +/// The oldest partition (Jan) is removed. +/// Newer partitions (Feb + Mar) are untouched. +/// The UX_AuditLog_EventId unique index survives the +/// drop-and-rebuild dance. +/// remains +/// idempotent against the rebuilt index after the purge. +/// +/// +/// +/// The brief calls out that direct INSERTs bypass the writer role's INSERT-only +/// grant; the fixture connects as sa (see +/// 's default admin connection string), so +/// the seed step does not need the writer role at all. The drop-and-rebuild +/// dance itself runs under the same admin connection because the test owns +/// the database — the role granularity is exercised in the repository tests, +/// not here. +/// +public class PartitionPurgeTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public PartitionPurgeTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + /// + /// Direct INSERT into dbo.AuditLog bypassing + /// . Used by the + /// seed step so the test can place rows in arbitrary partitions without + /// the repository's idempotency wrapper or ingest-stamping behaviour + /// affecting the seed payload. + /// + private async Task DirectInsertAsync( + SqlConnection conn, + Guid eventId, + DateTime occurredAtUtc, + string siteId) + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = @" +INSERT INTO dbo.AuditLog + (EventId, OccurredAtUtc, IngestedAtUtc, Channel, Kind, CorrelationId, + SourceSiteId, SourceInstanceId, SourceScript, Actor, Target, Status, + HttpStatus, DurationMs, ErrorMessage, ErrorDetail, RequestSummary, + ResponseSummary, PayloadTruncated, Extra, ForwardState) +VALUES + (@EventId, @OccurredAtUtc, @IngestedAtUtc, 'ApiOutbound', 'ApiCall', NULL, + @SourceSiteId, NULL, NULL, NULL, NULL, 'Delivered', + NULL, NULL, NULL, NULL, NULL, + NULL, 0, NULL, NULL);"; + cmd.Parameters.Add("@EventId", System.Data.SqlDbType.UniqueIdentifier).Value = eventId; + // SqlDbType.DateTime2 with explicit Scale 7 matches the + // OccurredAtUtc column shape (datetime2(7)) and avoids the implicit + // narrowing that SqlClient's default DateTime → datetime applies via + // AddWithValue. Critical for partition assignment: the partition + // function key column is datetime2(7); a narrowed value would still + // land in the correct partition for first-of-month seeds, but + // explicit typing here documents the intent and matches how the + // production repository INSERT shapes its parameters. + var occurredParam = cmd.Parameters.Add("@OccurredAtUtc", System.Data.SqlDbType.DateTime2); + occurredParam.Scale = 7; + occurredParam.Value = occurredAtUtc; + var ingestedParam = cmd.Parameters.Add("@IngestedAtUtc", System.Data.SqlDbType.DateTime2); + ingestedParam.Scale = 7; + ingestedParam.Value = DateTime.UtcNow; + cmd.Parameters.Add("@SourceSiteId", System.Data.SqlDbType.VarChar, 64).Value = siteId; + await cmd.ExecuteNonQueryAsync(); + } + + /// + /// Asserts that UX_AuditLog_EventId exists in + /// sys.indexes. The drop-and-rebuild dance briefly removes the + /// index inside its transaction; this check is meant to fire AFTER the + /// actor's purge tick has committed so the rebuilt index is observable. + /// + private static async Task AssertUxIndexExistsAsync(SqlConnection conn) + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = @" +SELECT COUNT(*) +FROM sys.indexes +WHERE name = 'UX_AuditLog_EventId' + AND object_id = OBJECT_ID('dbo.AuditLog');"; + var raw = await cmd.ExecuteScalarAsync(); + var count = Convert.ToInt32(raw); + Assert.True(count == 1, $"UX_AuditLog_EventId should be present post-purge; sys.indexes count was {count}."); + } + + private IActorRef CreateActor( + IServiceProvider sp, + AuditLogPurgeOptions purgeOptions, + AuditLogOptions auditOptions) + { + return Sys.ActorOf(Props.Create(() => new AuditLogPurgeActor( + sp, + Options.Create(purgeOptions), + Options.Create(auditOptions), + NullLogger.Instance))); + } + + private static (DateTime Jan, DateTime Feb, DateTime Mar) SeedOccurredAt() => ( + new DateTime(2026, 1, 15, 0, 0, 0, DateTimeKind.Utc), + new DateTime(2026, 2, 15, 0, 0, 0, DateTimeKind.Utc), + new DateTime(2026, 3, 15, 0, 0, 0, DateTimeKind.Utc)); + + // --------------------------------------------------------------------- + // 1. EndToEnd_OldestPartition_PurgedViaActor_NewerKept + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_OldestPartition_PurgedViaActor_NewerKept() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Test date is ~2026-05-20 per environment. We want a threshold that + // sits strictly between Jan 15 (the Jan partition's MAX) and Feb 15 + // (the Feb partition's MAX) so only the Jan-2026 partition is + // eligible for purge. RetentionDays = 100 gives a threshold of + // ~2026-02-09 — Jan 15 is older (purged), Feb 15 and Mar 15 are + // newer (kept). The window between Jan 15 and Feb 15 is wide enough + // (~30 days) to tolerate any plausible test-clock drift in CI. + var siteId = "purge-e2e-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEventId = Guid.NewGuid(); + var febEventId = Guid.NewGuid(); + var marEventId = Guid.NewGuid(); + var (janOccurred, febOccurred, marOccurred) = SeedOccurredAt(); + + await using (var seedConn = _fixture.OpenConnection()) + { + await DirectInsertAsync(seedConn, janEventId, janOccurred, siteId); + await DirectInsertAsync(seedConn, febEventId, febOccurred, siteId); + await DirectInsertAsync(seedConn, marEventId, marOccurred, siteId); + } + + // Wire the actor with a real EF context against the fixture DB. + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + + var purgeOptions = new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }; + var auditOptions = new AuditLogOptions { RetentionDays = 100 }; + + CreateActor(sp, purgeOptions, auditOptions); + + // Wait for the actor's tick to purge the Jan-2026 partition. + // Concurrent test runs against the same fixture might also create + // eligible partitions, but each test class owns its own fixture DB + // (MsSqlMigrationFixture seeds a guid-named DB per class), so the + // Jan-2026 boundary is the only one this test can have produced. + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + var matched = probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + Assert.True(matched.RowsDeleted >= 1, + $"Expected RowsDeleted >= 1 for Jan-2026 boundary; got {matched.RowsDeleted}."); + + // Allow a brief settle in case the actor is mid-tick on Feb/Mar + // (it shouldn't be, since RetentionDays = 90 means only Jan is + // eligible, but the actor MAY re-enumerate quickly while we read). + await Task.Delay(TimeSpan.FromMilliseconds(500)); + + await using var verify = CreateContext(); + var rows = await verify.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + // Jan removed; Feb + Mar untouched. Because the test owns the site + // id and the fixture DB, exact set membership is observable. + Assert.DoesNotContain(rows, r => r.EventId == janEventId); + Assert.Contains(rows, r => r.EventId == febEventId); + Assert.Contains(rows, r => r.EventId == marEventId); + } + + // --------------------------------------------------------------------- + // 2. EndToEnd_UxIndexRebuilt_AfterPurge + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_UxIndexRebuilt_AfterPurge() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Same shape as test 1 — purge the Jan-2026 partition and then + // assert the UX_AuditLog_EventId index is still present. The + // drop-and-rebuild dance briefly removes it inside its transaction + // (the SWITCH PARTITION step requires the non-aligned unique index + // to be absent), but step 5 rebuilds it before committing. Sanity- + // checking the post-COMMIT shape here documents the invariant in an + // assertable way. + var siteId = "purge-uxidx-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEventId = Guid.NewGuid(); + var (janOccurred, _, _) = SeedOccurredAt(); + + await using (var seedConn = _fixture.OpenConnection()) + { + await DirectInsertAsync(seedConn, janEventId, janOccurred, siteId); + } + + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + + CreateActor( + sp, + new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }, + new AuditLogOptions { RetentionDays = 90 }); + + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + + // Open a fresh connection (the actor's pool is owned by EF) and + // assert the index is present post-purge. + await using var check = _fixture.OpenConnection(); + await AssertUxIndexExistsAsync(check); + } + + // --------------------------------------------------------------------- + // 3. EndToEnd_InsertIfNotExistsAsync_StillIdempotent_AfterPurge + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_InsertIfNotExistsAsync_StillIdempotent_AfterPurge() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Seed + purge a Jan-2026 row, THEN exercise InsertIfNotExistsAsync + // twice for a fresh (May-2026) EventId. The second call must be a + // no-op (duplicate-key collision swallowed by the repository, per + // M2 Bundle A's race-fix) — which means the rebuilt + // UX_AuditLog_EventId unique index is functioning as intended. + var siteId = "purge-idem-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEventId = Guid.NewGuid(); + var (janOccurred, _, _) = SeedOccurredAt(); + + await using (var seedConn = _fixture.OpenConnection()) + { + await DirectInsertAsync(seedConn, janEventId, janOccurred, siteId); + } + + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + + CreateActor( + sp, + new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }, + new AuditLogOptions { RetentionDays = 90 }); + + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + + // Settle then exercise InsertIfNotExistsAsync twice for the same + // EventId. The repository's idempotency relies on + // UX_AuditLog_EventId being present so the IF NOT EXISTS … INSERT + // race window resolves to a duplicate-key violation the repo + // swallows. If the index were missing here, two rows would land + // and the second InsertIfNotExistsAsync would silently double-insert. + await Task.Delay(TimeSpan.FromMilliseconds(500)); + + var freshEventId = Guid.NewGuid(); + var freshOccurred = new DateTime(2026, 5, 15, 12, 0, 0, DateTimeKind.Utc); + var freshSite = "purge-idem-fresh-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var freshEvt = new AuditEvent + { + EventId = freshEventId, + OccurredAtUtc = freshOccurred, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = freshSite, + Target = "system-x/method", + }; + + await using (var ctx = CreateContext()) + { + var repo = new AuditLogRepository(ctx); + await repo.InsertIfNotExistsAsync(freshEvt); + // Same row a second time — must be a silent no-op. + await repo.InsertIfNotExistsAsync(freshEvt); + } + + await using var verify = CreateContext(); + var rows = await verify.Set() + .Where(e => e.SourceSiteId == freshSite) + .ToListAsync(); + Assert.Single(rows); + Assert.Equal(freshEventId, rows[0].EventId); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs index a0c5c85..3b55da3 100644 --- a/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs @@ -9,6 +9,7 @@ using ScadaLink.AuditLog.Site.Telemetry; using ScadaLink.AuditLog.Tests.Integration.Infrastructure; using ScadaLink.Commons.Entities.Audit; using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Types.Audit; using ScadaLink.Commons.Types.Enums; using ScadaLink.ConfigurationDatabase; diff --git a/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterBacklogStatsTests.cs b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterBacklogStatsTests.cs new file mode 100644 index 0000000..95f9570 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterBacklogStatsTests.cs @@ -0,0 +1,136 @@ +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Site; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Types.Enums; + +namespace ScadaLink.AuditLog.Tests.Site; + +/// +/// Bundle E (M6-T6) tests for . +/// Exercises the health-metric surface that SiteAuditBacklogReporter +/// polls every 30 s and pushes onto the site health report as +/// SiteAuditBacklog. +/// +public class SqliteAuditWriterBacklogStatsTests : IDisposable +{ + private readonly string _dbPath; + + public SqliteAuditWriterBacklogStatsTests() + { + // OnDiskBytes assertions only make sense against a real file — the + // shared-cache in-memory mode returns 0 for the file size, so this + // suite is opinionated about file-backed storage. Tests in + // SqliteAuditWriterWriteTests use in-memory for performance reasons. + _dbPath = Path.Combine(Path.GetTempPath(), + $"audit-backlog-stats-{Guid.NewGuid():N}.db"); + } + + public void Dispose() + { + if (File.Exists(_dbPath)) + { + try { File.Delete(_dbPath); } catch { /* test cleanup best-effort */ } + } + } + + private SqliteAuditWriter CreateWriter() + { + var options = new SqliteAuditWriterOptions { DatabasePath = _dbPath }; + return new SqliteAuditWriter( + Options.Create(options), + NullLogger.Instance); + } + + private static AuditEvent NewEvent(DateTime? occurredAtUtc = null) => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = occurredAtUtc ?? DateTime.UtcNow, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + PayloadTruncated = false, + }; + + [Fact] + public async Task EmptyDb_Returns_Zero_Null_AndZeroBytes() + { + // No file exists yet — the writer ctor creates one but no rows are + // inserted; the snapshot should report a clean queue. OnDiskBytes is + // allowed to be zero (fresh ftruncate) OR small (page header) — the + // contract only requires non-negative; we assert >= 0 and exercise + // the pending fields strictly. + await using var writer = CreateWriter(); + + var snapshot = await writer.GetBacklogStatsAsync(); + + Assert.Equal(0, snapshot.PendingCount); + Assert.Null(snapshot.OldestPendingUtc); + Assert.True(snapshot.OnDiskBytes >= 0, + $"OnDiskBytes must be non-negative, got {snapshot.OnDiskBytes}"); + } + + [Fact] + public async Task Pending_5_Returns_5() + { + await using var writer = CreateWriter(); + + for (var i = 0; i < 5; i++) + { + await writer.WriteAsync(NewEvent()); + } + + var snapshot = await writer.GetBacklogStatsAsync(); + + Assert.Equal(5, snapshot.PendingCount); + } + + [Fact] + public async Task OldestPending_Is_Earliest_OccurredAtUtc() + { + await using var writer = CreateWriter(); + + var t1 = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var t2 = new DateTime(2026, 5, 20, 10, 1, 0, DateTimeKind.Utc); + var t3 = new DateTime(2026, 5, 20, 10, 2, 0, DateTimeKind.Utc); + + // Insert out of order so the snapshot is not "the last write" by + // accident — the OldestPendingUtc must come from a column-min, not + // an insertion-order proxy. + await writer.WriteAsync(NewEvent(t2)); + await writer.WriteAsync(NewEvent(t1)); + await writer.WriteAsync(NewEvent(t3)); + + var snapshot = await writer.GetBacklogStatsAsync(); + + Assert.Equal(3, snapshot.PendingCount); + Assert.NotNull(snapshot.OldestPendingUtc); + // The DB round-trips OccurredAtUtc through the "o" format which + // preserves Kind=Utc — assert tick-equality. + Assert.Equal(t1, snapshot.OldestPendingUtc!.Value); + } + + [Fact] + public async Task OnDiskBytes_ReturnsFileSize() + { + await using var writer = CreateWriter(); + + // Insert enough rows to grow the file past the empty schema baseline. + for (var i = 0; i < 100; i++) + { + await writer.WriteAsync(NewEvent()); + } + + var snapshot = await writer.GetBacklogStatsAsync(); + + // The exact size depends on SQLite page allocation, but a file-backed + // db with 100 inserted rows MUST be larger than the empty schema + // (a few pages, ~4 KB). The implementation should return the + // FileInfo.Length value verbatim. + Assert.True(File.Exists(_dbPath), $"DB file should exist at {_dbPath}"); + var expected = new FileInfo(_dbPath).Length; + Assert.Equal(expected, snapshot.OnDiskBytes); + Assert.True(snapshot.OnDiskBytes > 0, + $"after 100 inserts OnDiskBytes must be > 0, got {snapshot.OnDiskBytes}"); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs index b490142..f9fe5c4 100644 --- a/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs @@ -204,4 +204,153 @@ public class SqliteAuditWriterWriteTests await writer.MarkForwardedAsync(phantomIds); // No assertion needed: the call must complete without throwing. } + + // ----- M6 reconciliation pull surface ----- // + + [Fact] + public async Task ReadPendingSinceAsync_Returns_PendingAndForwarded_OldestFirst_LimitedToN() + { + var (writer, dataSource) = CreateWriter(nameof(ReadPendingSinceAsync_Returns_PendingAndForwarded_OldestFirst_LimitedToN)); + await using var _ = writer; + + var baseTime = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + var evts = new[] + { + NewEvent(occurredAtUtc: baseTime.AddSeconds(5)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(1)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(3)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(2)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(4)), + }; + foreach (var e in evts) await writer.WriteAsync(e); + + // Flip half to Forwarded — they must still surface in the reconciliation pull + // because central hasn't confirmed they were ingested yet. + await writer.MarkForwardedAsync(new[] { evts[0].EventId, evts[2].EventId }); + + var rows = await writer.ReadPendingSinceAsync(sinceUtc: DateTime.MinValue, batchSize: 3); + + Assert.Equal(3, rows.Count); + Assert.Equal(baseTime.AddSeconds(1), rows[0].OccurredAtUtc); + Assert.Equal(baseTime.AddSeconds(2), rows[1].OccurredAtUtc); + Assert.Equal(baseTime.AddSeconds(3), rows[2].OccurredAtUtc); + } + + [Fact] + public async Task ReadPendingSinceAsync_ExcludesRowsOlderThanSinceUtc() + { + var (writer, _) = CreateWriter(nameof(ReadPendingSinceAsync_ExcludesRowsOlderThanSinceUtc)); + await using var _w = writer; + + var baseTime = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + var old = NewEvent(occurredAtUtc: baseTime.AddSeconds(-30)); + var newer1 = NewEvent(occurredAtUtc: baseTime.AddSeconds(10)); + var newer2 = NewEvent(occurredAtUtc: baseTime.AddSeconds(20)); + + await writer.WriteAsync(old); + await writer.WriteAsync(newer1); + await writer.WriteAsync(newer2); + + var rows = await writer.ReadPendingSinceAsync(sinceUtc: baseTime, batchSize: 10); + + Assert.Equal(2, rows.Count); + Assert.Contains(rows, r => r.EventId == newer1.EventId); + Assert.Contains(rows, r => r.EventId == newer2.EventId); + Assert.DoesNotContain(rows, r => r.EventId == old.EventId); + } + + [Fact] + public async Task ReadPendingSinceAsync_ExcludesReconciledRows() + { + var (writer, _) = CreateWriter(nameof(ReadPendingSinceAsync_ExcludesReconciledRows)); + await using var _w = writer; + + var baseTime = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + var pending = NewEvent(occurredAtUtc: baseTime); + var reconciled = NewEvent(occurredAtUtc: baseTime.AddSeconds(1)); + + await writer.WriteAsync(pending); + await writer.WriteAsync(reconciled); + await writer.MarkReconciledAsync(new[] { reconciled.EventId }); + + var rows = await writer.ReadPendingSinceAsync(sinceUtc: DateTime.MinValue, batchSize: 10); + + Assert.Single(rows); + Assert.Equal(pending.EventId, rows[0].EventId); + } + + [Fact] + public async Task ReadPendingSinceAsync_InvalidBatchSize_Throws() + { + var (writer, _) = CreateWriter(nameof(ReadPendingSinceAsync_InvalidBatchSize_Throws)); + await using var _w = writer; + + await Assert.ThrowsAsync( + () => writer.ReadPendingSinceAsync(DateTime.MinValue, batchSize: 0)); + await Assert.ThrowsAsync( + () => writer.ReadPendingSinceAsync(DateTime.MinValue, batchSize: -3)); + } + + [Fact] + public async Task MarkReconciledAsync_FlipsPendingAndForwarded_To_Reconciled() + { + var (writer, dataSource) = CreateWriter(nameof(MarkReconciledAsync_FlipsPendingAndForwarded_To_Reconciled)); + await using var _ = writer; + + var a = NewEvent(); + var b = NewEvent(); + var c = NewEvent(); + await writer.WriteAsync(a); + await writer.WriteAsync(b); + await writer.WriteAsync(c); + + // b is currently Forwarded; a and c are Pending. + await writer.MarkForwardedAsync(new[] { b.EventId }); + + await writer.MarkReconciledAsync(new[] { a.EventId, b.EventId, c.EventId }); + + using var connection = OpenVerifierConnection(dataSource); + using var cmd = connection.CreateCommand(); + cmd.CommandText = "SELECT ForwardState, COUNT(*) FROM AuditLog GROUP BY ForwardState;"; + using var reader = cmd.ExecuteReader(); + var byState = new Dictionary(); + while (reader.Read()) + { + byState[reader.GetString(0)] = reader.GetInt64(1); + } + + Assert.Equal(3, byState[AuditForwardState.Reconciled.ToString()]); + Assert.False(byState.ContainsKey(AuditForwardState.Pending.ToString())); + Assert.False(byState.ContainsKey(AuditForwardState.Forwarded.ToString())); + } + + [Fact] + public async Task MarkReconciledAsync_Idempotent_LeavesAlreadyReconciledRowsUntouched() + { + var (writer, dataSource) = CreateWriter(nameof(MarkReconciledAsync_Idempotent_LeavesAlreadyReconciledRowsUntouched)); + await using var _ = writer; + + var a = NewEvent(); + await writer.WriteAsync(a); + await writer.MarkReconciledAsync(new[] { a.EventId }); + // Re-call must not throw and must leave the single row Reconciled. + await writer.MarkReconciledAsync(new[] { a.EventId }); + + using var connection = OpenVerifierConnection(dataSource); + using var cmd = connection.CreateCommand(); + cmd.CommandText = "SELECT ForwardState FROM AuditLog WHERE EventId = $id;"; + cmd.Parameters.AddWithValue("$id", a.EventId.ToString()); + + Assert.Equal(AuditForwardState.Reconciled.ToString(), cmd.ExecuteScalar() as string); + } + + [Fact] + public async Task MarkReconciledAsync_NonExistentId_NoThrow() + { + var (writer, _) = CreateWriter(nameof(MarkReconciledAsync_NonExistentId_NoThrow)); + await using var _w = writer; + + await writer.MarkReconciledAsync(new[] { Guid.NewGuid(), Guid.NewGuid() }); + // Completes without throwing. + } } diff --git a/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs index f8bef38..8d5d555 100644 --- a/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs @@ -7,6 +7,7 @@ using NSubstitute; using NSubstitute.ExceptionExtensions; using ScadaLink.AuditLog.Site.Telemetry; using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Types.Enums; using ScadaLink.Communication.Grpc; diff --git a/tests/ScadaLink.Communication.Tests/Protos/PullAuditEventsProtoTests.cs b/tests/ScadaLink.Communication.Tests/Protos/PullAuditEventsProtoTests.cs new file mode 100644 index 0000000..ba9ae37 --- /dev/null +++ b/tests/ScadaLink.Communication.Tests/Protos/PullAuditEventsProtoTests.cs @@ -0,0 +1,83 @@ +using Google.Protobuf; +using Google.Protobuf.WellKnownTypes; +using ScadaLink.Communication.Grpc; + +namespace ScadaLink.Communication.Tests.Protos; + +/// +/// Wire-format round-trip tests for the Audit Log (#23) M6 reconciliation +/// pull proto messages (, +/// ). Locks the additive contract the +/// central→site reconciliation puller depends on. +/// +public class PullAuditEventsProtoTests +{ + private static AuditEventDto NewAuditDto(Guid? id = null) => new() + { + EventId = (id ?? Guid.NewGuid()).ToString(), + OccurredAtUtc = Timestamp.FromDateTimeOffset( + new DateTimeOffset(2026, 5, 20, 10, 15, 30, 123, TimeSpan.Zero)), + Channel = "ApiOutbound", + Kind = "ApiCall", + Status = "Delivered", + SourceSiteId = "site-1", + }; + + [Fact] + public void PullAuditEventsRequest_RoundTrip() + { + var sinceUtc = Timestamp.FromDateTimeOffset( + new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero)); + + var original = new PullAuditEventsRequest + { + SinceUtc = sinceUtc, + BatchSize = 250, + }; + + var bytes = original.ToByteArray(); + var deserialized = PullAuditEventsRequest.Parser.ParseFrom(bytes); + + Assert.Equal(sinceUtc, deserialized.SinceUtc); + Assert.Equal(250, deserialized.BatchSize); + } + + [Fact] + public void PullAuditEventsResponse_RoundTrip_WithEvents_And_MoreAvailable() + { + var dtos = Enumerable.Range(0, 4).Select(_ => NewAuditDto()).ToList(); + + var original = new PullAuditEventsResponse + { + MoreAvailable = true, + }; + original.Events.AddRange(dtos); + + var bytes = original.ToByteArray(); + var deserialized = PullAuditEventsResponse.Parser.ParseFrom(bytes); + + Assert.True(deserialized.MoreAvailable); + Assert.Equal(4, deserialized.Events.Count); + for (int i = 0; i < dtos.Count; i++) + { + Assert.Equal(dtos[i].EventId, deserialized.Events[i].EventId); + Assert.Equal(dtos[i].Status, deserialized.Events[i].Status); + Assert.Equal(dtos[i].SourceSiteId, deserialized.Events[i].SourceSiteId); + Assert.Equal(dtos[i].OccurredAtUtc, deserialized.Events[i].OccurredAtUtc); + } + } + + [Fact] + public void PullAuditEventsResponse_Empty_Yields_EmptyEvents() + { + var original = new PullAuditEventsResponse(); + Assert.Empty(original.Events); + Assert.False(original.MoreAvailable); + + var bytes = original.ToByteArray(); + var deserialized = PullAuditEventsResponse.Parser.ParseFrom(bytes); + + Assert.Empty(deserialized.Events); + Assert.False(deserialized.MoreAvailable); + } +} diff --git a/tests/ScadaLink.Communication.Tests/SiteStreamPullAuditEventsTests.cs b/tests/ScadaLink.Communication.Tests/SiteStreamPullAuditEventsTests.cs new file mode 100644 index 0000000..d9a6ac2 --- /dev/null +++ b/tests/ScadaLink.Communication.Tests/SiteStreamPullAuditEventsTests.cs @@ -0,0 +1,185 @@ +using Akka.TestKit.Xunit2; +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using NSubstitute; +using NSubstitute.ExceptionExtensions; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.Communication.Grpc; + +namespace ScadaLink.Communication.Tests; + +/// +/// Bundle A A2 tests for . +/// Verifies the request → ISiteAuditQueue.ReadPendingSinceAsync → response → +/// MarkReconciledAsync round-trip through the gRPC handler. The queue is an +/// NSubstitute stub so the tests never touch SQLite. +/// +public class SiteStreamPullAuditEventsTests : TestKit +{ + private readonly ISiteStreamSubscriber _subscriber = Substitute.For(); + + private SiteStreamGrpcServer CreateServer() => + new(_subscriber, NullLogger.Instance); + + private static ServerCallContext NewContext(CancellationToken ct = default) + { + var context = Substitute.For(); + context.CancellationToken.Returns(ct); + return context; + } + + private static AuditEvent NewEvent(DateTime? occurredAt = null) => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = occurredAt + ?? DateTime.SpecifyKind(new DateTime(2026, 5, 20, 10, 0, 0), DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = "site-1", + PayloadTruncated = false, + ForwardState = AuditForwardState.Pending, + }; + + [Fact] + public async Task PullAuditEvents_NoQueueWired_ReturnsEmptyResponse() + { + var server = CreateServer(); + // Intentionally do NOT call SetSiteAuditQueue — simulates a central-only + // host or a wiring-incomplete startup window. + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddMinutes(-5)), + BatchSize = 100, + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Empty(response.Events); + Assert.False(response.MoreAvailable); + } + + [Fact] + public async Task PullAuditEvents_With5PendingRows_ReturnsAllFiveDtos_AndFlipsToReconciled() + { + var queue = Substitute.For(); + var events = Enumerable.Range(0, 5).Select(_ => NewEvent()).ToList(); + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)events); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, // larger than returned count so MoreAvailable should be false + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Equal(5, response.Events.Count); + Assert.False(response.MoreAvailable); // 5 < 100 + var expectedIds = events.Select(e => e.EventId.ToString()).ToHashSet(); + Assert.True(expectedIds.SetEquals(response.Events.Select(d => d.EventId).ToHashSet())); + + // Verify MarkReconciledAsync received the same 5 ids (best-effort flip). + await queue.Received(1).MarkReconciledAsync( + Arg.Is>(ids => ids.Count == 5 && + ids.ToHashSet().SetEquals(events.Select(e => e.EventId))), + Arg.Any()); + } + + [Fact] + public async Task PullAuditEvents_RowsOlderThanSinceUtc_Excluded() + { + // The handler delegates the since-utc filter to ReadPendingSinceAsync; + // this test verifies it passes the request value through verbatim + // (no clock skew, no off-by-one) and that an empty queue response + // yields an empty gRPC response. + var queue = Substitute.For(); + var capturedSince = DateTime.MinValue; + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(call => + { + capturedSince = call.ArgAt(0); + return (IReadOnlyList)Array.Empty(); + }); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var since = DateTime.SpecifyKind(new DateTime(2026, 5, 20, 9, 30, 0), DateTimeKind.Utc); + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(since), + BatchSize = 50, + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Empty(response.Events); + Assert.False(response.MoreAvailable); + Assert.Equal(since, capturedSince); + // Empty result → no MarkReconciledAsync call (no rows to flip). + await queue.DidNotReceive().MarkReconciledAsync( + Arg.Any>(), Arg.Any()); + } + + [Fact] + public async Task PullAuditEvents_BatchSize3_Returns3Rows_MoreAvailableTrue() + { + var queue = Substitute.For(); + var events = Enumerable.Range(0, 3).Select(_ => NewEvent()).ToList(); + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)events); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 3, + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Equal(3, response.Events.Count); + // saturated batch → central needs to know to issue a follow-up pull + Assert.True(response.MoreAvailable); + } + + [Fact] + public async Task PullAuditEvents_MarkReconciledThrows_ResponseStillReturned() + { + // The Reconciled flip is best-effort — if it fails, the response must + // still surface so central can ingest the rows (and dedup on EventId + // when it pulls them again). + var queue = Substitute.For(); + var events = Enumerable.Range(0, 2).Select(_ => NewEvent()).ToList(); + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)events); + queue.MarkReconciledAsync(Arg.Any>(), Arg.Any()) + .ThrowsAsync(new InvalidOperationException("SQLite disposed mid-call")); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, + }; + + // Must NOT throw — the response is built before the flip and returned + // regardless of the flip outcome. + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Equal(2, response.Events.Count); + } +} diff --git a/tests/ScadaLink.ConfigurationDatabase.Tests/Maintenance/AuditLogPartitionMaintenanceTests.cs b/tests/ScadaLink.ConfigurationDatabase.Tests/Maintenance/AuditLogPartitionMaintenanceTests.cs new file mode 100644 index 0000000..2d8c6c8 --- /dev/null +++ b/tests/ScadaLink.ConfigurationDatabase.Tests/Maintenance/AuditLogPartitionMaintenanceTests.cs @@ -0,0 +1,182 @@ +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.ConfigurationDatabase.Maintenance; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; +using Xunit; + +namespace ScadaLink.ConfigurationDatabase.Tests.Maintenance; + +/// +/// Bundle D (#23 M6-T5) integration tests for +/// . Uses the same +/// as the AuditLog migration / repository +/// tests so the ALTER PARTITION FUNCTION DDL runs against the actual seeded +/// pf_AuditLog_Month. +/// +/// +/// The migration seeds boundaries for every month in 2026 and 2027 (Jan 2026 +/// through Dec 2027). Tests pick a lookahead relative to the current +/// max-boundary at test start (rather than a fixed-target date) so each test +/// is robust against earlier tests in the class having added boundaries to +/// the shared fixture DB. Tests run sequentially within the class via xunit's +/// per-class collection serialisation. +/// +public class AuditLogPartitionMaintenanceTests : IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public AuditLogPartitionMaintenanceTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + private AuditLogPartitionMaintenance NewMaintenance(ScadaLinkDbContext ctx) => + new(ctx, NullLogger.Instance); + + /// + /// Computes the lookahead-in-months required to fall strictly inside the + /// already-covered boundary range. Picks something well below the + /// distance from "now" to the current max — guaranteed not to need any + /// new SPLIT. + /// + private static int LookaheadInsideExistingRange(DateTime max) + { + var now = DateTime.UtcNow; + // (max - now) in whole months, minus a 1-month safety margin so we + // never accidentally hit the boundary horizon edge case. + var months = ((max.Year - now.Year) * 12) + max.Month - now.Month - 1; + return Math.Max(1, months); + } + + /// + /// Computes the lookahead-in-months required to add exactly + /// new boundaries past the current max. + /// + /// + /// EnsureLookaheadAsync defines horizon = + /// NormalizeToFirstOfMonth(UtcNow) + lookaheadMonths. The new + /// boundaries it issues are first-of-month values strictly greater than + /// max, up to and including horizon. So + /// lookaheadMonths = monthsBetween(NormalizeToFirstOfMonth(UtcNow), max) + extraBoundaries + /// is the exact value that lands horizon on max + extraBoundaries + /// months. + /// + private static int LookaheadForExtraBoundaries(DateTime max, int extraBoundaries) + { + var nowFirstOfMonth = FirstOfNextMonth(DateTime.UtcNow); + var monthsToMax = ((max.Year - nowFirstOfMonth.Year) * 12) + max.Month - nowFirstOfMonth.Month; + return monthsToMax + extraBoundaries; + } + + private static DateTime FirstOfNextMonth(DateTime instant) + { + var firstOfThisMonth = new DateTime(instant.Year, instant.Month, 1, 0, 0, 0, DateTimeKind.Utc); + return firstOfThisMonth.AddMonths(1); + } + + [SkippableFact] + public async Task EnsureLookahead_AlreadyHasFutureRange_NoSplit_ReturnsEmpty() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx = CreateContext(); + var maintenance = NewMaintenance(ctx); + + var max = await maintenance.GetMaxBoundaryAsync(); + Assert.NotNull(max); + + // Pick a lookahead small enough that horizon (NormalizeToFirstOfMonth(now) + // + lookahead) lands well INSIDE the already-covered range — no SPLIT + // should fire. + var lookahead = LookaheadInsideExistingRange(max.Value); + + var added = await maintenance.EnsureLookaheadAsync(lookahead); + + Assert.Empty(added); + + // Sanity: the max boundary is unchanged after the no-op call. + var maxAfter = await maintenance.GetMaxBoundaryAsync(); + Assert.Equal(max, maxAfter); + } + + [SkippableFact] + public async Task EnsureLookahead_NeedsOneMoreBoundary_Splits_Returns1Boundary() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx = CreateContext(); + var maintenance = NewMaintenance(ctx); + + var maxBefore = await maintenance.GetMaxBoundaryAsync(); + Assert.NotNull(maxBefore); + + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 1); + var expectedAdded = maxBefore.Value.AddMonths(1); + + var added = await maintenance.EnsureLookaheadAsync(lookahead); + + Assert.Single(added); + Assert.Equal(expectedAdded, added[0]); + + var maxAfter = await maintenance.GetMaxBoundaryAsync(); + Assert.Equal(expectedAdded, maxAfter); + } + + [SkippableFact] + public async Task EnsureLookahead_NeedsThreeBoundaries_Splits_Returns3Boundaries() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx = CreateContext(); + var maintenance = NewMaintenance(ctx); + + var maxBefore = await maintenance.GetMaxBoundaryAsync(); + Assert.NotNull(maxBefore); + + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 3); + + var added = await maintenance.EnsureLookaheadAsync(lookahead); + + Assert.Equal(3, added.Count); + Assert.Equal(maxBefore.Value.AddMonths(1), added[0]); + Assert.Equal(maxBefore.Value.AddMonths(2), added[1]); + Assert.Equal(maxBefore.Value.AddMonths(3), added[2]); + + var maxAfter = await maintenance.GetMaxBoundaryAsync(); + Assert.Equal(maxBefore.Value.AddMonths(3), maxAfter); + } + + [SkippableFact] + public async Task EnsureLookahead_BoundaryAlreadyExists_NoError_Idempotent() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx1 = CreateContext(); + var m1 = NewMaintenance(ctx1); + + var maxStart = await m1.GetMaxBoundaryAsync(); + Assert.NotNull(maxStart); + + // First call: add one boundary. + var lookahead = LookaheadForExtraBoundaries(maxStart.Value, extraBoundaries: 1); + var firstAdded = await m1.EnsureLookaheadAsync(lookahead); + Assert.Single(firstAdded); + + // Second call: the boundary just added is now part of pf_AuditLog_Month, + // so the same lookahead value should be a no-op — no exception, no + // duplicate SPLIT. + await using var ctx2 = CreateContext(); + var m2 = NewMaintenance(ctx2); + var secondAdded = await m2.EnsureLookaheadAsync(lookahead); + + Assert.Empty(secondAdded); + + // The max boundary is unchanged across the second call. + var maxAfter = await m2.GetMaxBoundaryAsync(); + Assert.Equal(firstAdded[0], maxAfter); + } +} diff --git a/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs b/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs index 958b2b1..df1daeb 100644 --- a/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs +++ b/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs @@ -1,3 +1,4 @@ +using Microsoft.Data.SqlClient; using Microsoft.EntityFrameworkCore; using ScadaLink.Commons.Entities.Audit; using ScadaLink.Commons.Types.Audit; @@ -309,21 +310,221 @@ public class AuditLogRepositoryTests : IClassFixture Assert.True(events.Select(e => e.EventId).ToHashSet().SetEquals(allIds)); } + // ------------------------------------------------------------------------ + // M6-T4 Bundle C: SwitchOutPartitionAsync drop-and-rebuild integration tests + // ------------------------------------------------------------------------ + // + // The partition-switch path replaces M1's NotSupportedException stub with + // the production drop-DROP-INDEX → CREATE-staging → SWITCH PARTITION → + // DROP-staging → CREATE-INDEX dance documented in alog.md §4. These tests + // verify the side effects an outsider can observe: + // * rows in the targeted month are removed + // * rows in OTHER months are NOT touched + // * UX_AuditLog_EventId still exists after a successful switch + // * InsertIfNotExistsAsync's first-write-wins idempotency still holds + // after a switch (the rebuilt index is real) + // * a thrown SqlException leaves UX_AuditLog_EventId rebuilt (the CATCH + // branch's recovery path runs) + [SkippableFact] - public async Task SwitchOutPartitionAsync_ThrowsNotSupported_ForM1() + public async Task SwitchOutPartitionAsync_OldPartition_RemovesRows_NewPartitionsKept() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = NewSiteId(); + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Three distinct months — Jan, Feb, Mar 2026 — so the switch on Jan's + // boundary purges exactly one month's worth of rows. Boundary values + // come from the partition function's pre-seeded list (alog.md §4). + var janEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 1, 15, 10, 0, 0, DateTimeKind.Utc)); + var febEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 2, 15, 10, 0, 0, DateTimeKind.Utc)); + var marEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 3, 15, 10, 0, 0, DateTimeKind.Utc)); + await repo.InsertIfNotExistsAsync(janEvt); + await repo.InsertIfNotExistsAsync(febEvt); + await repo.InsertIfNotExistsAsync(marEvt); + + // Boundary value '2026-01-01' identifies the January 2026 partition under + // RANGE RIGHT semantics ($PARTITION returns the partition into which the + // boundary value itself falls — the partition whose lower bound is the + // boundary). + await repo.SwitchOutPartitionAsync(new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc)); + + await using var readContext = CreateContext(); + var remaining = await readContext.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + Assert.DoesNotContain(remaining, e => e.EventId == janEvt.EventId); + Assert.Contains(remaining, e => e.EventId == febEvt.EventId); + Assert.Contains(remaining, e => e.EventId == marEvt.EventId); + } + + [SkippableFact] + public async Task SwitchOutPartitionAsync_RebuildsUxIndex_AfterSwitch() { Skip.IfNot(_fixture.Available, _fixture.SkipReason); await using var context = CreateContext(); var repo = new AuditLogRepository(context); - // The partition-switch path is intentionally blocked in M1 because - // UX_AuditLog_EventId is non-aligned. The drop-and-rebuild dance ships - // with the M6 purge actor. - var ex = await Assert.ThrowsAsync( - () => repo.SwitchOutPartitionAsync(new DateTime(2026, 2, 1, 0, 0, 0, DateTimeKind.Utc))); + // Pick a different month per test so successive test runs (which share + // the fixture's MSSQL database) don't tread on each other. + await repo.SwitchOutPartitionAsync(new DateTime(2026, 4, 1, 0, 0, 0, DateTimeKind.Utc)); - Assert.Contains("M6", ex.Message, StringComparison.OrdinalIgnoreCase); + await using var verifyContext = CreateContext(); + var indexExists = await ScalarAsync( + verifyContext, + "SELECT COUNT(*) FROM sys.indexes " + + "WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog');"); + Assert.Equal(1, indexExists); + } + + [SkippableFact] + public async Task SwitchOutPartitionAsync_InsertIfNotExistsAsync_StillEnforcesFirstWriteWins_AfterSwitch() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = NewSiteId(); + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Pre-existing row in May 2026 — must survive a switch on a different + // (older) partition. + var preExisting = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc)); + await repo.InsertIfNotExistsAsync(preExisting); + + // Switch out the June 2026 partition (different month, empty). + await repo.SwitchOutPartitionAsync(new DateTime(2026, 6, 1, 0, 0, 0, DateTimeKind.Utc)); + + // Re-attempting the same EventId after the switch must STILL be a no-op + // (UX_AuditLog_EventId is the index that enables idempotency; if the + // rebuild left it broken, this insert would silently produce a duplicate + // row and the count assertion below would catch it). + var dup = preExisting with { ErrorMessage = "second-should-be-ignored-after-switch" }; + await repo.InsertIfNotExistsAsync(dup); + + await using var readContext = CreateContext(); + var rows = await readContext.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + Assert.Single(rows); + Assert.Equal(preExisting.EventId, rows[0].EventId); + // First-write-wins: the original ErrorMessage (null) survives. + Assert.Null(rows[0].ErrorMessage); + } + + [SkippableFact] + public async Task SwitchOutPartitionAsync_PartialFailure_RebuildsUxIndex_RaisesException() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Force a deterministic switch failure with an inbound FOREIGN KEY: + // ALTER TABLE … SWITCH refuses to move rows out of a partition that's + // referenced by an FK from another table, raising msg 4928 + // ("ALTER TABLE SWITCH statement failed because target table … has a + // foreign key …"). The CATCH branch then rolls back and rebuilds the + // unique index — which the assertion below verifies. + // + // The probe table is uniquely named with a guid suffix so reruns of + // this test inside the same fixture DB never collide. We clean it up + // in the finally so the constraint never leaks into other tests. + var probeTable = $"AuditFkProbe_{Guid.NewGuid():N}".Substring(0, 32); + await using (var setup = new SqlConnection(_fixture.ConnectionString)) + { + await setup.OpenAsync(); + await using var cmd = setup.CreateCommand(); + // Composite FK references AuditLog's composite PK (EventId, OccurredAtUtc). + cmd.CommandText = + $"CREATE TABLE dbo.[{probeTable}] ( " + + $" EventId uniqueidentifier NOT NULL, " + + $" OccurredAtUtc datetime2(7) NOT NULL, " + + $" CONSTRAINT FK_{probeTable}_AuditLog FOREIGN KEY (EventId, OccurredAtUtc) " + + $" REFERENCES dbo.AuditLog(EventId, OccurredAtUtc));"; + await cmd.ExecuteNonQueryAsync(); + } + + try + { + var ex = await Assert.ThrowsAnyAsync( + () => repo.SwitchOutPartitionAsync(new DateTime(2026, 9, 1, 0, 0, 0, DateTimeKind.Utc))); + // Smoke-check the message references the SWITCH statement so we + // know we hit the engineered failure, not some unrelated error. + Assert.Contains("SWITCH", ex.Message, StringComparison.OrdinalIgnoreCase); + } + finally + { + // Always drop the probe table so the FK is gone before the next + // test runs against the shared fixture. + await using var cleanup = new SqlConnection(_fixture.ConnectionString); + await cleanup.OpenAsync(); + await using var cmd = cleanup.CreateCommand(); + cmd.CommandText = + $"IF OBJECT_ID('dbo.[{probeTable}]', 'U') IS NOT NULL DROP TABLE dbo.[{probeTable}];"; + await cmd.ExecuteNonQueryAsync(); + } + + // The CATCH block in the production SQL guarantees UX_AuditLog_EventId + // is rebuilt regardless of which step failed inside the TRY. + await using var verifyContext = CreateContext(); + var indexExists = await ScalarAsync( + verifyContext, + "SELECT COUNT(*) FROM sys.indexes " + + "WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog');"); + Assert.Equal(1, indexExists); + } + + // ------------------------------------------------------------------------ + // M6-T4 Bundle C: GetPartitionBoundariesOlderThanAsync + // ------------------------------------------------------------------------ + + [SkippableFact] + public async Task GetPartitionBoundariesOlderThanAsync_ReturnsBoundaries_WithMaxOccurredOlderThanThreshold() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = NewSiteId(); + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Seed events in two months: July 2026 (old) and August 2026 (new). + await repo.InsertIfNotExistsAsync(NewEvent(siteId, occurredAtUtc: new DateTime(2026, 7, 10, 0, 0, 0, DateTimeKind.Utc))); + await repo.InsertIfNotExistsAsync(NewEvent(siteId, occurredAtUtc: new DateTime(2026, 8, 10, 0, 0, 0, DateTimeKind.Utc))); + + // Threshold = Aug 1 2026 — July partition's MAX (July 10) is older; + // August partition's MAX (August 10) is newer. We expect only the July + // boundary back. + var threshold = new DateTime(2026, 8, 1, 0, 0, 0, DateTimeKind.Utc); + var boundaries = await repo.GetPartitionBoundariesOlderThanAsync(threshold); + + // The repo may also return EARLIER boundaries that have no data (their + // MAX is NULL → treated as "no data, nothing to purge" by the contract). + // We only assert the inclusion/exclusion that matters for our seeded + // rows. + Assert.Contains(new DateTime(2026, 7, 1, 0, 0, 0, DateTimeKind.Utc), boundaries); + Assert.DoesNotContain(new DateTime(2026, 8, 1, 0, 0, 0, DateTimeKind.Utc), boundaries); + } + + private async Task ScalarAsync(ScadaLinkDbContext context, string sql) + { + var conn = context.Database.GetDbConnection(); + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(); + } + await using var cmd = conn.CreateCommand(); + cmd.CommandText = sql; + var result = await cmd.ExecuteScalarAsync(); + if (result is null || result is DBNull) + { + return default!; + } + return (T)Convert.ChangeType(result, typeof(T) == typeof(string) ? typeof(string) : Nullable.GetUnderlyingType(typeof(T)) ?? typeof(T))!; } // --- helpers ------------------------------------------------------------ diff --git a/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditBacklogMetricTests.cs b/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditBacklogMetricTests.cs new file mode 100644 index 0000000..a57f773 --- /dev/null +++ b/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditBacklogMetricTests.cs @@ -0,0 +1,73 @@ +using ScadaLink.Commons.Types; + +namespace ScadaLink.HealthMonitoring.Tests; + +/// +/// Bundle E (M6-T6) regression coverage. The site-side audit-log SQLite writer +/// exposes a backlog snapshot (SiteAuditBacklogSnapshot) via the +/// ISiteAuditQueue.GetBacklogStatsAsync surface. A periodic +/// SiteAuditBacklogReporter hosted service polls that snapshot and +/// pushes it into the collector via +/// so the next includes it in +/// the report payload as SiteAuditBacklog. Unlike the +/// SiteAuditWriteFailures / AuditRedactionFailure interval counters, the +/// backlog snapshot is not reset on collect — the field carries forward +/// whatever the most recent refresh pushed in. +/// +public class SiteAuditBacklogMetricTests +{ + private readonly SiteHealthCollector _collector = new(); + + [Fact] + public void Update_Then_CollectReport_IncludesBacklog() + { + var snapshot = new SiteAuditBacklogSnapshot( + PendingCount: 42, + OldestPendingUtc: new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc), + OnDiskBytes: 1234567); + + _collector.UpdateSiteAuditBacklog(snapshot); + + var report = _collector.CollectReport("site-1"); + + Assert.Equal(snapshot, report.SiteAuditBacklog); + } + + [Fact] + public void Report_Payload_Includes_SiteAuditBacklog_AsNullByDefault() + { + // No refresh has been pushed yet — the report carries null so the + // central UI can distinguish "no data yet" from "queue empty". + var report = _collector.CollectReport("site-1"); + + Assert.Null(report.SiteAuditBacklog); + } + + [Fact] + public void CollectReport_DoesNotReset_SiteAuditBacklog() + { + // Backlog snapshot is a point-in-time reading, not a per-interval + // counter — successive CollectReport calls before the next + // SiteAuditBacklogReporter tick MUST keep returning the same snapshot + // so a slow refresh cadence doesn't blank the central dashboard. + var snapshot = new SiteAuditBacklogSnapshot( + PendingCount: 7, + OldestPendingUtc: null, + OnDiskBytes: 8192); + + _collector.UpdateSiteAuditBacklog(snapshot); + + var first = _collector.CollectReport("site-1"); + var second = _collector.CollectReport("site-1"); + + Assert.Equal(snapshot, first.SiteAuditBacklog); + Assert.Equal(snapshot, second.SiteAuditBacklog); + } + + [Fact] + public void Update_With_Null_Throws_ArgumentNullException() + { + Assert.Throws( + () => _collector.UpdateSiteAuditBacklog(null!)); + } +} diff --git a/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs index aec5917..e514acd 100644 --- a/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs +++ b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs @@ -71,6 +71,7 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable public void IncrementDeadLetter() { } public void IncrementSiteAuditWriteFailures() { } public void IncrementAuditRedactionFailure() { } + public void UpdateSiteAuditBacklog(ScadaLink.Commons.Types.SiteAuditBacklogSnapshot snapshot) { } public void UpdateConnectionHealth(string connectionName, ConnectionHealth health) { } public void RemoveConnection(string connectionName) { } public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved) { }