660fdc4e93
Central singleton (M6-T4 Bundle C) that drives the daily AuditLog partition
purge. On a configurable timer (default 24 hours) the actor:
1. Queries IAuditLogRepository.GetPartitionBoundariesOlderThanAsync for
monthly boundaries whose latest OccurredAtUtc is older than
DateTime.UtcNow - AuditLogOptions.RetentionDays.
2. For each eligible boundary calls SwitchOutPartitionAsync, which runs
the drop-and-rebuild dance around UX_AuditLog_EventId.
3. Publishes AuditLogPurgedEvent(boundary, rowsDeleted, durationMs) on
the actor-system EventStream so the Bundle E central health collector
and ops surfaces can subscribe without coupling to this actor.
Co-changes:
* SwitchOutPartitionAsync returns long (rows deleted) — sampled BEFORE the
switch via COUNT_BIG over the per-partition filter so the count
reflects what the switch removed, not a post-purge scan of a table that
no longer exists. All stub implementations updated.
* AuditLogPurgeOptions: IntervalHours (default 24), IntervalOverride for
tests, Interval property resolving either.
* AuditLogPurgedEvent: record with MonthBoundary, RowsDeleted, DurationMs.
Behavior:
* Continue-on-error per boundary — one partition that throws does NOT
abandon the rest of the tick.
* DI scope opened per tick (IAuditLogRepository is a SCOPED EF Core
service); mirrors SiteAuditReconciliationActor and AuditLogIngestActor.
* SupervisorStrategy Resume keeps the singleton alive across leaked
exceptions.
* EventStream capture BEFORE the first await — Context is unsafe after
await in async receive handlers (same pattern as Sender-capture in
AuditLogIngestActor.OnIngestAsync).
Tests:
* Tick_Fires_OnDailyInterval — visible timer side effect.
* Tick_OldPartitions_SwitchedOut — both seeded boundaries purged.
* Tick_NewerPartitions_Untouched — empty enumerator → no switches.
* Tick_PublishesPurgedEvent_WithRowCount — AuditLogPurgedEvent carries
RowsDeleted and DurationMs.
* Tick_SwitchThrows_OtherPartitionsStillProcessed — continue-on-error.
* Threshold_UsesAuditLogOptionsRetentionDays — non-default 30-day window
computed from UtcNow - RetentionDays.
* EndToEnd_RealPartition_RowsRemoved_PurgedEventPublished — TestKit +
MsSqlMigrationFixture: real partitioned table, Jan-2026 row purged,
Apr-2026 row kept, AuditLogPurgedEvent observed via probe.
215 lines
8.2 KiB
C#
215 lines
8.2 KiB
C#
using System.Diagnostics;
|
|
using Akka.Actor;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using ScadaLink.AuditLog.Configuration;
|
|
using ScadaLink.Commons.Interfaces.Repositories;
|
|
|
|
namespace ScadaLink.AuditLog.Central;
|
|
|
|
/// <summary>
/// Central singleton (M6 Bundle C) that drives the daily AuditLog partition
/// purge. On a configurable timer (default 24 hours) the actor:
/// <list type="number">
///   <item>Queries <see cref="IAuditLogRepository.GetPartitionBoundariesOlderThanAsync"/>
///   for monthly boundaries whose latest <c>OccurredAtUtc</c> is older
///   than <c>DateTime.UtcNow - RetentionDays</c>.</item>
///   <item>For each eligible boundary, calls
///   <see cref="IAuditLogRepository.SwitchOutPartitionAsync"/> which runs
///   the drop-and-rebuild dance around <c>UX_AuditLog_EventId</c>.</item>
///   <item>Publishes <see cref="AuditLogPurgedEvent"/> on the actor-system
///   EventStream so the Bundle E central health collector + ops surfaces
///   can subscribe without coupling to this actor.</item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// <b>Daily cadence.</b> Partition switch is metadata-only but the
/// drop-and-rebuild dance briefly removes <c>UX_AuditLog_EventId</c>; running
/// more often than necessary trades unique-index rebuild outages for
/// negligible freshness wins. The default 24-hour interval matches
/// alog.md §10's retention policy.
/// </para>
/// <para>
/// <b>Continue-on-error.</b> A single boundary that throws (transient SQL
/// failure, contention with backup, missing object) must NOT prevent the
/// other eligible boundaries from being purged on the same tick. Per-boundary
/// work runs inside its own try/catch, and scope resolution and boundary
/// enumeration are guarded as well, so repository failures are logged
/// instead of escaping the message handler.
/// </para>
/// <para>
/// <b>Supervision.</b> The <see cref="SupervisorStrategy"/> override resumes
/// on every exception. In Akka that strategy is applied to this actor's
/// <i>children</i>; this actor currently creates none, and a failure of the
/// actor itself is decided by its own parent's strategy.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
/// service registered by <c>AddConfigurationDatabase</c>. The singleton
/// opens one DI scope per tick and reuses the same repository across every
/// boundary in that tick — mirrors the
/// <see cref="SiteAuditReconciliationActor"/> pattern.
/// </para>
/// <para>
/// <b>EventStream.</b> Publishing <see cref="AuditLogPurgedEvent"/> through
/// the EventStream rather than direct messaging avoids coupling this actor
/// to its consumers; M6 Bundle E will subscribe a central health-counter
/// bridge that surfaces purge progress on the central health report.
/// </para>
/// </remarks>
public class AuditLogPurgeActor : ReceiveActor
{
    private readonly IServiceProvider _services;
    private readonly AuditLogPurgeOptions _purgeOptions;
    private readonly AuditLogOptions _auditOptions;
    private readonly ILogger<AuditLogPurgeActor> _logger;
    private ICancelable? _timer;

    /// <summary>
    /// Creates the purge actor and registers the <see cref="PurgeTick"/> handler.
    /// </summary>
    /// <param name="services">Root provider used to open one DI scope per tick.</param>
    /// <param name="purgeOptions">Supplies the tick interval.</param>
    /// <param name="auditOptions">Supplies <c>RetentionDays</c> for the purge threshold.</param>
    /// <param name="logger">Receives per-partition success and failure entries.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public AuditLogPurgeActor(
        IServiceProvider services,
        IOptions<AuditLogPurgeOptions> purgeOptions,
        IOptions<AuditLogOptions> auditOptions,
        ILogger<AuditLogPurgeActor> logger)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(purgeOptions);
        ArgumentNullException.ThrowIfNull(auditOptions);
        ArgumentNullException.ThrowIfNull(logger);

        _services = services;
        // Both option values are captured once here; later configuration
        // changes are not observed by this instance.
        _purgeOptions = purgeOptions.Value;
        _auditOptions = auditOptions.Value;
        _logger = logger;

        ReceiveAsync<PurgeTick>(_ => OnTickAsync());
    }

    /// <summary>
    /// Schedules the repeating <see cref="PurgeTick"/>. The first tick fires
    /// one full interval after start, not immediately.
    /// </summary>
    protected override void PreStart()
    {
        base.PreStart();
        var interval = _purgeOptions.Interval;
        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
            initialDelay: interval,
            interval: interval,
            receiver: Self,
            message: PurgeTick.Instance,
            sender: Self);
    }

    /// <summary>Cancels the repeating tick so no further messages are scheduled.</summary>
    protected override void PostStop()
    {
        _timer?.Cancel();
        base.PostStop();
    }

    /// <summary>
    /// Resume on every exception, with no retry limit. This strategy is what
    /// Akka applies to children of this actor; it does not decide what
    /// happens when this actor itself fails (that is its parent's strategy).
    /// </summary>
    /// <remarks>
    /// The previous implementation passed <c>DefaultDecider</c> with
    /// <c>maxNrOfRetries: 0</c>, which maps ordinary exceptions to Restart and
    /// then stops the child because the retry budget is zero — the opposite
    /// of the documented Resume intent.
    /// </remarks>
    protected override SupervisorStrategy SupervisorStrategy()
    {
        return new OneForOneStrategy(_ => Directive.Resume);
    }

    /// <summary>
    /// Handles one <see cref="PurgeTick"/>: opens a DI scope, resolves the
    /// repository and purges every eligible partition. Failures are logged
    /// and swallowed so the next tick still runs.
    /// </summary>
    private async Task OnTickAsync()
    {
        // Capture EventStream BEFORE the first await. Accessing Context (and
        // therefore Context.System) after an await is unsafe because Akka's
        // ActorBase.Context throws "no active ActorContext" once the
        // continuation runs on a thread that isn't currently dispatching this
        // actor — mirrors the same Sender-capture pattern in
        // AuditLogIngestActor.OnIngestAsync.
        var eventStream = Context.System.EventStream;

        // Only the UtcNow half of the threshold is re-evaluated per tick.
        // RetentionDays comes from the IOptions<T>.Value snapshot taken in the
        // constructor, so a changed RetentionDays is NOT picked up until the
        // actor is recreated. Observing live changes would require injecting
        // IOptionsMonitor<AuditLogOptions> instead.
        var threshold = DateTime.UtcNow - TimeSpan.FromDays(_auditOptions.RetentionDays);

        IServiceScope? scope = null;
        IAuditLogRepository repository;
        try
        {
            scope = _services.CreateScope();
            repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick.");
            scope?.Dispose();
            return;
        }

        // One scope (and therefore one repository instance) for the whole tick.
        using (scope)
        {
            await PurgeEligiblePartitionsAsync(repository, threshold, eventStream)
                .ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Enumerates the boundaries older than <paramref name="threshold"/> and
    /// purges each in turn. An enumeration failure skips the whole tick.
    /// </summary>
    private async Task PurgeEligiblePartitionsAsync(
        IAuditLogRepository repository,
        DateTime threshold,
        Akka.Event.EventStream eventStream)
    {
        IReadOnlyList<DateTime> boundaries;
        try
        {
            boundaries = await repository
                .GetPartitionBoundariesOlderThanAsync(threshold)
                .ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogError(
                ex,
                "Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.",
                threshold);
            return;
        }

        // An empty list simply performs no iterations.
        foreach (var boundary in boundaries)
        {
            await PurgeBoundaryAsync(repository, boundary, eventStream).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Switches out a single partition, then publishes
    /// <see cref="AuditLogPurgedEvent"/> and logs the result. Any exception is
    /// logged and swallowed so one bad partition (transient SQL failure,
    /// missing object, contention with backup) does NOT abandon the rest of
    /// the tick.
    /// </summary>
    private async Task PurgeBoundaryAsync(
        IAuditLogRepository repository,
        DateTime boundary,
        Akka.Event.EventStream eventStream)
    {
        var sw = Stopwatch.StartNew();
        try
        {
            var rowsDeleted = await repository
                .SwitchOutPartitionAsync(boundary)
                .ConfigureAwait(false);
            sw.Stop();

            eventStream.Publish(
                new AuditLogPurgedEvent(boundary, rowsDeleted, sw.ElapsedMilliseconds));

            _logger.LogInformation(
                "Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.",
                boundary,
                rowsDeleted,
                sw.ElapsedMilliseconds);
        }
        catch (Exception ex)
        {
            sw.Stop();
            _logger.LogError(
                ex,
                "Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.",
                boundary,
                sw.ElapsedMilliseconds);
        }
    }

    /// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
    internal sealed class PurgeTick
    {
        public static readonly PurgeTick Instance = new();

        private PurgeTick() { }
    }
}
|