feat(audit): M5.5 per-channel retention overrides via purge-role bounded delete (T3)

This commit is contained in:
Joseph Doherty
2026-06-16 21:47:50 -04:00
parent 55630b48b6
commit 50b674accc
13 changed files with 583 additions and 3 deletions
@@ -167,6 +167,9 @@ public class AuditLogPurgeActor : ReceiveActor
if (boundaries.Count == 0)
{
// No whole-month partitions are eligible, but per-channel overrides may
// still expire rows earlier than the global window — run them below.
await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
return;
}
@@ -202,6 +205,80 @@ public class AuditLogPurgeActor : ReceiveActor
sw.ElapsedMilliseconds);
}
}
// M5.5 (T3): after the channel-blind global partition switch-out, apply any
// per-channel retention overrides that are SHORTER than the global window via
// a bounded, batched row DELETE on the same maintenance path. The global
// switch-out has already dropped whole months older than RetentionDays; these
// deletes only ever expire rows EARLIER than that, so they run last and are a
// strict tightening.
await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
}
/// <summary>
/// M5.5 (T3): runs each per-channel retention override whose window is strictly
/// shorter than the global <see cref="AuditLogOptions.RetentionDays"/>, deleting
/// rows of that channel older than the channel-specific threshold via the
/// repository's bounded, batched maintenance-path purge.
/// </summary>
/// <remarks>
/// <para>
/// Each channel is processed inside its own try/catch — including the threshold
/// computation — so one bad entry is logged and skipped without abandoning the
/// other channels on the same tick.
/// </para>
/// <para>
/// A non-positive window is never applied: it would place the threshold at or
/// after "now" and expire every row of the channel. Such an entry is skipped with
/// a warning instead of being trusted to have been rejected by options validation.
/// </para>
/// </remarks>
/// <param name="repository">The repository resolved for this tick's DI scope.</param>
private async Task RunPerChannelOverridesAsync(IAuditLogRepository repository)
{
    var overrides = _auditOptions.PerChannelRetentionDays;
    if (overrides is null || overrides.Count == 0)
    {
        return;
    }

    var globalDays = _auditOptions.RetentionDays;
    foreach (var (channel, days) in overrides)
    {
        // Defence in depth on a destructive path: a zero/negative window would
        // compute a threshold >= now and wipe the whole channel.
        if (days <= 0)
        {
            _logger.LogWarning(
                "Skipping per-channel retention override for channel {Channel}: window {Days}d is not positive " +
                "and would expire every row of the channel.",
                channel,
                days);
            continue;
        }

        // Only act when the per-channel window is strictly shorter than the global
        // one. Equal/longer windows are already covered by the global partition
        // switch-out, so a row purge would be redundant work (and a longer window
        // is meaningless — the partition is dropped on the global schedule).
        if (days >= globalDays)
        {
            continue;
        }

        var sw = Stopwatch.StartNew();
        try
        {
            // Computed inside the try so an out-of-range window is isolated to
            // this channel like any other per-channel failure.
            var channelThreshold = DateTime.UtcNow - TimeSpan.FromDays(days);

            var rowsDeleted = await repository
                .PurgeChannelOlderThanAsync(channel, channelThreshold, _purgeOptions.ChannelPurgeBatchSize)
                .ConfigureAwait(false);
            sw.Stop();

            // Stay quiet on no-op passes; only report ticks that removed rows.
            if (rowsDeleted > 0)
            {
                _logger.LogInformation(
                    "Purged {RowsDeleted} AuditLog rows for channel {Channel} older than {Threshold:o} " +
                    "(per-channel override {Days}d < global {GlobalDays}d) in {DurationMs} ms.",
                    rowsDeleted,
                    channel,
                    channelThreshold,
                    days,
                    globalDays,
                    sw.ElapsedMilliseconds);
            }
        }
        catch (Exception ex)
        {
            sw.Stop();
            _logger.LogError(
                ex,
                "Failed to apply per-channel retention override for channel {Channel} " +
                "({Days}d); other channels continue. Elapsed {DurationMs} ms.",
                channel,
                days,
                sw.ElapsedMilliseconds);
        }
    }
}
/// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
@@ -28,6 +28,24 @@ public sealed class AuditLogPurgeOptions
/// <summary>Period of the purge tick in hours (default 24).</summary>
public int IntervalHours { get; set; } = 24;
/// <summary>
/// M5.5 (T3): configured batch size for the per-channel retention-override purge
/// (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories.IAuditLogRepository.PurgeChannelOlderThanAsync"/>).
/// The repository issues one <c>DELETE TOP (@batch)</c> per batch and repeats until
/// a batch removes no rows, so this value bounds the transaction-log and lock
/// footprint of each individual statement rather than the total work of a tick.
/// Defaults to 5000. This is the raw, unvalidated setting — consumers should read
/// <see cref="ChannelPurgeBatchSize"/>, which applies the lower bound.
/// </summary>
public int ChannelPurgeBatchSizeConfigured { get; set; } = 5000;
/// <summary>
/// Effective per-channel purge batch size: the configured value, raised to 1 when
/// it is zero or negative, so a misconfigured setting can neither make the
/// repository's batch loop spin nor trip its argument check.
/// </summary>
public int ChannelPurgeBatchSize => Math.Max(1, ChannelPurgeBatchSizeConfigured);
/// <summary>
/// Test-only override for finer control over the tick cadence than
/// whole-hour resolution allows. When non-null, takes precedence over
@@ -37,6 +37,33 @@ public sealed class AuditLogOptions
/// <summary>Central retention window in days (default 365, range [30, 3650]).</summary>
public int RetentionDays { get; set; } = 365;
/// <summary>
/// M5.5 (T3) per-channel retention overrides, keyed by the canonical channel name
/// (the <see cref="AuditChannel"/> enum name — e.g. <c>ApiOutbound</c>,
/// <c>DbOutbound</c>, <c>Notification</c>, <c>ApiInbound</c>). Keys are
/// case-sensitive. The value is a retention window in days that MUST be SHORTER
/// than or equal to the global <see cref="RetentionDays"/>.
/// </summary>
/// <remarks>
/// <para>
/// The global <see cref="RetentionDays"/> window is enforced by month-partition
/// switch-out, which is channel-blind: it can only drop a whole month once every
/// row in it is older than the global window. A per-channel override therefore
/// can only ever expire rows EARLIER than the global purge would — never later
/// (a longer per-channel window is meaningless because the partition switch-out
/// would already have dropped the month). Overrides shorter than the global window
/// are honoured by the purge actor as a bounded, batched row DELETE on the
/// maintenance path (see <c>AuditLogPurgeActor</c>); the append-only writer/ingest
/// role is unaffected.
/// </para>
/// <para>
/// An override EQUAL to <see cref="RetentionDays"/> passes validation but has no
/// effect: the purge actor only acts on windows strictly shorter than the global
/// one, because the global switch-out already covers the equal case.
/// </para>
/// <para>
/// Each value is validated to be in <c>[30, RetentionDays]</c> by
/// <c>AuditLogOptionsValidator</c>; keys that are not recognized
/// <see cref="AuditChannel"/> names are rejected.
/// </para>
/// </remarks>
public Dictionary<string, int> PerChannelRetentionDays { get; set; } = new();
/// <summary>
/// Per-body byte ceiling applied to <see cref="AuditEvent.RequestSummary"/> and
/// <see cref="AuditEvent.ResponseSummary"/> for <see cref="AuditChannel.ApiInbound"/> rows
@@ -1,4 +1,5 @@
using ZB.MOM.WW.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
@@ -52,5 +53,27 @@ public sealed class AuditLogOptionsValidator : OptionsValidatorBase<AuditLogOpti
!(options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes),
$"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
$"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
// M5.5 (T3): per-channel retention overrides. Each entry must be keyed by a
// recognized AuditChannel name and carry a window in [MinRetentionDays,
// RetentionDays] — i.e. SHORTER than or equal to the global window. A longer
// per-channel window is meaningless under month-partition switch-out (governed
// by the global window), so it is rejected rather than silently ignored.
//
// A null dictionary (e.g. explicitly assigned by a caller) is treated as "no
// overrides", matching the purge actor's own null guard.
if (options.PerChannelRetentionDays is not null)
{
    foreach (var (channelKey, days) in options.PerChannelRetentionDays)
    {
        // Enum.IsDefined with a string performs an exact, case-sensitive NAME
        // match. Enum.TryParse is deliberately not used here: it also succeeds
        // for numeric strings ("3", "999"), whitespace-padded names and
        // comma-separated lists, none of which can ever equal a stored channel
        // name — such a key would pass validation and then silently purge nothing.
        builder.RequireThat(
            Enum.IsDefined(typeof(AuditChannel), channelKey),
            $"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)} key '{channelKey}' " +
            $"is not a recognized channel name. Valid keys: {string.Join(", ", Enum.GetNames<AuditChannel>())}.");

        // Valid when days is within [MinRetentionDays, RetentionDays] inclusive.
        // The lower bound matches the global RetentionDays floor; the upper bound
        // is the configured global window (longer is meaningless under the
        // channel-blind partition switch-out).
        builder.RequireThat(
            days >= MinRetentionDays && days <= options.RetentionDays,
            $"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)}['{channelKey}'] ({days}) " +
            $"must be in [{MinRetentionDays}, {nameof(AuditLogOptions.RetentionDays)}={options.RetentionDays}] days " +
            "— a per-channel window must be shorter than or equal to the global retention window.");
    }
}
}
}
@@ -87,6 +87,42 @@ public interface IAuditLogRepository
/// <returns>A task that resolves to the approximate number of rows discarded by the partition switch.</returns>
Task<long> SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default);
/// <summary>
/// M5.5 (T3) per-channel retention override purge. Deletes <c>AuditLog</c> rows for a
/// single <paramref name="channel"/> (matched against the canonical
/// <c>Category</c> column — the bare channel name, e.g. <c>ApiOutbound</c>) whose
/// <c>OccurredAtUtc</c> is strictly older than <paramref name="threshold"/>, in
/// bounded batches of <paramref name="batchSize"/> rows, looping until no further
/// rows match. Returns the total number of rows deleted across all batches.
/// </summary>
/// <remarks>
/// <para>
/// <b>Maintenance path — NOT the writer role.</b> The append-only invariant binds
/// the <c>scadabridge_audit_writer</c> ingest role (INSERT + SELECT only). This row
/// DELETE runs on the purge/maintenance connection, the same path that performs the
/// global partition switch-out (also a destructive operation forbidden to the writer
/// role). Per-channel overrides can only ever expire rows EARLIER than the global
/// month-partition switch-out would — never later — so this is a strict tightening
/// of the retention window, applied AFTER the global purge on the same tick.
/// </para>
/// <para>
/// <b>Bounded + idempotent.</b> Each batch is a <c>DELETE TOP (@batch)</c> so the
/// transaction log and lock footprint stay bounded regardless of backlog. Re-running
/// the purge is a no-op once every eligible row is gone (the loop exits when a batch
/// deletes zero rows), so a crash mid-loop is recoverable by simply running again.
/// </para>
/// <para>
/// <b>Cancellation.</b> Cancellation is observed between batches; batches that
/// completed before the token fired stay deleted, and a later run resumes with the
/// remaining rows.
/// </para>
/// </remarks>
/// <param name="channel">Canonical channel name (the <c>Category</c> column value, e.g. <c>ApiOutbound</c>).</param>
/// <param name="threshold">
/// Rows with <c>OccurredAtUtc</c> strictly older than this UTC datetime are deleted.
/// The repository implementation normalizes the value with
/// <see cref="DateTime.ToUniversalTime"/> before use, so pass a value whose
/// <see cref="DateTime.Kind"/> is <see cref="DateTimeKind.Utc"/>.
/// </param>
/// <param name="batchSize">Maximum rows deleted per batch; must be &gt; 0.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to the total number of rows deleted across all batches.</returns>
/// <exception cref="ArgumentException"><paramref name="channel"/> is null, empty or whitespace.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is zero or negative.</exception>
/// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled before the purge finished.</exception>
Task<long> PurgeChannelOlderThanAsync(
string channel,
DateTime threshold,
int batchSize,
CancellationToken ct = default);
/// <summary>
/// Returns the set of <c>pf_AuditLog_Month</c> partition lower-bound
/// boundaries whose partitions contain only rows with
@@ -370,6 +370,99 @@ VALUES
return rowsDeleted;
}
/// <inheritdoc />
public async Task<long> PurgeChannelOlderThanAsync(
    string channel,
    DateTime threshold,
    int batchSize,
    CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(channel))
    {
        throw new ArgumentException("Channel must be a non-empty channel name.", nameof(channel));
    }

    if (batchSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be > 0.");
    }

    // Normalize to a UTC-kind value before binding it as a parameter.
    // NOTE(review): ToUniversalTime treats a Kind == Unspecified input as LOCAL
    // time, so callers are expected to pass a UTC-kind threshold.
    var thresholdUtc = DateTime.SpecifyKind(threshold.ToUniversalTime(), DateTimeKind.Utc);

    // M5.5 (T3) per-channel retention override purge. This is the ONLY DELETE
    // against dbo.AuditLog in the codebase and it runs on the purge/maintenance
    // path, NOT the append-only writer role (which has INSERT + SELECT only — see
    // the DENY UPDATE/DENY DELETE grants in CollapseAuditLogToCanonical). The
    // AuditLog append-only CI guard (AuditLogAppendOnlyGuardTests) is intentionally
    // widened to allow ONLY the single marked DELETE below; any other UPDATE/DELETE
    // targeting AuditLog still trips the guard.
    //
    // Bounded + idempotent: DELETE TOP (@batch) caps the log/lock footprint per
    // statement; the loop repeats until a batch deletes zero rows, so re-running
    // after a crash mid-loop simply resumes. Category is the canonical
    // channel-name column (e.g. 'ApiOutbound'); Action holds "{channel}.{kind}" so
    // it is NOT the right column to match a bare channel name against.
    //
    // The trailing AUDIT-PURGE-ALLOWED marker on the DELETE line below is the
    // single narrow exemption the append-only CI guard (AuditLogAppendOnlyGuardTests)
    // recognizes; any other UPDATE/DELETE targeting AuditLog still trips the guard.
    const string deleteBatchSql =
        "DELETE TOP (@batch) FROM dbo.AuditLog WHERE Category = @channel AND OccurredAtUtc < @threshold;"; // AUDIT-PURGE-ALLOWED: per-channel retention override (M5.5 T3), maintenance path

    long totalDeleted = 0;
    var conn = _context.Database.GetDbConnection();

    // Only close the connection if this method opened it; a connection that was
    // already open belongs to the caller/context.
    var openedHere = false;
    if (conn.State != System.Data.ConnectionState.Open)
    {
        await conn.OpenAsync(ct).ConfigureAwait(false);
        openedHere = true;
    }

    try
    {
        // The statement text and all three parameter values are identical for
        // every batch, so the command is built once and re-executed instead of
        // being recreated on each iteration. Declared inside the try so it is
        // disposed before the finally block closes the connection.
        await using var cmd = conn.CreateCommand();
        cmd.CommandText = deleteBatchSql;
        AddParameter(cmd, "@batch", batchSize);
        AddParameter(cmd, "@channel", channel);
        AddParameter(cmd, "@threshold", thresholdUtc);

        while (true)
        {
            // Cancellation is honoured between batches; completed batches stay deleted.
            ct.ThrowIfCancellationRequested();

            var rows = await cmd.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
            if (rows <= 0)
            {
                // Nothing (left) to purge — also the exit for an idempotent re-run.
                break;
            }

            totalDeleted += rows;
        }
    }
    finally
    {
        if (openedHere)
        {
            await conn.CloseAsync().ConfigureAwait(false);
        }
    }

    return totalDeleted;

    // Creates a provider-specific parameter and attaches it to the command.
    static void AddParameter(System.Data.Common.DbCommand command, string name, object value)
    {
        var parameter = command.CreateParameter();
        parameter.ParameterName = name;
        parameter.Value = value;
        command.Parameters.Add(parameter);
    }
}
/// <inheritdoc />
public async Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
DateTime threshold,