fix(configdb): replace SwitchOutPartitionAsync stub with drop-and-rebuild dance (#23 M6)

Replaces M1's NotSupportedException stub with the production DROP-INDEX
→ CREATE-staging → SWITCH PARTITION → DROP-staging → CREATE-INDEX dance
documented in alog.md §4. UX_AuditLog_EventId is intentionally non-aligned
with ps_AuditLog_Month so single-column EventId uniqueness can be enforced
cheaply for InsertIfNotExistsAsync; SQL Server rejects ALTER TABLE SWITCH
while a non-aligned unique index is present, so the implementation drops
it, switches the partition data into a GUID-suffixed staging table on
[PRIMARY], drops staging (discarding the rows), and rebuilds the unique
index — all inside an explicit transaction with a CATCH that guarantees
the unique index is rebuilt regardless of failure point.

Also adds GetPartitionBoundariesOlderThanAsync to IAuditLogRepository: a
CROSS APPLY over sys.partition_range_values + per-partition MAX(OccurredAtUtc)
to enumerate retention-eligible months for the M6 purge actor (next commit).

Tests verify:
* Old partition's rows are removed; other months untouched
* UX_AuditLog_EventId is rebuilt after a successful switch
* InsertIfNotExistsAsync's first-write-wins idempotency still holds after switch
* On engineered SWITCH failure (inbound FK from a probe table), SqlException
  propagates AND UX_AuditLog_EventId is still present (CATCH branch ran)
* GetPartitionBoundariesOlderThanAsync returns only boundaries whose partition's
  MAX(OccurredAtUtc) is strictly older than the threshold; empty partitions
  excluded
This commit is contained in:
Joseph Doherty
2026-05-20 18:20:55 -04:00
parent c763bd9a04
commit 6069a20e0f
5 changed files with 445 additions and 24 deletions

View File

@@ -216,5 +216,9 @@ public class AuditLogIngestActorTests : TestKit, IClassFixture<MsSqlMigrationFix
/// <summary>
/// Passes the partition switch-out request straight through to <c>_inner</c>.
/// </summary>
/// <param name="monthBoundary">Partition boundary to switch out; forwarded unchanged.</param>
/// <param name="ct">Cancellation token; forwarded unchanged.</param>
/// <returns>The task produced by <c>_inner</c>.</returns>
public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default)
{
    return _inner.SwitchOutPartitionAsync(monthBoundary, ct);
}
/// <summary>
/// Passes the boundary lookup straight through to <c>_inner</c>.
/// </summary>
/// <param name="threshold">Cut-off timestamp; forwarded unchanged.</param>
/// <param name="ct">Cancellation token; forwarded unchanged.</param>
/// <returns>The task produced by <c>_inner</c>.</returns>
public Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
    DateTime threshold,
    CancellationToken ct = default)
{
    return _inner.GetPartitionBoundariesOlderThanAsync(threshold, ct);
}
}
}

View File

@@ -89,6 +89,10 @@ public class SiteAuditReconciliationActorTests : TestKit, IClassFixture<MsSqlMig
/// <summary>
/// No-op stub: ignores both arguments and reports immediate completion.
/// </summary>
/// <param name="monthBoundary">Unused.</param>
/// <param name="ct">Unused.</param>
/// <returns>An already-completed task.</returns>
public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default)
{
    return Task.CompletedTask;
}
/// <summary>
/// Stub that never reports any partition boundaries, whatever the threshold.
/// </summary>
/// <param name="threshold">Unused.</param>
/// <param name="ct">Unused.</param>
/// <returns>An already-completed task wrapping an empty list.</returns>
public Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
    DateTime threshold,
    CancellationToken ct = default)
{
    IReadOnlyList<DateTime> noBoundaries = Array.Empty<DateTime>();
    return Task.FromResult(noBoundaries);
}
}
/// <summary>

View File

@@ -1,3 +1,4 @@
using Microsoft.Data.SqlClient;
using Microsoft.EntityFrameworkCore;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Types.Audit;
@@ -309,21 +310,221 @@ public class AuditLogRepositoryTests : IClassFixture<MsSqlMigrationFixture>
Assert.True(events.Select(e => e.EventId).ToHashSet().SetEquals(allIds));
}
// ------------------------------------------------------------------------
// M6-T4 Bundle C: SwitchOutPartitionAsync drop-and-rebuild integration tests
// ------------------------------------------------------------------------
//
// The partition-switch path replaces M1's NotSupportedException stub with
// the production DROP-INDEX → CREATE-staging → SWITCH PARTITION →
// DROP-staging → CREATE-INDEX dance documented in alog.md §4. These tests
// verify the side effects an outsider can observe:
// * rows in the targeted month are removed
// * rows in OTHER months are NOT touched
// * UX_AuditLog_EventId still exists after a successful switch
// * InsertIfNotExistsAsync's first-write-wins idempotency still holds
// after a switch (the rebuilt index is real)
// * a thrown SqlException leaves UX_AuditLog_EventId rebuilt (the CATCH
// branch's recovery path runs)
/// <summary>
/// Seeds one event in each of January, February and March 2026, switches out
/// the January partition, and checks that only the January row disappears.
/// </summary>
[SkippableFact]
public async Task SwitchOutPartitionAsync_OldPartition_RemovesRows_NewPartitionsKept()
{
    Skip.IfNot(_fixture.Available, _fixture.SkipReason);

    var siteId = NewSiteId();
    await using var context = CreateContext();
    var repo = new AuditLogRepository(context);

    // Three distinct months — Jan, Feb, Mar 2026 — so the switch on Jan's
    // boundary purges exactly one month's worth of rows. Boundary values
    // come from the partition function's pre-seeded list (alog.md §4).
    var janEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 1, 15, 10, 0, 0, DateTimeKind.Utc));
    var febEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 2, 15, 10, 0, 0, DateTimeKind.Utc));
    var marEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 3, 15, 10, 0, 0, DateTimeKind.Utc));
    await repo.InsertIfNotExistsAsync(janEvt);
    await repo.InsertIfNotExistsAsync(febEvt);
    await repo.InsertIfNotExistsAsync(marEvt);

    // Boundary value '2026-01-01' identifies the January 2026 partition under
    // RANGE RIGHT semantics ($PARTITION returns the partition into which the
    // boundary value itself falls — the partition whose lower bound is the
    // boundary).
    await repo.SwitchOutPartitionAsync(new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc));

    // Read back through a fresh context and scope the query to this test's
    // site id so only the rows seeded above are considered.
    await using var readContext = CreateContext();
    var remaining = await readContext.Set<AuditEvent>()
        .Where(e => e.SourceSiteId == siteId)
        .ToListAsync();

    Assert.DoesNotContain(remaining, e => e.EventId == janEvt.EventId);
    Assert.Contains(remaining, e => e.EventId == febEvt.EventId);
    Assert.Contains(remaining, e => e.EventId == marEvt.EventId);
}
/// <summary>
/// Switches out the April 2026 partition and checks, via <c>sys.indexes</c>,
/// that <c>UX_AuditLog_EventId</c> exists on <c>dbo.AuditLog</c> afterwards.
/// </summary>
[SkippableFact]
public async Task SwitchOutPartitionAsync_RebuildsUxIndex_AfterSwitch()
{
    Skip.IfNot(_fixture.Available, _fixture.SkipReason);

    await using var context = CreateContext();
    var repo = new AuditLogRepository(context);

    // Pick a different month per test so successive test runs (which share
    // the fixture's MSSQL database) don't tread on each other.
    await repo.SwitchOutPartitionAsync(new DateTime(2026, 4, 1, 0, 0, 0, DateTimeKind.Utc));

    // Exactly one catalog row is expected: the index must be present once
    // the switch has completed.
    await using var verifyContext = CreateContext();
    var indexExists = await ScalarAsync<int>(
        verifyContext,
        "SELECT COUNT(*) FROM sys.indexes " +
        "WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog');");
    Assert.Equal(1, indexExists);
}
/// <summary>
/// Inserts a May 2026 event, switches out the June 2026 partition, then
/// re-inserts the same event with a different <c>ErrorMessage</c> and checks
/// that exactly one row — the original — is stored for the site.
/// </summary>
[SkippableFact]
public async Task SwitchOutPartitionAsync_InsertIfNotExistsAsync_StillEnforcesFirstWriteWins_AfterSwitch()
{
    Skip.IfNot(_fixture.Available, _fixture.SkipReason);

    var siteId = NewSiteId();
    await using var context = CreateContext();
    var repo = new AuditLogRepository(context);

    // Seed a row in May 2026; the switch below targets a different month,
    // so this row has to survive it.
    var original = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc));
    await repo.InsertIfNotExistsAsync(original);

    // June 2026 is the partition being switched out.
    var juneBoundary = new DateTime(2026, 6, 1, 0, 0, 0, DateTimeKind.Utc);
    await repo.SwitchOutPartitionAsync(juneBoundary);

    // Same EventId, different payload. If the unique index were missing or
    // broken after the switch, this would land as a second row and the
    // single-row assertion below would fail.
    await repo.InsertIfNotExistsAsync(
        original with { ErrorMessage = "second-should-be-ignored-after-switch" });

    await using var readContext = CreateContext();
    var storedForSite = await readContext.Set<AuditEvent>()
        .Where(e => e.SourceSiteId == siteId)
        .ToListAsync();

    var survivor = Assert.Single(storedForSite);
    Assert.Equal(original.EventId, survivor.EventId);
    // First-write-wins: the stored ErrorMessage is still the original null.
    Assert.Null(survivor.ErrorMessage);
}
/// <summary>
/// Engineers a failing partition switch by pointing a foreign key at
/// <c>dbo.AuditLog</c> from a throwaway probe table, then checks that the
/// call surfaces a <see cref="SqlException"/> mentioning SWITCH and that
/// <c>UX_AuditLog_EventId</c> is present afterwards.
/// </summary>
[SkippableFact]
public async Task SwitchOutPartitionAsync_PartialFailure_RebuildsUxIndex_RaisesException()
{
    Skip.IfNot(_fixture.Available, _fixture.SkipReason);

    await using var context = CreateContext();
    var repo = new AuditLogRepository(context);

    // Runs one statement on its own short-lived connection, independent of
    // the repository's DbContext.
    async Task RunOnFreshConnectionAsync(string sql)
    {
        await using var connection = new SqlConnection(_fixture.ConnectionString);
        await connection.OpenAsync();
        await using var command = connection.CreateCommand();
        command.CommandText = sql;
        await command.ExecuteNonQueryAsync();
    }

    // A guid-suffixed name (trimmed to 32 characters) keeps reruns against
    // the shared fixture database from colliding on the probe table.
    var probeName = $"AuditFkProbe_{Guid.NewGuid():N}"[..32];

    // The probe's composite FK targets AuditLog's (EventId, OccurredAtUtc)
    // key; its presence is what the test relies on to make the switch fail.
    await RunOnFreshConnectionAsync(
        $"CREATE TABLE dbo.[{probeName}] ( " +
        $" EventId uniqueidentifier NOT NULL, " +
        $" OccurredAtUtc datetime2(7) NOT NULL, " +
        $" CONSTRAINT FK_{probeName}_AuditLog FOREIGN KEY (EventId, OccurredAtUtc) " +
        $" REFERENCES dbo.AuditLog(EventId, OccurredAtUtc));");

    try
    {
        var septemberBoundary = new DateTime(2026, 9, 1, 0, 0, 0, DateTimeKind.Utc);
        var failure = await Assert.ThrowsAnyAsync<SqlException>(
            () => repo.SwitchOutPartitionAsync(septemberBoundary));

        // The message must mention SWITCH, otherwise some unrelated error
        // was hit instead of the engineered one.
        Assert.Contains("SWITCH", failure.Message, StringComparison.OrdinalIgnoreCase);
    }
    finally
    {
        // Remove the probe (and with it the FK) no matter how the assertions
        // went, so later tests on the shared fixture are unaffected.
        await RunOnFreshConnectionAsync(
            $"IF OBJECT_ID('dbo.[{probeName}]', 'U') IS NOT NULL DROP TABLE dbo.[{probeName}];");
    }

    // Even though the switch failed, the unique index is expected to be
    // back in place.
    await using var verifyContext = CreateContext();
    var indexCount = await ScalarAsync<int>(
        verifyContext,
        "SELECT COUNT(*) FROM sys.indexes " +
        "WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog');");
    Assert.Equal(1, indexCount);
}
// ------------------------------------------------------------------------
// M6-T4 Bundle C: GetPartitionBoundariesOlderThanAsync
// ------------------------------------------------------------------------
/// <summary>
/// Seeds one event in July 2026 and one in August 2026, asks for boundaries
/// older than 1 August 2026, and checks that the July boundary is returned
/// while the August boundary is not.
/// </summary>
[SkippableFact]
public async Task GetPartitionBoundariesOlderThanAsync_ReturnsBoundaries_WithMaxOccurredOlderThanThreshold()
{
    Skip.IfNot(_fixture.Available, _fixture.SkipReason);

    var siteId = NewSiteId();
    await using var context = CreateContext();
    var repo = new AuditLogRepository(context);

    var julyBoundary = new DateTime(2026, 7, 1, 0, 0, 0, DateTimeKind.Utc);
    var augustBoundary = new DateTime(2026, 8, 1, 0, 0, 0, DateTimeKind.Utc);

    // One row on 10 July (before the threshold) and one on 10 August
    // (after it).
    await repo.InsertIfNotExistsAsync(
        NewEvent(siteId, occurredAtUtc: new DateTime(2026, 7, 10, 0, 0, 0, DateTimeKind.Utc)));
    await repo.InsertIfNotExistsAsync(
        NewEvent(siteId, occurredAtUtc: new DateTime(2026, 8, 10, 0, 0, 0, DateTimeKind.Utc)));

    // The threshold is the August boundary itself: July's latest row is
    // older than it, August's latest row is not.
    var eligible = await repo.GetPartitionBoundariesOlderThanAsync(augustBoundary);

    // Only the two seeded months are asserted on. Other boundaries may also
    // be present in the result (presumably from data other tests left in the
    // shared fixture database), so the full list is deliberately not pinned.
    Assert.Contains(julyBoundary, eligible);
    Assert.DoesNotContain(augustBoundary, eligible);
}
/// <summary>
/// Executes <paramref name="sql"/> as a scalar query on the connection that
/// backs <paramref name="context"/> and converts the result to
/// <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">
/// Desired result type. A nullable value type is converted via its
/// underlying type.
/// </typeparam>
/// <param name="context">Context whose database connection is used; it is opened here if not already open.</param>
/// <param name="sql">Raw SQL text, executed as-is.</param>
/// <returns>
/// The converted scalar, or <c>default</c> when the query yields no value or
/// a database NULL.
/// </returns>
private async Task<T> ScalarAsync<T>(ScadaLinkDbContext context, string sql)
{
    var conn = context.Database.GetDbConnection();
    if (conn.State != System.Data.ConnectionState.Open)
    {
        await conn.OpenAsync();
    }

    await using var cmd = conn.CreateCommand();
    cmd.CommandText = sql;
    var result = await cmd.ExecuteScalarAsync();
    if (result is null or DBNull)
    {
        return default!;
    }

    // Convert.ChangeType cannot target Nullable<T> directly, so unwrap it.
    // For every other type (string included) GetUnderlyingType returns null
    // and T itself is the target.
    var targetType = Nullable.GetUnderlyingType(typeof(T)) ?? typeof(T);

    // Database values are machine data: convert with the invariant culture
    // so the outcome does not depend on the culture of the test host.
    return (T)Convert.ChangeType(result, targetType, System.Globalization.CultureInfo.InvariantCulture)!;
}
// --- helpers ------------------------------------------------------------