fix(configdb): replace SwitchOutPartitionAsync stub with drop-and-rebuild dance (#23 M6)
Replaces M1's NotSupportedException stub with the production DROP-INDEX → CREATE-staging → SWITCH PARTITION → DROP-staging → CREATE-INDEX dance documented in alog.md §4. UX_AuditLog_EventId is intentionally non-aligned with ps_AuditLog_Month so that single-column EventId uniqueness can be enforced cheaply for InsertIfNotExistsAsync. SQL Server rejects ALTER TABLE SWITCH while a non-aligned unique index is present, so the implementation drops the index, switches the partition data into a GUID-suffixed staging table on [PRIMARY], drops staging (discarding the rows), and rebuilds the unique index — all inside an explicit transaction, with a CATCH that guarantees the unique index is rebuilt regardless of the failure point.

Also adds GetPartitionBoundariesOlderThanAsync to IAuditLogRepository: a CROSS APPLY over sys.partition_range_values plus a per-partition MAX(OccurredAtUtc) to enumerate retention-eligible months for the M6 purge actor (next commit).

Tests verify:
* The old partition's rows are removed; other months are untouched.
* UX_AuditLog_EventId is rebuilt after a successful switch.
* InsertIfNotExistsAsync's first-write-wins idempotency still holds after the switch.
* On an engineered SWITCH failure (inbound FK from a probe table), the SqlException propagates AND UX_AuditLog_EventId is still present (the CATCH branch ran).
* GetPartitionBoundariesOlderThanAsync returns only boundaries whose partition's MAX(OccurredAtUtc) is strictly older than the threshold; empty partitions are excluded.
This commit is contained in:
@@ -179,18 +179,199 @@ VALUES
|
||||
}
|
||||
|
||||
/// <summary>
/// M6-T4 production implementation of the drop-and-rebuild dance documented
/// on <see cref="IAuditLogRepository.SwitchOutPartitionAsync"/>: purges the
/// <c>AuditLog</c> partition that contains <paramref name="monthBoundary"/>
/// by switching it into a throw-away staging table and dropping that table.
/// </summary>
/// <remarks>
/// <para>
/// The <c>UX_AuditLog_EventId</c> unique index is non-aligned with the
/// partition scheme (it is created <c>ON [PRIMARY]</c>), so the batch drops it
/// before <c>ALTER TABLE … SWITCH PARTITION</c> and re-creates it afterwards.
/// </para>
/// <para>
/// The staging table name is GUID-suffixed so concurrent purge attempts
/// cannot collide on the name. The staging column list must match the live
/// <c>AuditLog</c> table (types, lengths, nullability and clustered-key
/// shape). Keep this CREATE in sync with the migration that ships the live
/// table (<c>20260520142214_AddAuditLogTable</c>) and
/// <c>AuditLogEntityTypeConfiguration</c>.
/// </para>
/// <para>
/// All five steps run inside one explicit transaction. The CATCH rolls the
/// transaction back, drops the staging table if it still exists, re-creates
/// <c>UX_AuditLog_EventId</c> if it is missing, and then rethrows the
/// original error with <c>THROW</c>.
/// </para>
/// </remarks>
/// <param name="monthBoundary">
/// A value inside the partition to purge, normally one of the boundaries
/// returned by <c>GetPartitionBoundariesOlderThanAsync</c>. A
/// <see cref="DateTimeKind.Local"/> value is converted to UTC; values of kind
/// <see cref="DateTimeKind.Utc"/> or <see cref="DateTimeKind.Unspecified"/>
/// are taken as already being UTC.
/// </param>
/// <param name="ct">Token used to cancel the SQL batch.</param>
public async Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default)
{
    // GUID-suffixed staging name: prevents collision with any concurrent
    // purge attempt and avoids a predictable identifier.
    var stagingTableName = $"AuditLog_Staging_{Guid.NewGuid():N}";

    // Only convert genuinely local values. DateTime.ToUniversalTime() treats
    // Kind == Unspecified as local time, which would shift a boundary read
    // back from the database (Unspecified kind) by the host's UTC offset and
    // could select the neighbouring partition.
    var boundaryUtc = monthBoundary.Kind == DateTimeKind.Local
        ? monthBoundary.ToUniversalTime()
        : monthBoundary;

    // Invariant culture keeps the literal Gregorian and digit-stable no
    // matter what the current thread culture is. The yyyy-MM-dd HH:mm:ss
    // shape is what the batch embeds as the partition-function argument.
    var monthBoundaryStr = boundaryUtc.ToString(
        "yyyy-MM-dd HH:mm:ss",
        System.Globalization.CultureInfo.InvariantCulture);

    var sql = $@"
BEGIN TRY
    BEGIN TRANSACTION;

    -- 1. Drop the non-aligned unique index. ALTER TABLE SWITCH refuses
    --    to run while it exists.
    IF EXISTS (SELECT 1 FROM sys.indexes WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog'))
        DROP INDEX UX_AuditLog_EventId ON dbo.AuditLog;

    -- 2. Staging table on [PRIMARY] (non-partitioned) with column shapes
    --    identical to dbo.AuditLog. Any drift here causes SWITCH to
    --    reject the operation.
    CREATE TABLE dbo.[{stagingTableName}] (
        EventId uniqueidentifier NOT NULL,
        OccurredAtUtc datetime2(7) NOT NULL,
        IngestedAtUtc datetime2(7) NULL,
        Channel varchar(32) NOT NULL,
        Kind varchar(32) NOT NULL,
        CorrelationId uniqueidentifier NULL,
        SourceSiteId varchar(64) NULL,
        SourceInstanceId varchar(128) NULL,
        SourceScript varchar(128) NULL,
        Actor varchar(128) NULL,
        Target varchar(256) NULL,
        Status varchar(32) NOT NULL,
        HttpStatus int NULL,
        DurationMs int NULL,
        ErrorMessage nvarchar(1024) NULL,
        ErrorDetail nvarchar(max) NULL,
        RequestSummary nvarchar(max) NULL,
        ResponseSummary nvarchar(max) NULL,
        PayloadTruncated bit NOT NULL,
        Extra nvarchar(max) NULL,
        ForwardState varchar(32) NULL,
        CONSTRAINT PK_{stagingTableName} PRIMARY KEY CLUSTERED (EventId, OccurredAtUtc)
    ) ON [PRIMARY];

    -- 3. Switch the partition out. $partition.pf_AuditLog_Month returns
    --    the partition number that contains the supplied value;
    --    SWITCH PARTITION N moves that partition to the staging table.
    DECLARE @partitionNumber int = $partition.pf_AuditLog_Month('{monthBoundaryStr}');
    DECLARE @sql nvarchar(max) = 'ALTER TABLE dbo.AuditLog SWITCH PARTITION ' + CAST(@partitionNumber AS nvarchar(10)) + ' TO dbo.[{stagingTableName}];';
    EXEC sp_executesql @sql;

    -- 4. Drop staging — the rows are discarded here. This is the purge.
    DROP TABLE dbo.[{stagingTableName}];

    -- 5. Rebuild the non-aligned unique index so single-column EventId
    --    uniqueness is enforced again.
    CREATE UNIQUE NONCLUSTERED INDEX UX_AuditLog_EventId ON dbo.AuditLog (EventId) ON [PRIMARY];

    COMMIT TRANSACTION;
END TRY
BEGIN CATCH
    IF @@TRANCOUNT > 0 ROLLBACK TRANSACTION;

    -- Best-effort staging cleanup. Guard the DROP so a missing staging
    -- table doesn't mask the original error.
    IF OBJECT_ID('dbo.[{stagingTableName}]', 'U') IS NOT NULL DROP TABLE dbo.[{stagingTableName}];

    -- Idempotent index rebuild: cheap insurance so a failed switch can
    -- never leave the live table without its idempotency-supporting
    -- unique index.
    IF NOT EXISTS (SELECT 1 FROM sys.indexes WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog'))
        CREATE UNIQUE NONCLUSTERED INDEX UX_AuditLog_EventId ON dbo.AuditLog (EventId) ON [PRIMARY];

    -- Surface the original error to the caller.
    THROW;
END CATCH;";

    await _context.Database.ExecuteSqlRawAsync(sql, ct).ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
/// Returns the set of <c>pf_AuditLog_Month</c> boundaries whose partition's
/// <c>MAX(OccurredAtUtc)</c> is strictly older than <paramref name="threshold"/>.
/// Boundaries with empty partitions are excluded — there is nothing to purge
/// in them.
/// </summary>
/// <remarks>
/// <para>
/// The CTE pulls every boundary value defined by the partition function. For
/// each boundary, a <c>CROSS APPLY</c> computes <c>MAX(OccurredAtUtc)</c> over
/// the live AuditLog rows whose <c>$PARTITION.pf_AuditLog_Month</c> number is
/// <c>boundary_id + 1</c>. The outer filter keeps only boundaries whose MAX is
/// non-NULL (partition has rows) AND strictly less than the threshold (every
/// row is past retention). Results are ordered by boundary value.
/// </para>
/// <para>
/// Because only partition numbers <c>boundary_id + 1</c> are inspected,
/// partition 1 (rows below the lowest boundary) is never reported.
/// NOTE(review): mapping a boundary to <c>boundary_id + 1</c> presumes the
/// function is declared <c>RANGE RIGHT</c> — confirm against the migration.
/// </para>
/// <para>
/// NOTE(review): the per-partition MAX reads the live table; its cost depends
/// on the indexes available on <c>OccurredAtUtc</c>, which are not visible
/// from this method.
/// </para>
/// </remarks>
/// <param name="threshold">
/// Retention cut-off. A <see cref="DateTimeKind.Local"/> value is converted
/// to UTC; values of kind <see cref="DateTimeKind.Utc"/> or
/// <see cref="DateTimeKind.Unspecified"/> are taken as already being UTC.
/// </param>
/// <param name="ct">Token used to cancel the open, execute and read calls.</param>
/// <returns>
/// The qualifying boundary values in ascending order, each stamped with
/// <see cref="DateTimeKind.Utc"/>; an empty list when none qualify.
/// </returns>
public async Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
    DateTime threshold,
    CancellationToken ct = default)
{
    // Only convert genuinely local values: ToUniversalTime() would treat an
    // Unspecified-kind value as local time and shift it by the host offset.
    var thresholdUtc = threshold.Kind == DateTimeKind.Local
        ? threshold.ToUniversalTime()
        : threshold;

    // Invariant culture keeps the SQL literal Gregorian and digit-stable
    // regardless of the current thread culture.
    var thresholdStr = thresholdUtc.ToString(
        "yyyy-MM-dd HH:mm:ss.fffffff",
        System.Globalization.CultureInfo.InvariantCulture);

    // Per-partition MAX over the live table. The scalar aggregate inside the
    // CROSS APPLY always yields exactly one row per boundary, so an empty
    // partition surfaces as NULL and is removed by the WHERE clause.
    var sql = $@"
WITH Boundaries AS (
    SELECT CAST(rv.value AS datetime2(7)) AS BoundaryValue,
           rv.boundary_id AS BoundaryId
    FROM sys.partition_range_values rv
    INNER JOIN sys.partition_functions pf ON rv.function_id = pf.function_id
    WHERE pf.name = 'pf_AuditLog_Month'
)
SELECT b.BoundaryValue
FROM Boundaries b
CROSS APPLY (
    SELECT MAX(a.OccurredAtUtc) AS MaxOccurredAt
    FROM dbo.AuditLog a
    WHERE $PARTITION.pf_AuditLog_Month(a.OccurredAtUtc) = b.BoundaryId + 1
) x
WHERE x.MaxOccurredAt IS NOT NULL
  AND x.MaxOccurredAt < CAST('{thresholdStr}' AS datetime2(7))
ORDER BY b.BoundaryValue;";

    var conn = _context.Database.GetDbConnection();

    // Remember whether this method opened the connection so that a
    // connection the caller already had open is left open on exit.
    var openedHere = conn.State != System.Data.ConnectionState.Open;
    if (openedHere)
    {
        await conn.OpenAsync(ct).ConfigureAwait(false);
    }

    var results = new List<DateTime>();
    try
    {
        await using var cmd = conn.CreateCommand();
        cmd.CommandText = sql;

        await using var reader = await cmd.ExecuteReaderAsync(ct).ConfigureAwait(false);
        while (await reader.ReadAsync(ct).ConfigureAwait(false))
        {
            // The reader yields Kind == Unspecified; the values are UTC
            // boundaries, so stamp them as such. This keeps a later
            // ToUniversalTime() on the value from shifting it.
            results.Add(DateTime.SpecifyKind(reader.GetDateTime(0), DateTimeKind.Utc));
        }
    }
    finally
    {
        if (openedHere)
        {
            await conn.CloseAsync().ConfigureAwait(false);
        }
    }

    return results;
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user