fix(reconcile): expiry-aware pending staging — expired rows no longer block self-heal

This commit is contained in:
Joseph Doherty
2026-06-26 17:23:26 -04:00
parent 6538216b0c
commit fd22f5ce0a
5 changed files with 156 additions and 21 deletions
@@ -164,12 +164,15 @@ public interface IDeploymentManagerRepository
/// reconcile from the other node, or an in-flight deploy), the handler reads that existing row
/// so a second concurrently-missing node can fetch the same pending config with its (multi-use,
/// TTL-bound) token instead of being omitted from the gap. Defensive against >1 row: returns
/// the most recently created one.
/// the most recently created one. When <paramref name="nowUtc"/> is supplied, EXPIRED rows
/// (<c>ExpiresAtUtc &lt;= nowUtc</c>) are filtered out so the caller never receives a token the
/// config-fetch endpoint would 404.
/// </summary>
/// <param name="instanceId">The instance whose pending deployment to read.</param>
/// <param name="nowUtc">When supplied, only rows with <c>ExpiresAtUtc &gt; nowUtc</c> are considered (expired rows excluded).</param>
/// <param name="cancellationToken">A cancellation token that can be used to cancel the operation.</param>
/// <returns>The pending deployment for the instance, or null if none is staged.</returns>
Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default);
Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, DateTimeOffset? nowUtc = null, CancellationToken cancellationToken = default);
/// <summary>
/// Deletes a pending deployment by its deployment ID. No-op if not found. Does NOT
/// call <see cref="SaveChangesAsync"/> — the caller commits, mirroring the other
@@ -135,7 +135,7 @@ public class ReconcileService
// (If the existing row is from an in-flight deploy its config is newer than the
// snapshot — fetching it is still correct; the site's guarded write handles ordering.)
var existing = await _deploymentRepository
.GetPendingDeploymentByInstanceIdAsync(exp.InstanceId, cancellationToken)
.GetPendingDeploymentByInstanceIdAsync(exp.InstanceId, now, cancellationToken)
.ConfigureAwait(false);
if (existing != null)
{
@@ -227,13 +227,24 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository
}
/// <inheritdoc />
public Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default)
public Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, DateTimeOffset? nowUtc = null, CancellationToken cancellationToken = default)
{
// At most one pending row per instance by design (supersession + stage-if-absent),
// but order deterministically and take the most recent so a hypothetical duplicate
// never makes the read non-deterministic — mirrors GetCurrentDeploymentStatusAsync.
return _dbContext.Set<PendingDeployment>()
.Where(p => p.InstanceId == instanceId)
var query = _dbContext.Set<PendingDeployment>()
.Where(p => p.InstanceId == instanceId);
// Expiry-aware: never hand back an EXPIRED row. Pending rows are only TTL-purged, so an
// expired-but-unpurged row would otherwise return a token the config-fetch endpoint 404s
// (it correctly rejects expired rows), leaving the node unhealed.
if (nowUtc.HasValue)
{
var now = nowUtc.Value;
query = query.Where(p => p.ExpiresAtUtc > now);
}
return query
.OrderByDescending(p => p.CreatedAtUtc)
.ThenByDescending(p => p.Id)
.FirstOrDefaultAsync(cancellationToken);
@@ -294,23 +305,39 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository
DateTimeOffset createdAtUtc, DateTimeOffset expiresAtUtc,
CancellationToken cancellationToken = default)
{
// Insert-if-absent: do NOT supersede an existing pending row. An existing row means an
// in-flight deploy is already delivering to the node; clobbering it could cause the node
// to fetch the reconcile token while the original deliver is mid-flight.
// Self-contained (commits internally), matching PurgeExpiredPendingDeploymentsAsync.
var alreadyPending = await _dbContext.Set<PendingDeployment>()
.AnyAsync(p => p.InstanceId == instanceId, cancellationToken);
if (alreadyPending)
// Treat createdAtUtc as "now". FIRST remove any EXPIRED pending rows for this instance
// (ExpiresAtUtc <= now). Pending rows are only TTL-purged (the periodic purge is still a
// deferred TODO), so an EXPIRED-but-unpurged row would otherwise (a) read as "a deploy is
// in flight" and block staging — handing the node an expired token (HTTP 404) and leaving
// it unhealed — and (b) collide on the DeploymentId UNIQUE index when a reconcile re-stages
// the snapshot's own DeploymentId. Dropping expired rows first fixes both.
var expired = await _dbContext.Set<PendingDeployment>()
.Where(p => p.InstanceId == instanceId && p.ExpiresAtUtc <= createdAtUtc)
.ToListAsync(cancellationToken);
if (expired.Count > 0)
{
return false;
_dbContext.Set<PendingDeployment>().RemoveRange(expired);
}
var pending = new PendingDeployment(
deploymentId, instanceId, revisionHash,
configurationJson, token, createdAtUtc, expiresAtUtc);
await _dbContext.Set<PendingDeployment>().AddAsync(pending, cancellationToken);
// THEN insert-if-absent against still-LIVE rows only. A live pending row means a genuine
// in-flight deploy (or a concurrent reconcile) already owns the slot — do NOT supersede it;
// clobbering it could make the node fetch the reconcile token while the original deliver is
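// mid-flight. (The expired rows marked for removal above stay in the database until
// SaveChanges runs, but they can never match this predicate: they satisfy
// ExpiresAtUtc <= createdAtUtc, while this check requires ExpiresAtUtc > createdAtUtc.)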
var liveExists = await _dbContext.Set<PendingDeployment>()
.AnyAsync(p => p.InstanceId == instanceId && p.ExpiresAtUtc > createdAtUtc, cancellationToken);
if (!liveExists)
{
var pending = new PendingDeployment(
deploymentId, instanceId, revisionHash,
configurationJson, token, createdAtUtc, expiresAtUtc);
await _dbContext.Set<PendingDeployment>().AddAsync(pending, cancellationToken);
}
// Self-contained: one SaveChanges flushes the expired-row cleanup and, when staged, the new
// row together (EF orders the delete before the same-DeploymentId insert to satisfy the
// unique index). Returns true only when a fresh row was staged.
await _dbContext.SaveChangesAsync(cancellationToken);
return true;
return !liveExists;
}
// --- Instance lookups for deployment pipeline ---