fix(reconcile): expiry-aware pending staging — expired rows no longer block self-heal
This commit is contained in:
+43
-16
@@ -227,13 +227,24 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default)
|
||||
public Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, DateTimeOffset? nowUtc = null, CancellationToken cancellationToken = default)
|
||||
{
|
||||
// At most one pending row per instance by design (supersession + stage-if-absent),
|
||||
// but order deterministically and take the most recent so a hypothetical duplicate
|
||||
// never makes the read non-deterministic — mirrors GetCurrentDeploymentStatusAsync.
|
||||
return _dbContext.Set<PendingDeployment>()
|
||||
.Where(p => p.InstanceId == instanceId)
|
||||
var query = _dbContext.Set<PendingDeployment>()
|
||||
.Where(p => p.InstanceId == instanceId);
|
||||
|
||||
// Expiry-aware: never hand back an EXPIRED row. Pending rows are only TTL-purged, so an
|
||||
// expired-but-unpurged row would otherwise return a token that the config-fetch endpoint answers with HTTP 404
|
||||
// (it correctly rejects expired rows), leaving the node unhealed.
|
||||
if (nowUtc.HasValue)
|
||||
{
|
||||
var now = nowUtc.Value;
|
||||
query = query.Where(p => p.ExpiresAtUtc > now);
|
||||
}
|
||||
|
||||
return query
|
||||
.OrderByDescending(p => p.CreatedAtUtc)
|
||||
.ThenByDescending(p => p.Id)
|
||||
.FirstOrDefaultAsync(cancellationToken);
|
||||
@@ -294,23 +305,39 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository
|
||||
DateTimeOffset createdAtUtc, DateTimeOffset expiresAtUtc,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
// Insert-if-absent: do NOT supersede an existing pending row. An existing row means an
|
||||
// in-flight deploy is already delivering to the node; clobbering it could cause the node
|
||||
// to fetch the reconcile token while the original deliver is mid-flight.
|
||||
// Self-contained (commits internally), matching PurgeExpiredPendingDeploymentsAsync.
|
||||
var alreadyPending = await _dbContext.Set<PendingDeployment>()
|
||||
.AnyAsync(p => p.InstanceId == instanceId, cancellationToken);
|
||||
if (alreadyPending)
|
||||
// Treat createdAtUtc as "now". FIRST remove any EXPIRED pending rows for this instance
|
||||
// (ExpiresAtUtc <= now). Pending rows are only TTL-purged (the periodic purge is still a
|
||||
// deferred TODO), so an EXPIRED-but-unpurged row would otherwise (a) read as "a deploy is
|
||||
// in flight" and block staging — handing the node an expired token (HTTP 404) and leaving
|
||||
// it unhealed — and (b) collide on the DeploymentId UNIQUE index when a reconcile re-stages
|
||||
// the snapshot's own DeploymentId. Dropping expired rows first fixes both.
|
||||
var expired = await _dbContext.Set<PendingDeployment>()
|
||||
.Where(p => p.InstanceId == instanceId && p.ExpiresAtUtc <= createdAtUtc)
|
||||
.ToListAsync(cancellationToken);
|
||||
if (expired.Count > 0)
|
||||
{
|
||||
return false;
|
||||
_dbContext.Set<PendingDeployment>().RemoveRange(expired);
|
||||
}
|
||||
|
||||
var pending = new PendingDeployment(
|
||||
deploymentId, instanceId, revisionHash,
|
||||
configurationJson, token, createdAtUtc, expiresAtUtc);
|
||||
await _dbContext.Set<PendingDeployment>().AddAsync(pending, cancellationToken);
|
||||
// THEN insert-if-absent against still-LIVE rows only. A live pending row means a genuine
|
||||
// in-flight deploy (or a concurrent reconcile) already owns the slot — do NOT supersede it;
|
||||
// clobbering it could make the node fetch the reconcile token while the original deliver is
|
||||
// mid-flight. (Expired rows just removed are disjoint from this future-expiry predicate.)
|
||||
var liveExists = await _dbContext.Set<PendingDeployment>()
|
||||
.AnyAsync(p => p.InstanceId == instanceId && p.ExpiresAtUtc > createdAtUtc, cancellationToken);
|
||||
if (!liveExists)
|
||||
{
|
||||
var pending = new PendingDeployment(
|
||||
deploymentId, instanceId, revisionHash,
|
||||
configurationJson, token, createdAtUtc, expiresAtUtc);
|
||||
await _dbContext.Set<PendingDeployment>().AddAsync(pending, cancellationToken);
|
||||
}
|
||||
|
||||
// Self-contained: one SaveChanges flushes the expired-row cleanup and, when staged, the new
|
||||
// row together (EF orders the delete before the same-DeploymentId insert to satisfy the
|
||||
// unique index). Returns true only when a fresh row was staged.
|
||||
await _dbContext.SaveChangesAsync(cancellationToken);
|
||||
return true;
|
||||
return !liveExists;
|
||||
}
|
||||
|
||||
// --- Instance lookups for deployment pipeline ---
|
||||
|
||||
Reference in New Issue
Block a user