fix(reconcile): heal all concurrently-missing nodes — return existing pending token instead of omitting

This commit is contained in:
Joseph Doherty
2026-06-26 17:09:42 -04:00
parent 99254b71de
commit 6538216b0c
5 changed files with 145 additions and 13 deletions
@@ -157,6 +157,20 @@ public interface IDeploymentManagerRepository
/// <returns>The pending deployment, or null if not found.</returns>
Task<PendingDeployment?> GetPendingDeploymentByIdAsync(string deploymentId, CancellationToken cancellationToken = default);
/// <summary>
/// Gets the pending deployment currently staged for the given instance.
/// </summary>
/// <remarks>
/// <para>
/// By design there is at most one pending row per instance:
/// <see cref="AddPendingDeploymentAsync"/> supersedes any earlier row and
/// <see cref="StagePendingIfAbsentAsync"/> inserts only if none is present. As a defensive
/// measure against more than one row existing anyway, the most recently created row is returned.
/// </para>
/// <para>
/// Used by startup reconcile. When <see cref="StagePendingIfAbsentAsync"/> reports that a row
/// already exists (a concurrent reconcile from the other node, or an in-flight deploy), the
/// handler reads that existing row so a second concurrently-missing node can fetch the same
/// pending config with its (multi-use, TTL-bound) token instead of being omitted from the gap.
/// </para>
/// </remarks>
/// <param name="instanceId">The instance whose pending deployment to read.</param>
/// <param name="cancellationToken">A cancellation token that can be used to cancel the operation.</param>
/// <returns>The pending deployment for the instance, or null if none is staged.</returns>
Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default);
/// <summary>
/// Deletes a pending deployment by its deployment ID. No-op if not found. Does NOT
/// call <see cref="SaveChangesAsync"/> — the caller commits, mirroring the other
/// Add/Delete repository methods.
@@ -112,11 +112,12 @@ public class ReconcileService
// Stage with the snapshot's DeploymentId as the deploymentId so the gap item's
// DeploymentId + token point the node at the right pending row to fetch.
//
// Reconcile staging is safe without a DB uniqueness guard: a gap arises only
// from one-node-down-during-a-successful-deploy, so at most one node ever
// reconciles a given instance (if BOTH were down the deploy failed and no
// snapshot exists, so it is never in the expected set). Deploy-time
// supersession serializes via the per-instance operation lock.
// StagePendingIfAbsent is insert-if-absent: if BOTH site nodes are concurrently
// missing the same instance (e.g. fresh container start / cleared SQLite after a
// successful deploy), both attempt to stage here. The first succeeds (true); the
// second gets false and is handled in the !staged branch below — it returns the
// existing pending row's token so it heals in the same round, rather than being
// omitted. Deploy-time supersession serializes via the per-instance operation lock.
var staged = await _deploymentRepository.StagePendingIfAbsentAsync(
exp.InstanceId, snapshot.DeploymentId, exp.RevisionHash,
snapshot.ConfigurationJson, token, now, expiresAt, cancellationToken)
@@ -124,12 +125,35 @@ public class ReconcileService
if (!staged)
{
// A pending row already exists — an in-flight deploy is mid-flight and its
// replication will deliver this instance to the node shortly. Omit it from
// the gap (reconcile is best-effort and re-runs).
_logger.LogDebug(
"Reconcile: pending row already exists for instance {Instance} (in-flight deploy); omitting from gap",
exp.InstanceUniqueName);
// A pending row already exists for this instance — either a CONCURRENT reconcile
// from the other site node (both nodes' SQLite empty after a fresh/cleared deploy,
// both reconciling at startup) or an in-flight deploy. Do NOT omit the item: if we
// did, the second concurrently-missing node would get 0 fetched and stay unhealed
// until a later restart. Instead, read the EXISTING pending row and emit a gap item
// carrying ITS DeploymentId/RevisionHash/Token. The fetch token is multi-use within
// its TTL, so both nodes fetch the same pending config and heal in the same round.
// (If the existing row is from an in-flight deploy its config is newer than the
// snapshot — fetching it is still correct; the site's guarded write handles ordering.)
var existing = await _deploymentRepository
.GetPendingDeploymentByInstanceIdAsync(exp.InstanceId, cancellationToken)
.ConfigureAwait(false);
if (existing != null)
{
_logger.LogDebug(
"Reconcile: pending row already exists for instance {Instance} (concurrent reconcile or in-flight deploy); returning existing token so this node heals too",
exp.InstanceUniqueName);
gap.Add(new ReconcileGapItem(
exp.InstanceUniqueName, existing.DeploymentId, existing.RevisionHash,
exp.IsEnabled, existing.Token));
}
else
{
// Raced away: the pending row was purged between the stage attempt and this
// read. Omit it — reconcile is best-effort and the node retries next round.
_logger.LogDebug(
"Reconcile: pending row for instance {Instance} disappeared between stage and read (purged race); omitting from gap",
exp.InstanceUniqueName);
}
continue;
}
@@ -226,6 +226,19 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository
.FirstOrDefaultAsync(p => p.DeploymentId == deploymentId, cancellationToken);
}
/// <inheritdoc />
public Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default)
{
    // The design allows a single pending row per instance (supersession plus
    // stage-if-absent). Sorting newest-first, with Id as the tie-breaker, keeps the
    // result deterministic even if a duplicate ever slips in — the same approach
    // GetCurrentDeploymentStatusAsync takes.
    var pendingForInstance = _dbContext.Set<PendingDeployment>()
        .Where(pending => pending.InstanceId == instanceId);

    var newestFirst = pendingForInstance
        .OrderByDescending(pending => pending.CreatedAtUtc)
        .ThenByDescending(pending => pending.Id);

    return newestFirst.FirstOrDefaultAsync(cancellationToken);
}
/// <inheritdoc />
public async Task DeletePendingDeploymentByIdAsync(string deploymentId, CancellationToken cancellationToken = default)
{