fix(reconcile): heal all concurrently-missing nodes — return existing pending token instead of omitting

This commit is contained in:
Joseph Doherty
2026-06-26 17:09:42 -04:00
parent 99254b71de
commit 6538216b0c
5 changed files with 145 additions and 13 deletions
@@ -157,6 +157,20 @@ public interface IDeploymentManagerRepository
/// <returns>The pending deployment, or null if not found.</returns>
Task<PendingDeployment?> GetPendingDeploymentByIdAsync(string deploymentId, CancellationToken cancellationToken = default);
/// <summary>
/// Gets the pending deployment currently staged for the given instance.
/// </summary>
/// <remarks>
/// <para>
/// By design there is at most one pending deployment per instance:
/// <see cref="AddPendingDeploymentAsync"/> supersedes, and
/// <see cref="StagePendingIfAbsentAsync"/> inserts only if absent.
/// </para>
/// <para>
/// Used by startup reconcile. When <see cref="StagePendingIfAbsentAsync"/> reports that a row
/// already exists (a concurrent reconcile from the other node, or an in-flight deploy), the
/// handler reads that existing row so that a second concurrently-missing node can fetch the
/// same pending config with its (multi-use, TTL-bound) token instead of being omitted from
/// the gap.
/// </para>
/// <para>
/// Defensive against more than one row existing for the instance: in that case the most
/// recently created one is returned.
/// </para>
/// </remarks>
/// <param name="instanceId">The instance whose pending deployment to read.</param>
/// <param name="cancellationToken">A cancellation token that can be used to cancel the operation.</param>
/// <returns>The pending deployment for the instance, or null if none is staged.</returns>
Task<PendingDeployment?> GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default);
/// <summary>
/// Deletes a pending deployment by its deployment ID. No-op if not found. Does NOT
/// call <see cref="SaveChangesAsync"/> — the caller commits, mirroring the other
/// Add/Delete repository methods.