fix(reconcile): heal all concurrently-missing nodes — return existing pending token instead of omitting

2026-06-26 17:09:42 -04:00
parent 99254b71de
commit 6538216b0c
5 changed files with 145 additions and 13 deletions
@@ -112,11 +112,12 @@ public class ReconcileService
            // Stage with the snapshot's DeploymentId as the deploymentId so the gap item's
            // DeploymentId + token point the node at the right pending row to fetch.
            //
-            // Reconcile staging is safe without a DB uniqueness guard: a gap arises only
-            // from one-node-down-during-a-successful-deploy, so at most one node ever
-            // reconciles a given instance (if BOTH were down the deploy failed and no
-            // snapshot exists, so it is never in the expected set). Deploy-time
-            // supersession serializes via the per-instance operation lock.
+            // StagePendingIfAbsentAsync is insert-if-absent: if BOTH site nodes are concurrently
+            // missing the same instance (e.g. fresh container start / cleared SQLite after a
+            // successful deploy), both attempt to stage here. The first succeeds (true); the
+            // second gets false and is handled in the !staged branch below — it returns the
+            // existing pending row's token so it heals in the same round, rather than being
+            // omitted. Deploy-time supersession serializes via the per-instance operation lock.
            var staged = await _deploymentRepository.StagePendingIfAbsentAsync(
                exp.InstanceId, snapshot.DeploymentId, exp.RevisionHash,
                snapshot.ConfigurationJson, token, now, expiresAt, cancellationToken)
@@ -124,12 +125,35 @@ public class ReconcileService

            if (!staged)
            {
-                // A pending row already exists — an in-flight deploy is mid-flight and its
-                // replication will deliver this instance to the node shortly. Omit it from
-                // the gap (reconcile is best-effort and re-runs).
-                _logger.LogDebug(
-                    "Reconcile: pending row already exists for instance {Instance} (in-flight deploy); omitting from gap",
-                    exp.InstanceUniqueName);
+                // A pending row already exists for this instance — either a CONCURRENT reconcile from
+                // the other site node (both nodes' SQLite stores empty after a fresh container start or
+                // a cleared SQLite, both reconciling at startup) or an in-flight deploy. Do NOT omit the
+                // item: if we did, the second concurrently-missing node would fetch nothing and stay
+                // unhealed until a later restart. Instead, read the EXISTING pending row and emit a gap
+                // item carrying ITS DeploymentId/RevisionHash/Token. The fetch token is multi-use within
+                // its TTL, so both nodes fetch the same pending config and heal in the same round.
+                // (If the existing row is from an in-flight deploy, its config is newer than the
+                // snapshot — fetching it is still correct; the site's guarded write handles ordering.)
+                var existing = await _deploymentRepository
+                    .GetPendingDeploymentByInstanceIdAsync(exp.InstanceId, cancellationToken)
+                    .ConfigureAwait(false);
+                if (existing != null)
+                {
+                    _logger.LogDebug(
+                        "Reconcile: pending row already exists for instance {Instance} (concurrent reconcile or in-flight deploy); returning existing token so this node heals too",
+                        exp.InstanceUniqueName);
+                    gap.Add(new ReconcileGapItem(
+                        exp.InstanceUniqueName, existing.DeploymentId, existing.RevisionHash,
+                        exp.IsEnabled, existing.Token));
+                }
+                else
+                {
+                    // Raced away: the pending row was purged between the stage attempt and this
+                    // read. Omit it — reconcile is best-effort and the node retries next round.
+                    _logger.LogDebug(
+                        "Reconcile: pending row for instance {Instance} disappeared between stage and read (purged race); omitting from gap",
+                        exp.InstanceUniqueName);
+                }
                continue;
            }