diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs
index 9824e1d3..d3ca5e2d 100644
--- a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs
@@ -157,6 +157,20 @@ public interface IDeploymentManagerRepository
/// The pending deployment, or null if not found.
Task GetPendingDeploymentByIdAsync(string deploymentId, CancellationToken cancellationToken = default);
///
+ /// Gets the pending deployment staged for a given instance (there is at most one per instance
+ /// by design — supersedes and
+ /// inserts only if absent). Used by startup reconcile:
+ /// when reports a row already exists (a concurrent
+ /// reconcile from the other node, or an in-flight deploy), the handler reads that existing row
+ /// so a second concurrently-missing node can fetch the same pending config with its (multi-use,
+ /// TTL-bound) token instead of being omitted from the gap. Defensive against >1 row: returns
+ /// the most recently created one.
+ ///
+ /// The instance whose pending deployment to read.
+ /// A cancellation token that can be used to cancel the operation.
+ /// The pending deployment for the instance, or null if none is staged.
+ Task GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default);
+ ///
/// Deletes a pending deployment by its deployment ID. No-op if not found. Does NOT
/// call — the caller commits, mirroring the other
/// Add/Delete repository methods.
diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs
index 3278ed30..c1f0f99a 100644
--- a/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs
@@ -112,11 +112,12 @@ public class ReconcileService
// Stage with the snapshot's DeploymentId as the deploymentId so the gap item's
// DeploymentId + token point the node at the right pending row to fetch.
//
- // Reconcile staging is safe without a DB uniqueness guard: a gap arises only
- // from one-node-down-during-a-successful-deploy, so at most one node ever
- // reconciles a given instance (if BOTH were down the deploy failed and no
- // snapshot exists, so it is never in the expected set). Deploy-time
- // supersession serializes via the per-instance operation lock.
+ // StagePendingIfAbsent is insert-if-absent: if BOTH site nodes are concurrently
+ // missing the same instance (e.g. fresh container start / cleared SQLite after a
+ // successful deploy), both attempt to stage here. The first succeeds (true); the
+ // second gets false and is handled in the !staged branch below — it returns the
+ // existing pending row's token so it heals in the same round, rather than being
+ // omitted. Deploy-time supersession serializes via the per-instance operation lock.
var staged = await _deploymentRepository.StagePendingIfAbsentAsync(
exp.InstanceId, snapshot.DeploymentId, exp.RevisionHash,
snapshot.ConfigurationJson, token, now, expiresAt, cancellationToken)
@@ -124,12 +125,35 @@ public class ReconcileService
if (!staged)
{
- // A pending row already exists — an in-flight deploy is mid-flight and its
- // replication will deliver this instance to the node shortly. Omit it from
- // the gap (reconcile is best-effort and re-runs).
- _logger.LogDebug(
- "Reconcile: pending row already exists for instance {Instance} (in-flight deploy); omitting from gap",
- exp.InstanceUniqueName);
+ // A pending row already exists for this instance — either a CONCURRENT reconcile
+ // from the other site node (both nodes' SQLite empty after a fresh/cleared deploy,
+ // both reconciling at startup) or an in-flight deploy. Do NOT omit the item: if we
+ // did, the second concurrently-missing node would get 0 fetched and stay unhealed
+ // until a later restart. Instead, read the EXISTING pending row and emit a gap item
+ // carrying ITS DeploymentId/RevisionHash/Token. The fetch token is multi-use within
+ // its TTL, so both nodes fetch the same pending config and heal in the same round.
+ // (If the existing row is from an in-flight deploy its config is newer than the
+ // snapshot — fetching it is still correct; the site's guarded write handles ordering.)
+ var existing = await _deploymentRepository
+ .GetPendingDeploymentByInstanceIdAsync(exp.InstanceId, cancellationToken)
+ .ConfigureAwait(false);
+ if (existing != null)
+ {
+ _logger.LogDebug(
+ "Reconcile: pending row already exists for instance {Instance} (concurrent reconcile or in-flight deploy); returning existing token so this node heals too",
+ exp.InstanceUniqueName);
+ gap.Add(new ReconcileGapItem(
+ exp.InstanceUniqueName, existing.DeploymentId, existing.RevisionHash,
+ exp.IsEnabled, existing.Token));
+ }
+ else
+ {
+ // Raced away: the pending row was purged between the stage attempt and this
+ // read. Omit it — reconcile is best-effort and the node retries next round.
+ _logger.LogDebug(
+ "Reconcile: pending row for instance {Instance} disappeared between stage and read (purged race); omitting from gap",
+ exp.InstanceUniqueName);
+ }
continue;
}
diff --git a/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs b/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs
index 58f3e96d..050a088d 100644
--- a/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs
@@ -226,6 +226,19 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository
.FirstOrDefaultAsync(p => p.DeploymentId == deploymentId, cancellationToken);
}
+ ///
+ public Task GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default)
+ {
+ // At most one pending row per instance by design (supersession + stage-if-absent),
+ // but order deterministically and take the most recent so a hypothetical duplicate
+ // never makes the read non-deterministic — mirrors GetCurrentDeploymentStatusAsync.
+ return _dbContext.Set()
+ .Where(p => p.InstanceId == instanceId)
+ .OrderByDescending(p => p.CreatedAtUtc)
+ .ThenByDescending(p => p.Id)
+ .FirstOrDefaultAsync(cancellationToken);
+ }
+
///
public async Task DeletePendingDeploymentByIdAsync(string deploymentId, CancellationToken cancellationToken = default)
{
diff --git a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs
index c3894242..1943ef8d 100644
--- a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs
+++ b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs
@@ -117,8 +117,12 @@ public class ReconcileServiceTests
}
[Fact]
- public async Task Reconcile_StagePendingReturnsFalse_OmitsThatGapItem()
+ public async Task Reconcile_StagePendingReturnsFalse_ExistingRowPresent_ReturnsExistingTokenInGap()
{
+ // Live-found bug: two site nodes concurrently missing the SAME instance both reconcile at
+ // startup. The first stages a pending row; the second's StagePendingIfAbsent returns false.
+ // The second node must STILL heal — the handler reads the existing pending row and returns
+ // ITS deploymentId/revHash/token (the fetch token is multi-use within its TTL).
SiteResolves();
var b = Expected(2, "inst-B", "rev2", "dep-B");
var c = Expected(3, "inst-C", "rev3", "dep-C");
@@ -126,7 +130,8 @@ public class ReconcileServiceTests
SnapshotFor(b);
SnapshotFor(c);
- // Both missing locally, but C already has an in-flight pending row.
+ // B stages fresh (true). C already has a pending row (a concurrent reconcile from the
+ // other node, or an in-flight deploy, staged it first) → stage returns false.
StageReturns(true);
_deploymentRepo.StagePendingIfAbsentAsync(
3, Arg.Any(), Arg.Any(), Arg.Any(),
@@ -134,6 +139,52 @@ public class ReconcileServiceTests
Arg.Any())
.Returns(false);
+ // The existing pending row for inst-C carries the FIRST node's deploymentId + token.
+ var nowUtc = DateTimeOffset.UtcNow;
+ _deploymentRepo.GetPendingDeploymentByInstanceIdAsync(3, Arg.Any())
+ .Returns(new PendingDeployment(
+ "dep-C-existing", 3, "rev3-existing", "{\"cfg\":\"inst-C\"}",
+ "tok-C-existing", nowUtc, nowUtc.AddMinutes(5)));
+
+ var response = await CreateService().ReconcileAsync(Request(/* empty local inventory */));
+
+ Assert.Equal(2, response.Gap.Count);
+
+ var gapB = Assert.Single(response.Gap, g => g.InstanceUniqueName == "inst-B");
+ Assert.Equal("dep-B", gapB.DeploymentId);
+
+ // inst-C IS included, carrying the EXISTING row's deploymentId/revHash/token — NOT the
+ // snapshot's (snapshot.DeploymentId would be "dep-C").
+ var gapC = Assert.Single(response.Gap, g => g.InstanceUniqueName == "inst-C");
+ Assert.Equal("dep-C-existing", gapC.DeploymentId);
+ Assert.Equal("rev3-existing", gapC.RevisionHash);
+ Assert.Equal("tok-C-existing", gapC.FetchToken);
+ Assert.True(gapC.IsEnabled);
+ }
+
+ [Fact]
+ public async Task Reconcile_StagePendingReturnsFalse_NoExistingRow_OmitsThatGapItem()
+ {
+ // Fallback path: stage returns false, but the pending row raced away (was purged between
+ // the stage attempt and the read). Omit the item — the node retries on the next reconcile.
+ SiteResolves();
+ var b = Expected(2, "inst-B", "rev2", "dep-B");
+ var c = Expected(3, "inst-C", "rev3", "dep-C");
+ ExpectedSet(b, c);
+ SnapshotFor(b);
+ SnapshotFor(c);
+
+ StageReturns(true);
+ _deploymentRepo.StagePendingIfAbsentAsync(
+ 3, Arg.Any(), Arg.Any(), Arg.Any(),
+ Arg.Any(), Arg.Any(), Arg.Any(),
+ Arg.Any())
+ .Returns(false);
+
+ // The pending row was purged between the stage attempt and the read.
+ _deploymentRepo.GetPendingDeploymentByInstanceIdAsync(3, Arg.Any())
+ .Returns((PendingDeployment?)null);
+
var response = await CreateService().ReconcileAsync(Request(/* empty local inventory */));
var gap = Assert.Single(response.Gap);
diff --git a/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs
index 12fb8b48..093789bf 100644
--- a/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs
+++ b/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs
@@ -159,6 +159,36 @@ public class PendingDeploymentRepositoryTests : IDisposable
Assert.Null(await _repository.GetPendingDeploymentByIdAsync("does-not-exist"));
}
+ [Fact]
+ public async Task GetPendingDeploymentByInstanceId_ReturnsRow_WhenPresent()
+ {
+ // Startup-reconcile read path: when StagePendingIfAbsent reports a row already exists
+ // (concurrent reconcile from the other node, or an in-flight deploy), the handler reads
+ // the existing row by instance id to hand the second node the same fetch token.
+ var instanceId = await SeedInstanceAsync("Inst9");
+ await _repository.AddPendingDeploymentAsync(NewPending("dep9", instanceId, "{\"v\":9}"));
+ await _repository.SaveChangesAsync();
+
+ var row = await _repository.GetPendingDeploymentByInstanceIdAsync(instanceId);
+
+ Assert.NotNull(row);
+ Assert.Equal("dep9", row!.DeploymentId);
+ Assert.Equal(instanceId, row.InstanceId);
+ Assert.Equal("tok-dep9", row.Token);
+ Assert.Equal("{\"v\":9}", row.ConfigurationJson);
+ }
+
+ [Fact]
+ public async Task GetPendingDeploymentByInstanceId_ReturnsNull_WhenAbsent()
+ {
+ // No pending row staged for this instance → null (the reconcile fallback/omit path).
+ var instanceId = await SeedInstanceAsync("Inst10");
+
+ var row = await _repository.GetPendingDeploymentByInstanceIdAsync(instanceId);
+
+ Assert.Null(row);
+ }
+
[Fact]
public async Task AddPendingDeployment_MultiplePriorRowsSameInstance_AllSuperseded()
{