diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs index 9824e1d3..d3ca5e2d 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Repositories/IDeploymentManagerRepository.cs @@ -157,6 +157,20 @@ public interface IDeploymentManagerRepository /// The pending deployment, or null if not found. Task GetPendingDeploymentByIdAsync(string deploymentId, CancellationToken cancellationToken = default); /// + /// Gets the pending deployment staged for a given instance (there is at most one per instance + /// by design — supersedes and + /// inserts only if absent). Used by startup reconcile: + /// when reports a row already exists (a concurrent + /// reconcile from the other node, or an in-flight deploy), the handler reads that existing row + /// so a second concurrently-missing node can fetch the same pending config with its (multi-use, + /// TTL-bound) token instead of being omitted from the gap. Defensive against >1 row: returns + /// the most recently created one. + /// + /// The instance whose pending deployment to read. + /// A cancellation token that can be used to cancel the operation. + /// The pending deployment for the instance, or null if none is staged. + Task GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default); + /// /// Deletes a pending deployment by its deployment ID. No-op if not found. Does NOT /// call — the caller commits, mirroring the other /// Add/Delete repository methods. diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs index 3278ed30..c1f0f99a 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/ReconcileService.cs @@ -112,11 +112,12 @@ public class ReconcileService // Stage with the snapshot's DeploymentId as the deploymentId so the gap item's // DeploymentId + token point the node at the right pending row to fetch. // - // Reconcile staging is safe without a DB uniqueness guard: a gap arises only - // from one-node-down-during-a-successful-deploy, so at most one node ever - // reconciles a given instance (if BOTH were down the deploy failed and no - // snapshot exists, so it is never in the expected set). Deploy-time - // supersession serializes via the per-instance operation lock. + // StagePendingIfAbsent is insert-if-absent: if BOTH site nodes are concurrently + // missing the same instance (e.g. fresh container start / cleared SQLite after a + // successful deploy), both attempt to stage here. The first succeeds (true); the + // second gets false and is handled in the !staged branch below — it returns the + // existing pending row's token so it heals in the same round, rather than being + // omitted. Deploy-time supersession serializes via the per-instance operation lock. var staged = await _deploymentRepository.StagePendingIfAbsentAsync( exp.InstanceId, snapshot.DeploymentId, exp.RevisionHash, snapshot.ConfigurationJson, token, now, expiresAt, cancellationToken) @@ -124,12 +125,35 @@ public class ReconcileService if (!staged) { - // A pending row already exists — an in-flight deploy is mid-flight and its - // replication will deliver this instance to the node shortly. Omit it from - // the gap (reconcile is best-effort and re-runs). - _logger.LogDebug( - "Reconcile: pending row already exists for instance {Instance} (in-flight deploy); omitting from gap", - exp.InstanceUniqueName); + // A pending row already exists for this instance — either a CONCURRENT reconcile + // from the other site node (both nodes' SQLite empty after a fresh/cleared deploy, + // both reconciling at startup) or an in-flight deploy. Do NOT omit the item: if we + // did, the second concurrently-missing node would get 0 fetched and stay unhealed + // until a later restart. Instead, read the EXISTING pending row and emit a gap item + // carrying ITS DeploymentId/RevisionHash/Token. The fetch token is multi-use within + // its TTL, so both nodes fetch the same pending config and heal in the same round. + // (If the existing row is from an in-flight deploy its config is newer than the + // snapshot — fetching it is still correct; the site's guarded write handles ordering.) + var existing = await _deploymentRepository + .GetPendingDeploymentByInstanceIdAsync(exp.InstanceId, cancellationToken) + .ConfigureAwait(false); + if (existing != null) + { + _logger.LogDebug( + "Reconcile: pending row already exists for instance {Instance} (concurrent reconcile or in-flight deploy); returning existing token so this node heals too", + exp.InstanceUniqueName); + gap.Add(new ReconcileGapItem( + exp.InstanceUniqueName, existing.DeploymentId, existing.RevisionHash, + exp.IsEnabled, existing.Token)); + } + else + { + // Raced away: the pending row was purged between the stage attempt and this + // read. Omit it — reconcile is best-effort and the node retries next round. + _logger.LogDebug( + "Reconcile: pending row for instance {Instance} disappeared between stage and read (purged race); omitting from gap", + exp.InstanceUniqueName); + } continue; } diff --git a/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs b/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs index 58f3e96d..050a088d 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/Repositories/DeploymentManagerRepository.cs @@ -226,6 +226,19 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository .FirstOrDefaultAsync(p => p.DeploymentId == deploymentId, cancellationToken); } + /// + public Task GetPendingDeploymentByInstanceIdAsync(int instanceId, CancellationToken cancellationToken = default) + { + // At most one pending row per instance by design (supersession + stage-if-absent), + // but order deterministically and take the most recent so a hypothetical duplicate + // never makes the read non-deterministic — mirrors GetCurrentDeploymentStatusAsync. + return _dbContext.Set() + .Where(p => p.InstanceId == instanceId) + .OrderByDescending(p => p.CreatedAtUtc) + .ThenByDescending(p => p.Id) + .FirstOrDefaultAsync(cancellationToken); + } + /// public async Task DeletePendingDeploymentByIdAsync(string deploymentId, CancellationToken cancellationToken = default) { diff --git a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs index c3894242..1943ef8d 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/ReconcileServiceTests.cs @@ -117,8 +117,12 @@ public class ReconcileServiceTests } [Fact] - public async Task Reconcile_StagePendingReturnsFalse_OmitsThatGapItem() + public async Task Reconcile_StagePendingReturnsFalse_ExistingRowPresent_ReturnsExistingTokenInGap() { + // Live-found bug: two site nodes concurrently missing the SAME instance both reconcile at + // startup. The first stages a pending row; the second's StagePendingIfAbsent returns false. + // The second node must STILL heal — the handler reads the existing pending row and returns + // ITS deploymentId/revHash/token (the fetch token is multi-use within its TTL). SiteResolves(); var b = Expected(2, "inst-B", "rev2", "dep-B"); var c = Expected(3, "inst-C", "rev3", "dep-C"); @@ -126,7 +130,8 @@ public class ReconcileServiceTests SnapshotFor(b); SnapshotFor(c); - // Both missing locally, but C already has an in-flight pending row. + // B stages fresh (true). C already has a pending row (a concurrent reconcile from the + // other node, or an in-flight deploy, staged it first) → stage returns false. StageReturns(true); _deploymentRepo.StagePendingIfAbsentAsync( 3, Arg.Any(), Arg.Any(), Arg.Any(), @@ -134,6 +139,52 @@ public class ReconcileServiceTests Arg.Any()) .Returns(false); + // The existing pending row for inst-C carries the FIRST node's deploymentId + token. + var nowUtc = DateTimeOffset.UtcNow; + _deploymentRepo.GetPendingDeploymentByInstanceIdAsync(3, Arg.Any()) + .Returns(new PendingDeployment( + "dep-C-existing", 3, "rev3-existing", "{\"cfg\":\"inst-C\"}", + "tok-C-existing", nowUtc, nowUtc.AddMinutes(5))); + + var response = await CreateService().ReconcileAsync(Request(/* empty local inventory */)); + + Assert.Equal(2, response.Gap.Count); + + var gapB = Assert.Single(response.Gap, g => g.InstanceUniqueName == "inst-B"); + Assert.Equal("dep-B", gapB.DeploymentId); + + // inst-C IS included, carrying the EXISTING row's deploymentId/revHash/token — NOT the + // snapshot's (snapshot.DeploymentId would be "dep-C"). + var gapC = Assert.Single(response.Gap, g => g.InstanceUniqueName == "inst-C"); + Assert.Equal("dep-C-existing", gapC.DeploymentId); + Assert.Equal("rev3-existing", gapC.RevisionHash); + Assert.Equal("tok-C-existing", gapC.FetchToken); + Assert.True(gapC.IsEnabled); + } + + [Fact] + public async Task Reconcile_StagePendingReturnsFalse_NoExistingRow_OmitsThatGapItem() + { + // Fallback path: stage returns false, but the pending row raced away (was purged between + // the stage attempt and the read). Omit the item — the node retries on the next reconcile. + SiteResolves(); + var b = Expected(2, "inst-B", "rev2", "dep-B"); + var c = Expected(3, "inst-C", "rev3", "dep-C"); + ExpectedSet(b, c); + SnapshotFor(b); + SnapshotFor(c); + + StageReturns(true); + _deploymentRepo.StagePendingIfAbsentAsync( + 3, Arg.Any(), Arg.Any(), Arg.Any(), + Arg.Any(), Arg.Any(), Arg.Any(), + Arg.Any()) + .Returns(false); + + // The pending row was purged between the stage attempt and the read. + _deploymentRepo.GetPendingDeploymentByInstanceIdAsync(3, Arg.Any()) + .Returns((PendingDeployment?)null); + var response = await CreateService().ReconcileAsync(Request(/* empty local inventory */)); var gap = Assert.Single(response.Gap); diff --git a/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs b/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs index 12fb8b48..093789bf 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.Tests/PendingDeploymentRepositoryTests.cs @@ -159,6 +159,36 @@ public class PendingDeploymentRepositoryTests : IDisposable Assert.Null(await _repository.GetPendingDeploymentByIdAsync("does-not-exist")); } + [Fact] + public async Task GetPendingDeploymentByInstanceId_ReturnsRow_WhenPresent() + { + // Startup-reconcile read path: when StagePendingIfAbsent reports a row already exists + // (concurrent reconcile from the other node, or an in-flight deploy), the handler reads + // the existing row by instance id to hand the second node the same fetch token. + var instanceId = await SeedInstanceAsync("Inst9"); + await _repository.AddPendingDeploymentAsync(NewPending("dep9", instanceId, "{\"v\":9}")); + await _repository.SaveChangesAsync(); + + var row = await _repository.GetPendingDeploymentByInstanceIdAsync(instanceId); + + Assert.NotNull(row); + Assert.Equal("dep9", row!.DeploymentId); + Assert.Equal(instanceId, row.InstanceId); + Assert.Equal("tok-dep9", row.Token); + Assert.Equal("{\"v\":9}", row.ConfigurationJson); + } + + [Fact] + public async Task GetPendingDeploymentByInstanceId_ReturnsNull_WhenAbsent() + { + // No pending row staged for this instance → null (the reconcile fallback/omit path). + var instanceId = await SeedInstanceAsync("Inst10"); + + var row = await _repository.GetPendingDeploymentByInstanceIdAsync(instanceId); + + Assert.Null(row); + } + [Fact] public async Task AddPendingDeployment_MultiplePriorRowsSameInstance_AllSuperseded() {