fix(reconcile): heal all concurrently-missing nodes — return existing pending token instead of omitting
This commit is contained in:
@@ -117,8 +117,12 @@ public class ReconcileServiceTests
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Reconcile_StagePendingReturnsFalse_OmitsThatGapItem()
|
||||
public async Task Reconcile_StagePendingReturnsFalse_ExistingRowPresent_ReturnsExistingTokenInGap()
|
||||
{
|
||||
// Live-found bug: two site nodes concurrently missing the SAME instance both reconcile at
|
||||
// startup. The first stages a pending row; the second's StagePendingIfAbsent returns false.
|
||||
// The second node must STILL heal — the handler reads the existing pending row and returns
|
||||
// ITS deploymentId/revHash/token (the fetch token is multi-use within its TTL).
|
||||
SiteResolves();
|
||||
var b = Expected(2, "inst-B", "rev2", "dep-B");
|
||||
var c = Expected(3, "inst-C", "rev3", "dep-C");
|
||||
@@ -126,7 +130,8 @@ public class ReconcileServiceTests
|
||||
SnapshotFor(b);
|
||||
SnapshotFor(c);
|
||||
|
||||
// Both missing locally, but C already has an in-flight pending row.
|
||||
// B stages fresh (true). C already has a pending row (a concurrent reconcile from the
|
||||
// other node, or an in-flight deploy, staged it first) → stage returns false.
|
||||
StageReturns(true);
|
||||
_deploymentRepo.StagePendingIfAbsentAsync(
|
||||
3, Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(),
|
||||
@@ -134,6 +139,52 @@ public class ReconcileServiceTests
|
||||
Arg.Any<CancellationToken>())
|
||||
.Returns(false);
|
||||
|
||||
// The existing pending row for inst-C carries the FIRST node's deploymentId + token.
|
||||
var nowUtc = DateTimeOffset.UtcNow;
|
||||
_deploymentRepo.GetPendingDeploymentByInstanceIdAsync(3, Arg.Any<CancellationToken>())
|
||||
.Returns(new PendingDeployment(
|
||||
"dep-C-existing", 3, "rev3-existing", "{\"cfg\":\"inst-C\"}",
|
||||
"tok-C-existing", nowUtc, nowUtc.AddMinutes(5)));
|
||||
|
||||
var response = await CreateService().ReconcileAsync(Request(/* empty local inventory */));
|
||||
|
||||
Assert.Equal(2, response.Gap.Count);
|
||||
|
||||
var gapB = Assert.Single(response.Gap, g => g.InstanceUniqueName == "inst-B");
|
||||
Assert.Equal("dep-B", gapB.DeploymentId);
|
||||
|
||||
// inst-C IS included, carrying the EXISTING row's deploymentId/revHash/token — NOT the
|
||||
// snapshot's (snapshot.DeploymentId would be "dep-C").
|
||||
var gapC = Assert.Single(response.Gap, g => g.InstanceUniqueName == "inst-C");
|
||||
Assert.Equal("dep-C-existing", gapC.DeploymentId);
|
||||
Assert.Equal("rev3-existing", gapC.RevisionHash);
|
||||
Assert.Equal("tok-C-existing", gapC.FetchToken);
|
||||
Assert.True(gapC.IsEnabled);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Reconcile_StagePendingReturnsFalse_NoExistingRow_OmitsThatGapItem()
|
||||
{
|
||||
// Fallback path: stage returns false, but the pending row raced away (was purged between
|
||||
// the stage attempt and the read). Omit the item — the node retries on the next reconcile.
|
||||
SiteResolves();
|
||||
var b = Expected(2, "inst-B", "rev2", "dep-B");
|
||||
var c = Expected(3, "inst-C", "rev3", "dep-C");
|
||||
ExpectedSet(b, c);
|
||||
SnapshotFor(b);
|
||||
SnapshotFor(c);
|
||||
|
||||
StageReturns(true);
|
||||
_deploymentRepo.StagePendingIfAbsentAsync(
|
||||
3, Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(),
|
||||
Arg.Any<string>(), Arg.Any<DateTimeOffset>(), Arg.Any<DateTimeOffset>(),
|
||||
Arg.Any<CancellationToken>())
|
||||
.Returns(false);
|
||||
|
||||
// The pending row was purged between the stage attempt and the read.
|
||||
_deploymentRepo.GetPendingDeploymentByInstanceIdAsync(3, Arg.Any<CancellationToken>())
|
||||
.Returns((PendingDeployment?)null);
|
||||
|
||||
var response = await CreateService().ReconcileAsync(Request(/* empty local inventory */));
|
||||
|
||||
var gap = Assert.Single(response.Gap);
|
||||
|
||||
Reference in New Issue
Block a user