fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR
CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the at-least-once livelock on the site→central notification handoff. DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent same-tag subscribes don't orphan an adapter handle; delete the latent dead _subscriptionHandles dictionary; stop double-counting _totalSubscribed when an unresolved tag is promoted via another instance; release adapter handles on mid-flight unsubscribe; gate the tag-resolution retry timer with IsTimerActive so subscribe bursts don't reset it into starvation. SR-020: add _terminatingActorsByName shadow so a third deploy arriving during a pending redeploy doesn't crash on InvalidActorNameException — displaced senders get a Failed/superseded response and the latest command wins on Terminated. SR-024: split OperationTrackingStore reads from writes (fresh SqliteConnection per GetStatusAsync) so long writes don't block status queries; rewrite Dispose to drop the sync-over-async bridge that could deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the dispose-once flag race-safe across both paths.
This commit is contained in:
@@ -125,6 +125,65 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable
|
||||
Assert.True(disable.Success);
|
||||
}
|
||||
|
||||
[Fact]
public async Task SR020_ThreeRapidDeploys_DoNotThrowInvalidActorNameException_LatestWins()
{
    // SiteRuntime-020 regression guard.
    //
    // Scenario being protected (as described by the original fix notes):
    //   * Pending redeploys used to be tracked only by IActorRef
    //     (_pendingRedeploys); nothing was keyed by actor name.
    //   * If a third DeployInstanceCommand arrived while the predecessor of the
    //     first redeploy was still terminating, _instanceActors.TryGetValue
    //     returned false and the code fell through
    //     ApplyDeployment -> CreateInstanceActor -> Context.ActorOf.
    //   * Context.ActorOf then threw InvalidActorNameException, because the
    //     child name stays registered until Terminated is delivered.
    //   * The supervisor's Stop directive silently dropped that deploy, so the
    //     deployer waited forever and the persistence Task.Run was left dangling.
    //
    // Expected behaviour after the fix: _terminatingActorsByName remembers the
    // terminating child by name, the third deploy replaces the buffered pending
    // command (last write wins), and the displaced sender is told it was
    // superseded.
    const string instanceName = "RapidPump";
    var shortWait = TimeSpan.FromSeconds(5);
    var longWait = TimeSpan.FromSeconds(10);

    // Builds a deploy command for the instance under test; only the deployment
    // id and the third positional argument vary between the three deploys.
    DeployInstanceCommand BuildDeploy(string deploymentId, string thirdArg) =>
        new DeployInstanceCommand(
            deploymentId,
            instanceName,
            thirdArg,
            MakeConfigJson(instanceName),
            "admin",
            DateTimeOffset.UtcNow);

    var deploymentManager = CreateDeploymentManager();
    await Task.Delay(500);

    // Step 1: initial deploy, so that a running instance exists.
    deploymentManager.Tell(BuildDeploy("dep-1", "h1"));
    var initialResponse = ExpectMsg<DeploymentStatusResponse>(shortWait);
    Assert.Equal(DeploymentStatus.Success, initialResponse.Status);
    await Task.Delay(200);

    // Step 2: fire two redeploys back-to-back, each from its own probe, before
    // the predecessor can finish terminating. dep-2 stops (and watches) the
    // running actor and buffers itself; dep-3 lands right behind it and must
    // not crash - it replaces the buffered command and dep-2's sender is
    // notified that it was superseded.
    var displacedSender = CreateTestProbe();
    var latestSender = CreateTestProbe();

    deploymentManager.Tell(BuildDeploy("dep-2", "h2"), displacedSender.Ref);
    deploymentManager.Tell(BuildDeploy("dep-3", "h3"), latestSender.Ref);

    // Step 3: the displaced deploy reports Failed with a "superseded" message...
    var displacedResponse = displacedSender.ExpectMsg<DeploymentStatusResponse>(longWait);
    Assert.Equal("dep-2", displacedResponse.DeploymentId);
    Assert.Equal(DeploymentStatus.Failed, displacedResponse.Status);
    Assert.NotNull(displacedResponse.ErrorMessage);
    Assert.Contains("superseded", displacedResponse.ErrorMessage!, StringComparison.OrdinalIgnoreCase);

    // ...while the latest deploy succeeds once the predecessor has terminated.
    var latestResponse = latestSender.ExpectMsg<DeploymentStatusResponse>(longWait);
    Assert.Equal("dep-3", latestResponse.DeploymentId);
    Assert.Equal(DeploymentStatus.Success, latestResponse.Status);

    // Step 4: a follow-up lifecycle command still works, which shows there is
    // no orphaned actor and no half-created child still holding the name.
    deploymentManager.Tell(new DisableInstanceCommand("cmd-1", instanceName, DateTimeOffset.UtcNow));
    var disableResponse = ExpectMsg<InstanceLifecycleResponse>(shortWait);
    Assert.True(disableResponse.Success);
}
|
||||
|
||||
[Fact]
|
||||
public async Task Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user