fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR

CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL
IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the
at-least-once livelock on the site→central notification handoff.

DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent
same-tag subscribes don't orphan an adapter handle; delete the latent
dead _subscriptionHandles dictionary; stop double-counting
_totalSubscribed when an unresolved tag is promoted via another instance;
release adapter handles on mid-flight unsubscribe; gate the
tag-resolution retry timer with IsTimerActive so subscribe bursts don't
reset it into starvation.

SR-020: add _terminatingActorsByName shadow so a third deploy arriving
during a pending redeploy doesn't crash on InvalidActorNameException —
displaced senders get a Failed/superseded response and the latest
command wins on Terminated.

SR-024: split OperationTrackingStore reads from writes (fresh
SqliteConnection per GetStatusAsync) so long writes don't block status
queries; rewrite Dispose to drop the sync-over-async bridge that could
deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the
dispose-once flag race-safe across both paths.
This commit is contained in:
Joseph Doherty
2026-05-28 05:20:13 -04:00
parent 5d2386cc9d
commit f936f55f51
15 changed files with 1152 additions and 170 deletions
@@ -125,6 +125,65 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable
Assert.True(disable.Success);
}
[Fact]
public async Task SR020_ThreeRapidDeploys_DoNotThrowInvalidActorNameException_LatestWins()
{
    // SiteRuntime-020 regression guard.
    //
    // Before the fix, pending redeploys were tracked only by IActorRef
    // (_pendingRedeploys) with no name-keyed companion. When a third
    // DeployInstanceCommand showed up while the predecessor from the first
    // redeploy was still shutting down, _instanceActors.TryGetValue returned
    // false and the command went down the ApplyDeployment →
    // CreateInstanceActor → Context.ActorOf path. The child name stays
    // registered until Terminated fires, so ActorOf threw
    // InvalidActorNameException; the supervisor's Stop directive then dropped
    // the deploy silently, so the deployer waited forever and the persistence
    // Task.Run was left dangling.
    //
    // With the fix, _terminatingActorsByName records the in-flight terminator
    // by name. The third deploy replaces the buffered pending command
    // (last-write-wins) and the displaced sender is told it was superseded.
    const string instanceName = "RapidPump";
    var lifecycleTimeout = TimeSpan.FromSeconds(5);
    var redeployTimeout = TimeSpan.FromSeconds(10);

    // Builds a deploy command for the instance under test; only the
    // deployment id and the third constructor argument vary between calls.
    DeployInstanceCommand BuildDeploy(string deploymentId, string tag) =>
        new DeployInstanceCommand(
            deploymentId, instanceName, tag, MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow);

    var manager = CreateDeploymentManager();
    await Task.Delay(500);

    // Step 1: the initial deploy brings up the running instance.
    manager.Tell(BuildDeploy("dep-1", "h1"));
    var initial = ExpectMsg<DeploymentStatusResponse>(lifecycleTimeout);
    Assert.Equal(DeploymentStatus.Success, initial.Status);
    await Task.Delay(200);

    // Step 2: fire two redeploys back-to-back, before the predecessor can
    // finish terminating. dep-2 stops (and watches) the running actor and
    // buffers itself; dep-3 lands right behind it and must not crash —
    // it replaces the buffered command and dep-2 is told it was superseded.
    var secondSender = CreateTestProbe();
    var thirdSender = CreateTestProbe();
    manager.Tell(BuildDeploy("dep-2", "h2"), secondSender.Ref);
    manager.Tell(BuildDeploy("dep-3", "h3"), thirdSender.Ref);

    // Step 3: the displaced deploy is reported as Failed/superseded ...
    var displaced = secondSender.ExpectMsg<DeploymentStatusResponse>(redeployTimeout);
    Assert.Equal("dep-2", displaced.DeploymentId);
    Assert.Equal(DeploymentStatus.Failed, displaced.Status);
    Assert.NotNull(displaced.ErrorMessage);
    Assert.Contains("superseded", displaced.ErrorMessage!, StringComparison.OrdinalIgnoreCase);

    // ... and the latest deploy succeeds once the predecessor has terminated.
    var latest = thirdSender.ExpectMsg<DeploymentStatusResponse>(redeployTimeout);
    Assert.Equal("dep-3", latest.DeploymentId);
    Assert.Equal(DeploymentStatus.Success, latest.Status);

    // Step 4: the instance still responds to lifecycle commands, which shows
    // there is no orphaned actor or half-created child holding the name.
    manager.Tell(new DisableInstanceCommand("cmd-1", instanceName, DateTimeOffset.UtcNow));
    var disableReply = ExpectMsg<InstanceLifecycleResponse>(lifecycleTimeout);
    Assert.True(disableReply.Success);
}
[Fact]
public async Task Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances()
{