fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR
CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the at-least-once livelock on the site→central notification handoff. DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent same-tag subscribes don't orphan an adapter handle; delete the latent dead _subscriptionHandles dictionary; stop double-counting _totalSubscribed when an unresolved tag is promoted via another instance; release adapter handles on mid-flight unsubscribe; gate the tag-resolution retry timer with IsTimerActive so subscribe bursts don't reset it into starvation. SR-020: add _terminatingActorsByName shadow so a third deploy arriving during a pending redeploy doesn't crash on InvalidActorNameException — displaced senders get a Failed/superseded response and the latest command wins on Terminated. SR-024: split OperationTrackingStore reads from writes (fresh SqliteConnection per GetStatusAsync) so long writes don't block status queries; rewrite Dispose to drop the sync-over-async bridge that could deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the dispose-once flag race-safe across both paths.
This commit is contained in:
@@ -125,6 +125,65 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable
|
||||
Assert.True(disable.Success);
|
||||
}
|
||||
|
||||
[Fact]
public async Task SR020_ThreeRapidDeploys_DoNotThrowInvalidActorNameException_LatestWins()
{
    // SiteRuntime-020 regression.
    //
    // Before the fix, pending redeploys were tracked by IActorRef only
    // (_pendingRedeploys) with no name-keyed shadow. A third
    // DeployInstanceCommand that arrived WHILE the first redeploy's
    // predecessor was still terminating found nothing in _instanceActors and
    // fell through ApplyDeployment → CreateInstanceActor → Context.ActorOf,
    // which threw InvalidActorNameException because the child name stays
    // registered until Terminated fires. The supervisor's Stop directive then
    // silently dropped the deploy: the deployer waited forever and the
    // persistence Task.Run was left dangling.
    //
    // After the fix, _terminatingActorsByName tracks the in-flight terminator
    // by name. The third deploy replaces the buffered pending command
    // (last-write-wins) and the displaced sender is told it was superseded.
    const string instanceName = "RapidPump";
    var replyTimeout = TimeSpan.FromSeconds(10);

    // Builds a deploy command for the shared instance; only the deployment id
    // and the hash differ between the three deploys issued below.
    DeployInstanceCommand BuildDeploy(string deploymentId, string hash)
    {
        return new DeployInstanceCommand(
            deploymentId, instanceName, hash, MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow);
    }

    var manager = CreateDeploymentManager();
    await Task.Delay(500);

    // Step 1: the initial deploy brings up the running instance.
    manager.Tell(BuildDeploy("dep-1", "h1"));
    var initial = ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(5));
    Assert.Equal(DeploymentStatus.Success, initial.Status);
    await Task.Delay(200);

    // Step 2: two back-to-back redeploys, sent before the predecessor can
    // finish terminating. dep-2 stops (and watches) the actor and buffers
    // itself; dep-3 lands right behind it and must NOT crash — it takes over
    // the buffered slot and dep-2's sender is told it was superseded.
    var secondSender = CreateTestProbe();
    var thirdSender = CreateTestProbe();

    manager.Tell(BuildDeploy("dep-2", "h2"), secondSender.Ref);
    manager.Tell(BuildDeploy("dep-3", "h3"), thirdSender.Ref);

    // Step 3: the displaced deploy reports Failed/superseded ...
    var displaced = secondSender.ExpectMsg<DeploymentStatusResponse>(replyTimeout);
    Assert.Equal("dep-2", displaced.DeploymentId);
    Assert.Equal(DeploymentStatus.Failed, displaced.Status);
    Assert.NotNull(displaced.ErrorMessage);
    Assert.Contains("superseded", displaced.ErrorMessage!, StringComparison.OrdinalIgnoreCase);

    // ... and the latest deploy succeeds once the predecessor has terminated.
    var latest = thirdSender.ExpectMsg<DeploymentStatusResponse>(replyTimeout);
    Assert.Equal("dep-3", latest.DeploymentId);
    Assert.Equal(DeploymentStatus.Success, latest.Status);

    // Step 4: the instance still answers lifecycle commands, which shows no
    // orphaned actor or half-created child is holding on to the name.
    manager.Tell(new DisableInstanceCommand("cmd-1", instanceName, DateTimeOffset.UtcNow));
    var disableReply = ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5));
    Assert.True(disableReply.Success);
}
|
||||
|
||||
[Fact]
|
||||
public async Task Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances()
|
||||
{
|
||||
|
||||
@@ -438,4 +438,115 @@ public class OperationTrackingStoreTests
|
||||
Assert.NotNull(await store.GetStatusAsync(bId)); // kept (recent terminal)
|
||||
Assert.NotNull(await store.GetStatusAsync(cId)); // kept (non-terminal)
|
||||
}
|
||||
|
||||
// ── SiteRuntime-024: read/write split + sync-safe Dispose ──────────────
|
||||
|
||||
[Fact]
public async Task SR024_ConcurrentReads_DoNotBlockOnInFlightWrite()
{
    // SiteRuntime-024 regression (performance half).
    //
    // Before the fix GetStatusAsync shared the _gate used by
    // RecordTerminalAsync, so one slow write queued every concurrent status
    // query behind it. After the fix each read opens its own SqliteConnection
    // and never touches the write gate, so reads may overlap a write.
    //
    // Strategy: seed one row, start a batch of parallel reads plus one write,
    // and require that everything completes and every read sees the row. A
    // regression would hang the runner or take far longer than the gate would
    // have allowed a single read. Timing comparisons are deliberately not
    // asserted (flaky in CI) — only correctness and forward progress.
    const int parallelReadCount = 10;

    var (store, _) = CreateStore(nameof(SR024_ConcurrentReads_DoNotBlockOnInFlightWrite));
    await using (store)
    {
        var operationId = TrackedOperationId.New();
        await store.RecordEnqueueAsync(
            operationId,
            kind: "ApiCallCached",
            targetSummary: "ERP.GetOrder",
            sourceInstanceId: null,
            sourceScript: null,
            sourceNode: "node-a");

        // Start the reads first, then the write, without awaiting in between
        // so that all of them are outstanding at the same time.
        var pendingReads = Enumerable.Range(0, parallelReadCount)
            .Select(_ => store.GetStatusAsync(operationId))
            .ToArray();
        var pendingWrite = store.RecordAttemptAsync(
            operationId, status: "Retrying", retryCount: 1, lastError: "transient", httpStatus: 503);

        // Reads are awaited before the write, matching the order in which the
        // operations were started.
        var statuses = await Task.WhenAll(pendingReads);
        await pendingWrite;

        // Every read must have found the seeded row.
        foreach (var status in statuses)
        {
            Assert.NotNull(status);
        }
    }
}
|
||||
|
||||
[Fact]
public async Task SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread()
{
    // Regression test for SiteRuntime-024 (deadlock half). Pre-fix, Dispose
    // bridged to async via DisposeAsyncCore().AsTask().GetAwaiter().GetResult()
    // — sync-over-async on a SemaphoreSlim can deadlock under a non-reentrant
    // SyncContext (host shutdown continuations). Post-fix, Dispose runs
    // synchronously without acquiring the gate.
    var (store, _) = CreateStore(nameof(SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread));

    // Seed a row so the store has live state when disposed.
    await store.RecordEnqueueAsync(
        TrackedOperationId.New(),
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // RunContinuationsAsynchronously keeps the awaiting test continuation from
    // being executed inline on the dispose thread inside SetResult/SetException;
    // the dedicated thread should do nothing except call Dispose and signal.
    var disposeReturned = new TaskCompletionSource<bool>(
        TaskCreationOptions.RunContinuationsAsynchronously);

    // Dispose is invoked from a dedicated thread so that a hang blocks only
    // that thread; IsBackground lets the process exit even if it never returns.
    var disposeThread = new Thread(() =>
    {
        try
        {
            store.Dispose();
            disposeReturned.SetResult(true);
        }
        catch (Exception ex)
        {
            // Surface a throwing Dispose to the awaiting test instead of
            // crashing the process with an unhandled thread exception.
            disposeReturned.SetException(ex);
        }
    }) { IsBackground = true };

    disposeThread.Start();

    // 5s ceiling — if Dispose deadlocks, Task.WhenAny returns the delay task
    // and the Assert.Same below fails the test.
    var completed = await Task.WhenAny(
        disposeReturned.Task, Task.Delay(TimeSpan.FromSeconds(5)));
    Assert.Same(disposeReturned.Task, completed);

    // Awaiting the task rethrows anything Dispose threw on the other thread.
    Assert.True(await disposeReturned.Task);
}
|
||||
|
||||
[Fact]
public async Task SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent()
{
    // Host shutdown commonly calls DisposeAsync and then Dispose on the same
    // object, so the async path has to tolerate a later synchronous Dispose.
    // The _disposeState exchange is expected to short-circuit every call
    // after the first one.
    var (sut, _) = CreateStore(nameof(SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent));

    // Give the store some persisted state before tearing it down.
    var seededId = TrackedOperationId.New();
    await sut.RecordEnqueueAsync(
        seededId,
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // 1st call: the real teardown.
    await sut.DisposeAsync();

    // 2nd call (sync): must be a silent no-op rather than throw.
    sut.Dispose();

    // 3rd call (async again): likewise a no-op.
    await sut.DisposeAsync();
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user