fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR

CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL
IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the
at-least-once livelock on the site→central notification handoff.

DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent
same-tag subscribes don't orphan an adapter handle; delete the latent
dead _subscriptionHandles dictionary; stop double-counting
_totalSubscribed when an unresolved tag is promoted via another instance;
release adapter handles on mid-flight unsubscribe; gate the
tag-resolution retry timer with IsTimerActive so subscribe bursts don't
reset it into starvation.

SR-020: add _terminatingActorsByName shadow so a third deploy arriving
during a pending redeploy doesn't crash on InvalidActorNameException —
displaced senders get a Failed/superseded response and the latest
command wins on Terminated.

SR-024: split OperationTrackingStore reads from writes (fresh
SqliteConnection per GetStatusAsync) so long writes don't block status
queries; rewrite Dispose to drop the sync-over-async bridge that could
deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the
dispose-once flag race-safe across both paths.
This commit is contained in:
Joseph Doherty
2026-05-28 05:20:13 -04:00
parent 5d2386cc9d
commit f936f55f51
15 changed files with 1152 additions and 170 deletions
@@ -438,4 +438,115 @@ public class OperationTrackingStoreTests
Assert.NotNull(await store.GetStatusAsync(bId)); // kept (recent terminal)
Assert.NotNull(await store.GetStatusAsync(cId)); // kept (non-terminal)
}
// ── SiteRuntime-024: read/write split + sync-safe Dispose ──────────────
[Fact]
public async Task SR024_ConcurrentReads_DoNotBlockOnInFlightWrite()
{
    // SiteRuntime-024, performance half.
    //
    // Before the fix, GetStatusAsync and RecordTerminalAsync shared one _gate,
    // so a slow write serialized every status query behind it. After the fix,
    // each read opens its own SqliteConnection and bypasses the write gate, so
    // reads are expected to make progress while a write is in flight.
    //
    // Only correctness and forward progress are asserted here: a regression
    // shows up as a hang or as a failed read. A wall-clock comparison is
    // deliberately avoided because it would be flaky in CI.
    const int ParallelReads = 10;

    var (trackingStore, _) = CreateStore(nameof(SR024_ConcurrentReads_DoNotBlockOnInFlightWrite));
    await using var storeLifetime = trackingStore;

    // Seed the single row that every read below looks up.
    var operationId = TrackedOperationId.New();
    await trackingStore.RecordEnqueueAsync(
        operationId,
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // Start all reads first, then the write, without awaiting in between so
    // the operations overlap.
    var pendingReads = Enumerable.Repeat(operationId, ParallelReads)
        .Select(opId => trackingStore.GetStatusAsync(opId))
        .ToArray();
    var pendingWrite = trackingStore.RecordAttemptAsync(
        operationId, status: "Retrying", retryCount: 1, lastError: "transient", httpStatus: 503);

    var statuses = await Task.WhenAll(pendingReads);
    await pendingWrite;

    // Every read must have found the seeded row.
    foreach (var status in statuses)
    {
        Assert.NotNull(status);
    }
}
[Fact]
public async Task SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread()
{
    // Regression test for SiteRuntime-024 (deadlock half). Pre-fix, Dispose
    // bridged to async via DisposeAsyncCore().AsTask().GetAwaiter().GetResult()
    // — sync-over-async on a SemaphoreSlim can deadlock under a non-reentrant
    // SyncContext (host shutdown continuations). Post-fix, Dispose runs
    // synchronously without acquiring the gate.
    var (store, _) = CreateStore(nameof(SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread));

    // Seed a row so the store has live state when disposed.
    await store.RecordEnqueueAsync(
        TrackedOperationId.New(),
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // RunContinuationsAsynchronously prevents the awaiting test continuation
    // from being executed inline on the dispose thread when the result is set.
    var disposeReturned = new TaskCompletionSource<bool>(
        TaskCreationOptions.RunContinuationsAsynchronously);

    // Dispose runs on a dedicated background thread so that a hang cannot
    // block the test's own thread and cannot keep the process alive.
    var disposeThread = new Thread(() =>
    {
        try
        {
            store.Dispose();
            // TrySet* never throws, so nothing can escape this thread and
            // tear down the test host.
            disposeReturned.TrySetResult(true);
        }
        catch (Exception ex)
        {
            disposeReturned.TrySetException(ex);
        }
    }) { IsBackground = true };
    disposeThread.Start();

    // 5s ceiling — if Dispose deadlocks, the delay task wins the race and the
    // Assert.Same below fails because the dispose task is not the winner.
    var completed = await Task.WhenAny(
        disposeReturned.Task, Task.Delay(TimeSpan.FromSeconds(5)));
    Assert.Same(disposeReturned.Task, completed);

    // Awaiting rethrows any exception Dispose raised on the background thread.
    Assert.True(await disposeReturned.Task);
}
[Fact]
public async Task SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent()
{
    // Host shutdown commonly calls DisposeAsync and then Dispose on the same
    // instance. Per SiteRuntime-024 the _disposeState exchange is expected to
    // turn every call after the first into a no-op.
    const string TestName = nameof(SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent);
    var (trackingStore, _) = CreateStore(TestName);

    // Give the store some live state before tearing it down.
    var seededId = TrackedOperationId.New();
    await trackingStore.RecordEnqueueAsync(
        seededId,
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // First disposal: the async path does the real work.
    await trackingStore.DisposeAsync();

    // Second disposal via the sync path: must return quietly, not throw.
    trackingStore.Dispose();

    // Third disposal, async again: still a no-op.
    await trackingStore.DisposeAsync();
}
}