fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR
CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the at-least-once livelock on the site→central notification handoff. DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent same-tag subscribes don't orphan an adapter handle; delete the latent dead _subscriptionHandles dictionary; stop double-counting _totalSubscribed when an unresolved tag is promoted via another instance; release adapter handles on mid-flight unsubscribe; gate the tag-resolution retry timer with IsTimerActive so subscribe bursts don't reset it into starvation. SR-020: add _terminatingActorsByName shadow so a third deploy arriving during a pending redeploy doesn't crash on InvalidActorNameException — displaced senders get a Failed/superseded response and the latest command wins on Terminated. SR-024: split OperationTrackingStore reads from writes (fresh SqliteConnection per GetStatusAsync) so long writes don't block status queries; rewrite Dispose to drop the sync-over-async bridge that could deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the dispose-once flag race-safe across both paths.
This commit is contained in:
@@ -438,4 +438,115 @@ public class OperationTrackingStoreTests
|
||||
Assert.NotNull(await store.GetStatusAsync(bId)); // kept (recent terminal)
|
||||
Assert.NotNull(await store.GetStatusAsync(cId)); // kept (non-terminal)
|
||||
}
|
||||
|
||||
// ── SiteRuntime-024: read/write split + sync-safe Dispose ──────────────
|
||||
|
||||
[Fact]
public async Task SR024_ConcurrentReads_DoNotBlockOnInFlightWrite()
{
    // SiteRuntime-024 regression (perf half).
    //
    // Before the fix GetStatusAsync and RecordTerminalAsync shared one _gate,
    // so a single slow write serialized every concurrent status query behind
    // it. After the fix each read opens its own SqliteConnection and never
    // touches the write gate, so reads are expected to proceed alongside a
    // write.
    //
    // A wall-clock comparison would be flaky in CI, so this test deliberately
    // checks only correctness and forward progress: every read and the write
    // must complete, and every read must see the seeded row. A regression
    // would show up as a hang of the test runner.
    //
    // NOTE(review): the reads are started before the write is issued; whether
    // they genuinely overlap depends on how the store schedules its work —
    // confirm against the store implementation.
    const int readerCount = 10;

    var (sut, _) = CreateStore(nameof(SR024_ConcurrentReads_DoNotBlockOnInFlightWrite));
    await using (sut)
    {
        // Seed the single row that every reader will look up.
        var operationId = TrackedOperationId.New();
        await sut.RecordEnqueueAsync(
            operationId,
            kind: "ApiCallCached",
            targetSummary: "ERP.GetOrder",
            sourceInstanceId: null,
            sourceScript: null,
            sourceNode: "node-a");

        // Kick off the readers, then one write; nothing is awaited until all
        // of them have been started.
        var pendingReads = Enumerable.Range(0, readerCount)
            .Select(_ => sut.GetStatusAsync(operationId))
            .ToArray();
        var pendingWrite = sut.RecordAttemptAsync(
            operationId, status: "Retrying", retryCount: 1, lastError: "transient", httpStatus: 503);

        var observedStatuses = await Task.WhenAll(pendingReads);
        await pendingWrite;

        // Every reader must have found the seeded operation.
        foreach (var observed in observedStatuses)
        {
            Assert.NotNull(observed);
        }
    }
}
|
||||
|
||||
[Fact]
public async Task SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread()
{
    // Regression test for SiteRuntime-024 (deadlock half). Pre-fix, Dispose
    // bridged to async via DisposeAsyncCore().AsTask().GetAwaiter().GetResult()
    // — sync-over-async on a SemaphoreSlim can deadlock under a non-reentrant
    // SyncContext (host shutdown continuations). Post-fix, Dispose runs
    // synchronously without acquiring the gate.
    var (store, _) = CreateStore(nameof(SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread));

    // Seed a row so the store has live state when disposed.
    await store.RecordEnqueueAsync(
        TrackedOperationId.New(),
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // RunContinuationsAsynchronously prevents the code awaiting this task from
    // being resumed inline on the dispose thread inside SetResult/SetException;
    // the dedicated thread should do nothing but run Dispose and report back.
    var disposeReturned = new TaskCompletionSource<bool>(
        TaskCreationOptions.RunContinuationsAsynchronously);

    // Run the synchronous Dispose on its own thread so a hang cannot block the
    // test itself; the outcome (success or exception) is relayed via the TCS.
    var disposeThread = new Thread(() =>
    {
        try
        {
            store.Dispose();
            disposeReturned.SetResult(true);
        }
        catch (Exception ex)
        {
            disposeReturned.SetException(ex);
        }
    }) { IsBackground = true };

    disposeThread.Start();

    // 5s ceiling — if Dispose deadlocks, the delay task wins the race and the
    // Assert.Same below fails (no exception is thrown by the timeout itself).
    var completed = await Task.WhenAny(
        disposeReturned.Task, Task.Delay(TimeSpan.FromSeconds(5)));
    Assert.Same(disposeReturned.Task, completed);

    // Awaiting rethrows anything Dispose threw on the background thread.
    Assert.True(await disposeReturned.Task);
}
|
||||
|
||||
[Fact]
public async Task SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent()
{
    // Host shutdown typically disposes asynchronously and may then call the
    // synchronous Dispose() as well. The store is expected to treat every
    // disposal after the first as a no-op; the _disposeState exchange is what
    // should short-circuit the repeats.
    var (sut, _) = CreateStore(nameof(SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent));

    // Give the store some live state before tearing it down.
    var operationId = TrackedOperationId.New();
    await sut.RecordEnqueueAsync(
        operationId,
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // 1st disposal: the real one, via the async path.
    await sut.DisposeAsync();

    // 2nd disposal: sync path after async — must neither throw nor hang.
    sut.Dispose();

    // 3rd disposal: async path again — still a no-op.
    await sut.DisposeAsync();
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user