fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR
CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the at-least-once livelock on the site→central notification handoff. DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent same-tag subscribes don't orphan an adapter handle; delete the latent dead _subscriptionHandles dictionary; stop double-counting _totalSubscribed when an unresolved tag is promoted via another instance; release adapter handles on mid-flight unsubscribe; gate the tag-resolution retry timer with IsTimerActive so subscribe bursts don't reset it into starvation. SR-020: add _terminatingActorsByName shadow so a third deploy arriving during a pending redeploy doesn't crash on InvalidActorNameException — displaced senders get a Failed/superseded response and the latest command wins on Terminated. SR-024: split OperationTrackingStore reads from writes (fresh SqliteConnection per GetStatusAsync) so long writes don't block status queries; rewrite Dispose to drop the sync-over-async bridge that could deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the dispose-once flag race-safe across both paths.
This commit is contained in:
@@ -730,6 +730,306 @@ public class DataConnectionActorTests : TestKit
|
||||
subscribeGate.SetCanceled();
|
||||
}
|
||||
|
||||
// ── DataConnectionLayer-018: concurrent same-tag subscribes must not orphan adapter handles ──
|
||||
|
||||
[Fact]
public async Task DCL018_ConcurrentSubscribes_SameTag_DifferentInstances_IssueOneAdapterSubscribe()
{
    // Regression test for DataConnectionLayer-018. Before the fix, HandleSubscribe
    // snapshotted _subscriptionIds.Keys on the actor thread BEFORE the Task.Run
    // I/O. Two SubscribeTagsRequest messages for different instances sharing a tag
    // would both observe "not subscribed" (the first request's SubscribeCompleted
    // hadn't yet posted), both call _adapter.SubscribeAsync, and the second
    // subscription id would be silently dropped at the
    // _subscriptionIds.ContainsKey guard in HandleSubscribeCompleted — orphaning
    // the adapter's monitored item permanently. With the _subscribesInFlight
    // guard, the second request observes the tag in flight and treats it as
    // AlreadySubscribed without issuing a second adapter call.

    // RunContinuationsAsynchronously: subscribeStartedFirst is completed from inside
    // the adapter mock callback and releaseFirst from the test body. Without the flag
    // an awaiting continuation may run inline on the completing thread, which would
    // let test code execute inside the mock callback (or adapter-side code execute
    // inside SetResult).
    var subscribeStartedFirst = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var releaseFirst = new TaskCompletionSource<string>(TaskCreationOptions.RunContinuationsAsynchronously);
    var subscribeCallCount = 0;

    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync("shared/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            var n = Interlocked.Increment(ref subscribeCallCount);
            if (n == 1)
            {
                // Park the first subscribe so the second SubscribeTagsRequest
                // arrives on the actor thread while the first I/O is still in flight.
                subscribeStartedFirst.TrySetResult();
                return releaseFirst.Task;
            }

            // The fix prevents this branch. It completes immediately rather than
            // throwing; a regression is caught by the subscribeCallCount == 1
            // assertion further down.
            return Task.FromResult("sub-unexpected-" + n);
        });
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl018-shared-tag");
    await Task.Delay(300);

    // Request 1 — instance A. Park its adapter call in flight.
    actor.Tell(new SubscribeTagsRequest(
        "c1", "instA", "dcl018-shared-tag", ["shared/tag"], DateTimeOffset.UtcNow));
    await subscribeStartedFirst.Task.WaitAsync(TimeSpan.FromSeconds(5));

    // Request 2 — instance B, same tag, sent while instance A's I/O is still in flight.
    actor.Tell(new SubscribeTagsRequest(
        "c2", "instB", "dcl018-shared-tag", ["shared/tag"], DateTimeOffset.UtcNow));

    // Instance B's ack must come back before we release instance A — that proves
    // instance B's request did NOT issue its own adapter SubscribeAsync (which is
    // blocked) and instead saw the tag as in-flight. FishForMessage skips any
    // unrelated message and fails with a timeout assertion after five seconds,
    // replacing the former poll loop whose bare catch swallowed every exception.
    var bResponse = FishForMessage<SubscribeTagsResponse>(
        response => response.InstanceUniqueName == "instB",
        TimeSpan.FromSeconds(5),
        "instB ack while instA's adapter subscribe is still parked");

    Assert.True(bResponse.Success);
    Assert.Equal(1, Volatile.Read(ref subscribeCallCount));

    // Release instance A's subscribe so the test cleans up.
    releaseFirst.SetResult("sub-shared");
    ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));

    actor.Tell(new DataConnectionActor.GetHealthReport());
    var report = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(5));
    // One tag shared across two instances must count as exactly one subscription.
    // DCL-020 (#28) tracks the related "previously-unresolved tag promoted via
    // another instance" double-count case; here the tag was never unresolved.
    Assert.Equal(1, report.ResolvedTags);
}
|
||||
|
||||
// ── DataConnectionLayer-020: previously-unresolved tag, resolved via different instance, must not double-count ──
|
||||
|
||||
[Fact]
public async Task DCL020_UnresolvedTagPromoted_ByDifferentInstance_DoesNotDoubleCountTotalSubscribed()
{
    // DataConnectionLayer-020 regression. Scenario:
    //   1. Instance A subscribes to "promote/tag"; the adapter call fails at the
    //      resolution layer, so the tag lands in _unresolvedTags and
    //      _totalSubscribed becomes 1.
    //   2. Instance B subscribes to the same tag and the adapter call succeeds.
    // Before the fix the success branch of HandleSubscribeCompleted always ran
    // ++_totalSubscribed, reporting 2 for a single logical subscription. After
    // the fix it recognises the unresolved-tag promotion and bumps only
    // _resolvedTags.
    var timeout = TimeSpan.FromSeconds(5);
    var adapterAttempts = 0;

    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync("promote/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            // First attempt fails to resolve; every later attempt succeeds.
            var attempt = Interlocked.Increment(ref adapterAttempts);
            return attempt == 1
                ? Task.FromException<string>(new KeyNotFoundException("not yet"))
                : Task.FromResult("sub-promote-" + attempt);
        });
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var connection = CreateConnectionActor("dcl020-promote");
    await Task.Delay(300);

    // Instance A: resolution failure → tag recorded as unresolved, _totalSubscribed=1.
    var requestA = new SubscribeTagsRequest(
        "c1", "instA", "dcl020-promote", ["promote/tag"], DateTimeOffset.UtcNow);
    connection.Tell(requestA);
    ExpectMsg<TagValueUpdate>(timeout); // bad-quality push
    ExpectMsg<SubscribeTagsResponse>(timeout);

    // Instance B: same tag, adapter succeeds this time (second attempt).
    var requestB = new SubscribeTagsRequest(
        "c2", "instB", "dcl020-promote", ["promote/tag"], DateTimeOffset.UtcNow);
    connection.Tell(requestB);
    ExpectMsg<SubscribeTagsResponse>(timeout);

    connection.Tell(new DataConnectionActor.GetHealthReport());
    var health = ExpectMsg<DataConnectionHealthReport>(timeout);

    // Pre-fix:  TotalSubscribedTags=2, ResolvedTags=1 (double-count).
    // Post-fix: TotalSubscribedTags=1, ResolvedTags=1 — one logical tag, one resolved.
    Assert.Equal(1, health.TotalSubscribedTags);
    Assert.Equal(1, health.ResolvedTags);
}
|
||||
|
||||
[Fact]
public async Task DCL020_TwoInstancesFailingSameTag_OnlyCountsTagOnceInTotal()
{
    // DataConnectionLayer-020, symmetric failure branch. When two instances both
    // fail to resolve the same tag, _unresolvedTags must contain one entry and
    // _totalSubscribed must be 1. Before the fix the failure branch ran
    // _totalSubscribed++ unconditionally, so the second instance's failure
    // double-counted the tag.
    var timeout = TimeSpan.FromSeconds(5);

    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync("missing/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(Task.FromException<string>(new KeyNotFoundException("not found")));
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var connection = CreateConnectionActor("dcl020-twofail");
    await Task.Delay(300);

    // Each instance subscribes in turn; every attempt yields a TagValueUpdate
    // followed by the SubscribeTagsResponse ack, awaited before the next send.
    (string CorrelationId, string Instance)[] subscribers = [("c1", "instA"), ("c2", "instB")];
    foreach (var (correlationId, instance) in subscribers)
    {
        connection.Tell(new SubscribeTagsRequest(
            correlationId, instance, "dcl020-twofail", ["missing/tag"], DateTimeOffset.UtcNow));
        ExpectMsg<TagValueUpdate>(timeout);
        ExpectMsg<SubscribeTagsResponse>(timeout);
    }

    connection.Tell(new DataConnectionActor.GetHealthReport());
    var health = ExpectMsg<DataConnectionHealthReport>(timeout);

    Assert.Equal(1, health.TotalSubscribedTags);
    Assert.Equal(0, health.ResolvedTags);
}
|
||||
|
||||
// ── DataConnectionLayer-021: mid-flight unsubscribe must release adapter handle and drop state ──
|
||||
|
||||
[Fact]
public async Task DCL021_UnsubscribeDuringInFlightSubscribe_ReleasesAdapterHandle_AndKeepsStateClean()
{
    // Regression test for DataConnectionLayer-021. Previously HandleSubscribeCompleted
    // re-created _subscriptionsByInstance[instanceName] when the instance had been
    // unsubscribed while the subscribe I/O was in flight, and then ran the same
    // counter/handle mutations as the happy path. The leak permanently inflated
    // _subscriptionsByInstance, _tagSubscriberCount, and _totalSubscribed (also re-
    // issued by ReSubscribeAll after every reconnect), and orphaned the adapter
    // monitored item. After the fix, the stale completion is logged + dropped, and
    // _adapter.UnsubscribeAsync is fired for each successful subscription id.

    // RunContinuationsAsynchronously: subscribeStarted is completed from inside the
    // adapter mock callback and releaseSubscribe from the test body. Without the flag
    // an awaiting continuation may run inline on the completing thread.
    var subscribeStarted = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var releaseSubscribe = new TaskCompletionSource<string>(TaskCreationOptions.RunContinuationsAsynchronously);

    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync("stale/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            subscribeStarted.TrySetResult();
            return releaseSubscribe.Task;
        });
    _mockAdapter.UnsubscribeAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl021-mid-flight");
    await Task.Delay(300);

    // Subscribe instance A — block the adapter call so unsubscribe arrives first.
    actor.Tell(new SubscribeTagsRequest(
        "c1", "instA", "dcl021-mid-flight", ["stale/tag"], DateTimeOffset.UtcNow));
    await subscribeStarted.Task.WaitAsync(TimeSpan.FromSeconds(5));

    // Unsubscribe instance A while the subscribe I/O is still parked.
    actor.Tell(new UnsubscribeTagsRequest("unsub-c1", "instA", "dcl021-mid-flight", DateTimeOffset.UtcNow));
    await Task.Delay(100);

    // Release the subscribe — SubscribeCompleted is now stale.
    releaseSubscribe.SetResult("sub-orphan");

    // Give the stale completion time to be processed. The fix may or may not send
    // a SubscribeTagsResponse (instance is gone); either outcome is acceptable,
    // but the adapter UnsubscribeAsync call must have fired.
    await Task.Delay(500);

    await _mockAdapter.Received(1).UnsubscribeAsync(
        Arg.Is<string>(s => s == "sub-orphan"), Arg.Any<CancellationToken>());

    actor.Tell(new DataConnectionActor.GetHealthReport());
    // Fish rather than ExpectMsg: any reply queued ahead of the report (for example
    // a late SubscribeTagsResponse) is skipped instead of failing the type check,
    // which is what "allow either outcome" above requires.
    var report = FishForMessage<DataConnectionHealthReport>(
        _ => true,
        TimeSpan.FromSeconds(5),
        "health report after mid-flight unsubscribe");
    // Total / resolved must reflect the post-unsubscribe state: 0 tags subscribed.
    Assert.Equal(0, report.TotalSubscribedTags);
    Assert.Equal(0, report.ResolvedTags);
}
|
||||
|
||||
// ── DataConnectionLayer-022: tag-resolution retry timer must not reset on every failure ──
|
||||
|
||||
[Fact]
public async Task DCL022_BurstedFailedSubscribes_DoNotResetRetryTimer()
{
    // Regression test for DataConnectionLayer-022. Both HandleSubscribeCompleted
    // and HandleTagResolutionFailed previously called Timers.StartPeriodicTimer
    // unconditionally — StartPeriodicTimer with an existing key cancels and
    // replaces the prior timer, so a burst of SubscribeTagsRequests arriving
    // faster than TagResolutionRetryInterval would re-arm the timer every time
    // and starve the retry indefinitely. After the fix, IsTimerActive gates
    // the StartPeriodicTimer call so the first failure starts the timer and
    // subsequent failures just append to _unresolvedTags.
    const int burstSize = 5;
    _options.TagResolutionRetryInterval = TimeSpan.FromMilliseconds(300);

    var subscribeCalls = 0;
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    // All subscribes fail at the resolution layer (KeyNotFoundException is a
    // non-connection failure → marks the tag unresolved and starts the timer).
    _mockAdapter.SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            Interlocked.Increment(ref subscribeCalls);
            return Task.FromException<string>(new KeyNotFoundException("not found"));
        });
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl022-retry-gate");
    await Task.Delay(300); // reach Connected

    // Fire 5 SubscribeTagsRequests with distinct tags, each ~50ms apart so the
    // total burst (~250ms) is under the 300ms retry interval. Pre-fix,
    // every failure called StartPeriodicTimer — the 5th call would cancel the
    // running timer and re-arm a fresh 300ms wait, pushing the first retry to
    // ~550ms after the first failure. Post-fix: timer starts once, fires at
    // ~300ms after the first failure regardless of subsequent failures.
    //
    // Elapsed time is taken from a Stopwatch (monotonic) rather than by
    // subtracting wall-clock timestamps, so a system clock adjustment cannot
    // distort the wait budget computed below.
    var sinceFirstRequest = System.Diagnostics.Stopwatch.StartNew();
    for (var i = 0; i < burstSize; i++)
    {
        actor.Tell(new SubscribeTagsRequest(
            $"c{i}", $"inst{i}", "dcl022-retry-gate", [$"burst/tag{i}"], DateTimeOffset.UtcNow));
        await Task.Delay(50);
    }

    // Drain the 5 ack messages and the 5 bad-quality TagValueUpdates so we
    // don't accidentally compare them against the retry-induced subscribe count.
    for (var i = 0; i < burstSize; i++)
    {
        // Order: bad-quality TagValueUpdate fires first, then SubscribeTagsResponse.
        ExpectMsg<TagValueUpdate>(TimeSpan.FromSeconds(5));
        ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));
    }

    // Every request in the burst has been acknowledged, so at least burstSize
    // adapter calls have happened. This is a lower bound, not an equality: the
    // burst plus drain can take close to the 300ms retry interval, so with the
    // fix in place the retry may already have fired by the time this is read.
    var initialFailureCalls = Volatile.Read(ref subscribeCalls);
    Assert.True(initialFailureCalls >= burstSize,
        $"Expected at least {burstSize} adapter subscribe calls after the burst, saw {initialFailureCalls}.");

    // Wait until 450ms after the first request. If the timer was reset by
    // each subsequent failure, no retry has fired yet and subscribeCalls is
    // still 5. With the gate, the timer has fired and re-attempted the
    // unresolved tags, so the count exceeds the burst size.
    var waitForRetryFire = TimeSpan.FromMilliseconds(450) - sinceFirstRequest.Elapsed;
    if (waitForRetryFire > TimeSpan.Zero)
    {
        await Task.Delay(waitForRetryFire);
    }

    // Compare against the fixed burst size rather than initialFailureCalls, which
    // may already include retry-induced calls (see the lower-bound note above).
    var afterFirstInterval = Volatile.Read(ref subscribeCalls);
    Assert.True(afterFirstInterval > burstSize,
        $"Retry timer should have fired within ~300ms of the first failure, " +
        $"but subscribeCalls stayed at {afterFirstInterval} (initial: {initialFailureCalls}). " +
        $"This is the DCL-022 reset-on-every-call starvation regression.");
}
|
||||
|
||||
// ── DataConnectionLayer-011: stale callbacks from a disposed adapter must be dropped ──
|
||||
|
||||
[Fact]
|
||||
|
||||
Reference in New Issue
Block a user