fix(data-connection): resolve DataConnectionLayer-001 — off-thread actor state mutation

HandleSubscribe spawned a Task.Run that mutated DataConnectionActor private
state (_subscriptionIds, _subscriptionsByInstance, _totalSubscribed,
_resolvedTags, _unresolvedTags) from a thread-pool thread, racing the actor's
own message loop — a data race on non-thread-safe Dictionary/HashSet and
non-atomic counters.

Restructured HandleSubscribe to follow the actor's existing PipeTo(Self)
pattern: the background task now performs only adapter I/O and pipes a
SubscribeCompleted message to Self; all subscription-state mutation happens
in the new HandleSubscribeCompleted handler on the actor thread (wired into
the Connected, Connecting and Reconnecting states).

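Adds DCL001_ConcurrentSubscribes_DoNotCorruptSubscriptionCounters (30 concurrent
subscribe requests of 30 distinct tags each), which fails against the pre-fix
code and passes after, plus
DCL001_SubscribeWithFailedTags_CountsResolvedAndUnresolvedSeparately, a
behavioural guard that failed tags still count toward TotalSubscribedTags but
not ResolvedTags.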
This commit is contained in:
Joseph Doherty
2026-05-16 18:26:43 -04:00
parent 977d7369a7
commit 239bee3bc4
4 changed files with 213 additions and 54 deletions

View File

@@ -458,4 +458,87 @@ public class DataConnectionActorTests : TestKit
await backupAdapter.Received().SubscribeAsync(
"sensor/temp", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>());
}
// ── DataConnectionLayer-001: subscribe must not mutate actor state off-thread ──
/// <summary>
/// Stand-in for an adapter subscribe call: waits one millisecond, then returns
/// a unique subscription id of the form <c>sub-</c> followed by a GUID in "N" format.
/// </summary>
/// <returns>A freshly generated subscription id.</returns>
private static async Task<string> DelayedSubscribeAsync()
{
    // The brief pause lets concurrent subscribe background tasks pile up, so that
    // their post-await state mutations would race under the pre-fix implementation.
    await Task.Delay(TimeSpan.FromMilliseconds(1));

    var uniqueSuffix = Guid.NewGuid().ToString("N");
    return $"sub-{uniqueSuffix}";
}
[Fact]
public async Task DCL001_ConcurrentSubscribes_DoNotCorruptSubscriptionCounters()
{
    // Regression test for DataConnectionLayer-001. Before the fix, HandleSubscribe
    // mutated actor state (_subscriptionIds, _totalSubscribed, _resolvedTags, the
    // per-instance HashSet) from a Task.Run background thread, so many concurrent
    // subscribes raced on non-thread-safe Dictionary/HashSet and on non-atomic int++,
    // losing increments or throwing. With the fix, every mutation is applied on the
    // actor thread via a SubscribeCompleted message, so the final counts are exact.
    const string connectionName = "dcl001-concurrent";
    const int instanceCount = 30;
    const int tagsPerInstance = 30;
    const int expectedTagTotal = instanceCount * tagsPerInstance;

    // Builds the distinct tag paths "inst{n}/tag0" .. "inst{n}/tag{tagsPerInstance-1}".
    static string[] BuildTagPaths(int instanceIndex)
    {
        var paths = new string[tagsPerInstance];
        for (var tagIndex = 0; tagIndex < paths.Length; tagIndex++)
        {
            paths[tagIndex] = $"inst{instanceIndex}/tag{tagIndex}";
        }

        return paths;
    }

    // Arrange: adapter connects immediately, reports Connected, and answers each
    // subscribe with a slightly delayed unique id.
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ => DelayedSubscribeAsync());
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor(connectionName);
    await Task.Delay(300); // reach Connected state

    // Act: fire all subscribe requests back-to-back without waiting for replies.
    foreach (var instanceIndex in Enumerable.Range(0, instanceCount))
    {
        var request = new SubscribeTagsRequest(
            $"corr{instanceIndex}",
            $"inst{instanceIndex}",
            connectionName,
            BuildTagPaths(instanceIndex),
            DateTimeOffset.UtcNow);
        actor.Tell(request);
    }

    // Every subscribe must be acknowledged.
    var acknowledgementTimeout = TimeSpan.FromSeconds(15);
    for (var acknowledged = 0; acknowledged < instanceCount; acknowledged++)
    {
        ExpectMsg<SubscribeTagsResponse>(acknowledgementTimeout);
    }

    actor.Tell(new DataConnectionActor.GetHealthReport());
    var healthReport = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(5));

    // Assert: every tag is distinct, so each is a fresh, resolved subscription.
    Assert.Equal(expectedTagTotal, healthReport.TotalSubscribedTags);
    Assert.Equal(expectedTagTotal, healthReport.ResolvedTags);
}
[Fact]
public async Task DCL001_SubscribeWithFailedTags_CountsResolvedAndUnresolvedSeparately()
{
    // Behavioural guard: the restructured subscribe must preserve the original
    // accounting — failed tags count toward TotalSubscribed but not ResolvedTags.

    // Arrange: the adapter connects immediately and reports Connected. Subscribing
    // to any tag whose path starts with "bad" faults; every other tag succeeds.
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        // Ordinal comparison: tag paths are identifiers, so the good/bad split must
        // not depend on the culture of the machine running the test.
        .Returns(ci => ((string)ci[0]).StartsWith("bad", StringComparison.Ordinal)
            ? Task.FromException<string>(new Exception("tag not found"))
            : Task.FromResult("sub-ok"));
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl001-failed-tags");
    await Task.Delay(300); // give the actor time to reach the Connected state

    // Act: one request carrying three tags that succeed and two that fail.
    actor.Tell(new SubscribeTagsRequest(
        "c1", "inst1", "dcl001-failed-tags",
        ["good/a", "good/b", "good/c", "bad/x", "bad/y"], DateTimeOffset.UtcNow));
    ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));

    actor.Tell(new DataConnectionActor.GetHealthReport());
    var report = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(3));

    // Assert
    Assert.Equal(5, report.TotalSubscribedTags); // all 5 tags tracked
    Assert.Equal(3, report.ResolvedTags); // only the 3 good ones resolved
}
}