fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR

CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL
IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the
at-least-once livelock on the site→central notification handoff.

DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent
same-tag subscribes don't orphan an adapter handle; delete the latent
dead _subscriptionHandles dictionary; stop double-counting
_totalSubscribed when an unresolved tag is promoted via another instance;
release adapter handles on mid-flight unsubscribe; gate the
tag-resolution retry timer with IsTimerActive so subscribe bursts don't
reset it into starvation.

SR-020: add _terminatingActorsByName shadow so a third deploy arriving
during a pending redeploy doesn't crash on InvalidActorNameException —
displaced senders get a Failed/superseded response and the latest
command wins on Terminated.

SR-024: split OperationTrackingStore reads from writes (fresh
SqliteConnection per GetStatusAsync) so long writes don't block status
queries; rewrite Dispose to drop the sync-over-async bridge that could
deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the
dispose-once flag race-safe across both paths.
This commit is contained in:
Joseph Doherty
2026-05-28 05:20:13 -04:00
parent 5d2386cc9d
commit f936f55f51
15 changed files with 1152 additions and 170 deletions
@@ -0,0 +1,132 @@
using Microsoft.EntityFrameworkCore;
using ScadaLink.Commons.Entities.Notifications;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.ConfigurationDatabase;
using ScadaLink.ConfigurationDatabase.Repositories;
using ScadaLink.ConfigurationDatabase.Tests.Migrations;
using Xunit;
namespace ScadaLink.ConfigurationDatabase.Tests.Repositories;
/// <summary>
/// CD-015 race-fix integration tests for
/// <see cref="NotificationOutboxRepository.InsertIfNotExistsAsync"/>. The method
/// is raw-SQL (<c>IF NOT EXISTS … INSERT</c>) matching the AuditLog and SiteCalls
/// idempotent-insert pattern; it must execute against a real SQL Server schema,
/// so this class uses <see cref="MsSqlMigrationFixture"/> rather than the SQLite
/// in-memory provider used by <see cref="RepositoryCoverageTests"/>.
/// </summary>
/// <remarks>
/// Every test is skipped (not failed) when the fixture reports that SQL Server
/// is unavailable.
/// </remarks>
public class NotificationOutboxRepositoryIntegrationTests : IClassFixture<MsSqlMigrationFixture>
{
    private readonly MsSqlMigrationFixture _fixture;

    public NotificationOutboxRepositoryIntegrationTests(MsSqlMigrationFixture fixture)
    {
        _fixture = fixture;
    }

    /// <summary>A never-seen id is inserted, reported as inserted, and readable afterwards.</summary>
    [SkippableFact]
    public async Task InsertIfNotExistsAsync_FreshId_InsertsAndReturnsTrue()
    {
        Skip.IfNot(_fixture.Available, _fixture.SkipReason);
        var id = Guid.NewGuid().ToString();

        await using var context = CreateContext();
        var repo = new NotificationOutboxRepository(context);
        var inserted = await repo.InsertIfNotExistsAsync(MakeNotification(id));
        Assert.True(inserted);

        // Read back through a separate context so the assertion hits the database.
        await using var readContext = CreateContext();
        var loaded = await readContext.Notifications.FindAsync(id);
        Assert.NotNull(loaded);
        Assert.Equal("Subject", loaded!.Subject);
    }

    /// <summary>A second insert with the same id reports false and does not overwrite the first row.</summary>
    [SkippableFact]
    public async Task InsertIfNotExistsAsync_DuplicateId_ReturnsFalseAndLeavesExistingRow()
    {
        Skip.IfNot(_fixture.Available, _fixture.SkipReason);
        var id = Guid.NewGuid().ToString();

        await using (var context = CreateContext())
        {
            var repo = new NotificationOutboxRepository(context);
            await repo.InsertIfNotExistsAsync(MakeNotification(id, subject: "Original"));
        }

        await using (var context = CreateContext())
        {
            var repo = new NotificationOutboxRepository(context);
            var inserted = await repo.InsertIfNotExistsAsync(MakeNotification(id, subject: "Changed"));
            Assert.False(inserted);
        }

        await using var readContext = CreateContext();
        var loaded = await readContext.Notifications.FindAsync(id);
        Assert.NotNull(loaded);
        Assert.Equal("Original", loaded!.Subject);
    }

    /// <summary>
    /// Fifty concurrent inserts of one id must leave exactly one row, throw nothing,
    /// and report exactly one successful insert.
    /// </summary>
    [SkippableFact]
    public async Task InsertIfNotExistsAsync_ConcurrentInserts_SameId_OnlyOneRow()
    {
        Skip.IfNot(_fixture.Available, _fixture.SkipReason);

        // CD-015 race coverage. The IF NOT EXISTS … INSERT pattern has a
        // check-then-act window: two concurrent sessions can both pass the
        // EXISTS check and both attempt the INSERT — the loser surfaces as a
        // SqlException with Number 2601/2627. The site→central handoff is
        // documented at-least-once with insert-if-not-exists, so this collision
        // IS the expected contention mode. The race losers MUST be swallowed
        // (not bubbled) so the site doesn't retry the same NotificationId
        // forever. Final row count must be exactly 1; no exceptions thrown.
        var id = Guid.NewGuid().ToString();
        var reportedInserts = 0;

        await Parallel.ForEachAsync(
            Enumerable.Range(0, 50),
            new ParallelOptions { MaxDegreeOfParallelism = 50 },
            async (_, ct) =>
            {
                // One context per racer: each gets its own connection/session.
                await using var context = CreateContext();
                var repo = new NotificationOutboxRepository(context);
                if (await repo.InsertIfNotExistsAsync(MakeNotification(id), ct))
                {
                    Interlocked.Increment(ref reportedInserts);
                }
            });

        await using var readContext = CreateContext();
        var count = await readContext.Notifications
            .Where(n => n.NotificationId == id)
            .CountAsync();
        Assert.Equal(1, count);

        // The return value is the caller-visible half of the contract: only the
        // racer whose INSERT landed may report true; every loser must report false.
        Assert.Equal(1, Volatile.Read(ref reportedInserts));
    }

    // --- helpers ------------------------------------------------------------

    /// <summary>Builds a fresh context bound to the fixture's SQL Server connection string.</summary>
    private ScadaLinkDbContext CreateContext()
    {
        var options = new DbContextOptionsBuilder<ScadaLinkDbContext>()
            .UseSqlServer(_fixture.ConnectionString)
            .Options;
        return new ScadaLinkDbContext(options);
    }

    /// <summary>
    /// Creates an email notification with fixed timestamps so rows differ only by
    /// <paramref name="id"/>, <paramref name="status"/> and <paramref name="subject"/>.
    /// </summary>
    private static Notification MakeNotification(
        string id,
        NotificationStatus status = NotificationStatus.Pending,
        string subject = "Subject")
    {
        return new Notification(
            id,
            NotificationType.Email,
            "Ops List",
            subject,
            "Body",
            "site-cd015")
        {
            Status = status,
            CreatedAt = new DateTimeOffset(2026, 5, 20, 10, 0, 0, TimeSpan.Zero),
            SiteEnqueuedAt = new DateTimeOffset(2026, 5, 20, 9, 59, 0, TimeSpan.Zero),
        };
    }
}
@@ -419,32 +419,10 @@ public class NotificationOutboxRepositoryTests : IDisposable
};
}
[Fact]
public async Task InsertIfNotExistsAsync_NewRow_InsertsAndReturnsTrue()
{
    // Arrange: a notification whose id cannot already be present.
    var notificationId = Guid.NewGuid().ToString();
    var candidate = MakeNotification(notificationId);

    // Act
    var wasInserted = await _repository.InsertIfNotExistsAsync(candidate);

    // Assert: reported as inserted, and retrievable once the tracker is
    // cleared so the lookup cannot be satisfied by a tracked instance.
    Assert.True(wasInserted);
    _context.ChangeTracker.Clear();
    var persisted = await _context.Notifications.FindAsync(notificationId);
    Assert.NotNull(persisted);
}
[Fact]
public async Task InsertIfNotExistsAsync_DuplicateId_ReturnsFalseAndLeavesExistingRow()
{
    // Seed the row that the second insert will collide with.
    var id = Guid.NewGuid().ToString();
    await _repository.InsertIfNotExistsAsync(MakeNotification(id, subject: "Original"));
    _context.ChangeTracker.Clear();

    // Same id, different subject: must be rejected, not upserted.
    var inserted = await _repository.InsertIfNotExistsAsync(MakeNotification(id, subject: "Changed"));
    Assert.False(inserted);

    _context.ChangeTracker.Clear();
    var loaded = await _context.Notifications.FindAsync(id);

    // Assert presence explicitly so a missing row fails as an assertion rather
    // than as a NullReferenceException on the dereference below.
    Assert.NotNull(loaded);
    Assert.Equal("Original", loaded!.Subject);
}
// InsertIfNotExistsAsync coverage lives in
// tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/NotificationOutboxRepositoryIntegrationTests.cs
// — the method is raw-SQL (IF NOT EXISTS … INSERT) so it must execute against
// SQL Server, not the SQLite in-memory provider this class uses.
[Fact]
public async Task GetDueAsync_ReturnsPendingAndDueRetrying_OrderedByCreatedAt_CappedAtBatchSize()
@@ -730,6 +730,306 @@ public class DataConnectionActorTests : TestKit
subscribeGate.SetCanceled();
}
// ── DataConnectionLayer-018: concurrent same-tag subscribes must not orphan adapter handles ──
[Fact]
public async Task DCL018_ConcurrentSubscribes_SameTag_DifferentInstances_IssueOneAdapterSubscribe()
{
    // Regression test for DataConnectionLayer-018. Before the fix, HandleSubscribe
    // snapshotted _subscriptionIds.Keys on the actor thread BEFORE the Task.Run
    // I/O. Two SubscribeTagsRequest messages for different instances sharing a tag
    // would both observe "not subscribed" (the first request's SubscribeCompleted
    // hadn't yet posted), both call _adapter.SubscribeAsync, and the second
    // subscription id would be silently dropped at the
    // _subscriptionIds.ContainsKey guard in HandleSubscribeCompleted — orphaning
    // the adapter's monitored item permanently. With the _subscribesInFlight
    // guard, the second request observes the tag in flight and treats it as
    // AlreadySubscribed without issuing a second adapter call.

    // RunContinuationsAsynchronously: completing these sources must never run the
    // awaiting side's continuation inline on the thread that calls (Try)SetResult
    // (the mock callback for the first, the test thread for the second).
    var subscribeStartedFirst = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var releaseFirst = new TaskCompletionSource<string>(TaskCreationOptions.RunContinuationsAsynchronously);
    var subscribeCallCount = 0;

    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync("shared/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            var n = Interlocked.Increment(ref subscribeCallCount);
            if (n == 1)
            {
                // Park the first subscribe so the second SubscribeTagsRequest
                // arrives on the actor thread while the first I/O is still in flight.
                subscribeStartedFirst.TrySetResult();
                return releaseFirst.Task;
            }

            // The fix prevents this branch. It completes successfully on purpose;
            // the subscribeCallCount assertion below is what fails if it ever runs.
            return Task.FromResult("sub-unexpected-" + n);
        });
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl018-shared-tag");
    await Task.Delay(300);

    // Request 1 — instance A. Park its adapter call in flight.
    actor.Tell(new SubscribeTagsRequest(
        "c1", "instA", "dcl018-shared-tag", ["shared/tag"], DateTimeOffset.UtcNow));
    await subscribeStartedFirst.Task.WaitAsync(TimeSpan.FromSeconds(5));

    // Request 2 — instance B, same tag, sent while instance A's I/O is still in flight.
    actor.Tell(new SubscribeTagsRequest(
        "c2", "instB", "dcl018-shared-tag", ["shared/tag"], DateTimeOffset.UtcNow));

    // Instance B's ack must come back before we release instance A — that proves
    // instance B's request did NOT issue its own adapter SubscribeAsync (which is
    // blocked) and instead saw the tag as in-flight. FishForMessage discards any
    // other message that arrives first and fails the test with a timeout
    // assertion if no matching ack shows up within the same 5s budget the
    // previous poll loop allowed.
    var bResponse = FishForMessage<SubscribeTagsResponse>(
        response => response.InstanceUniqueName == "instB",
        TimeSpan.FromSeconds(5));
    Assert.True(bResponse.Success);
    Assert.Equal(1, Volatile.Read(ref subscribeCallCount));

    // Release instance A's subscribe so the test cleans up.
    releaseFirst.SetResult("sub-shared");
    ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));

    actor.Tell(new DataConnectionActor.GetHealthReport());
    var report = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(5));

    // One tag shared across two instances must count as exactly one subscription.
    // DCL-020 (#28) tracks the related "previously-unresolved tag promoted via
    // another instance" double-count case; here the tag was never unresolved.
    Assert.Equal(1, report.ResolvedTags);
}
// ── DataConnectionLayer-020: previously-unresolved tag, resolved via different instance, must not double-count ──
[Fact]
public async Task DCL020_UnresolvedTagPromoted_ByDifferentInstance_DoesNotDoubleCountTotalSubscribed()
{
    // DataConnectionLayer-020 regression. Scenario:
    //   1. Instance A subscribes to "promote/tag" and the adapter rejects it at the
    //      resolution layer, so the tag lands in _unresolvedTags and
    //      _totalSubscribed becomes 1.
    //   2. Instance B subscribes to the same tag and the adapter accepts it.
    // Before the fix, HandleSubscribeCompleted's success branch bumped
    // _totalSubscribed unconditionally (→ 2) although only one logical tag exists.
    // After the fix the promotion is detected and only _resolvedTags is incremented.
    var adapterSubscribeAttempts = 0;

    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));
    _mockAdapter.SubscribeAsync("promote/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            // First attempt fails to resolve; every later attempt succeeds.
            var attempt = Interlocked.Increment(ref adapterSubscribeAttempts);
            return attempt == 1
                ? Task.FromException<string>(new KeyNotFoundException("not yet"))
                : Task.FromResult("sub-promote-" + attempt);
        });

    var connectionActor = CreateConnectionActor("dcl020-promote");
    await Task.Delay(300);

    var ackTimeout = TimeSpan.FromSeconds(5);

    // Instance A: resolution failure → bad-quality push followed by the ack.
    connectionActor.Tell(new SubscribeTagsRequest(
        "c1", "instA", "dcl020-promote", ["promote/tag"], DateTimeOffset.UtcNow));
    ExpectMsg<TagValueUpdate>(ackTimeout);
    ExpectMsg<SubscribeTagsResponse>(ackTimeout);

    // Instance B: same tag, adapter now succeeds (second attempt).
    connectionActor.Tell(new SubscribeTagsRequest(
        "c2", "instB", "dcl020-promote", ["promote/tag"], DateTimeOffset.UtcNow));
    ExpectMsg<SubscribeTagsResponse>(ackTimeout);

    connectionActor.Tell(new DataConnectionActor.GetHealthReport());
    var health = ExpectMsg<DataConnectionHealthReport>(ackTimeout);

    // Pre-fix the total was 2 with 1 resolved; post-fix it is one logical tag,
    // one resolved.
    Assert.Equal(1, health.TotalSubscribedTags);
    Assert.Equal(1, health.ResolvedTags);
}
[Fact]
public async Task DCL020_TwoInstancesFailingSameTag_OnlyCountsTagOnceInTotal()
{
    // DataConnectionLayer-020, symmetric failure branch: when two instances both
    // fail to resolve the same tag, _unresolvedTags must keep a single entry and
    // _totalSubscribed must stay at 1. Pre-fix, the failure branch incremented
    // _totalSubscribed on every failure, so the second instance double-counted.
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));
    _mockAdapter.SubscribeAsync("missing/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(Task.FromException<string>(new KeyNotFoundException("not found")));

    var connectionActor = CreateConnectionActor("dcl020-twofail");
    await Task.Delay(300);

    // Sends one subscribe for the unresolvable tag and consumes the two messages
    // it produces: the bad-quality value push, then the ack.
    void SubscribeAndDrain(string correlationId, string instanceName)
    {
        connectionActor.Tell(new SubscribeTagsRequest(
            correlationId, instanceName, "dcl020-twofail", ["missing/tag"], DateTimeOffset.UtcNow));
        ExpectMsg<TagValueUpdate>(TimeSpan.FromSeconds(5));
        ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));
    }

    SubscribeAndDrain("c1", "instA");
    SubscribeAndDrain("c2", "instB");

    connectionActor.Tell(new DataConnectionActor.GetHealthReport());
    var health = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(5));

    Assert.Equal(1, health.TotalSubscribedTags);
    Assert.Equal(0, health.ResolvedTags);
}
// ── DataConnectionLayer-021: mid-flight unsubscribe must release adapter handle and drop state ──
[Fact]
public async Task DCL021_UnsubscribeDuringInFlightSubscribe_ReleasesAdapterHandle_AndKeepsStateClean()
{
    // Regression test for DataConnectionLayer-021. Previously HandleSubscribeCompleted
    // re-created _subscriptionsByInstance[instanceName] when the instance had been
    // unsubscribed while the subscribe I/O was in flight, and then ran the same
    // counter/handle mutations as the happy path. The leak permanently inflated
    // _subscriptionsByInstance, _tagSubscriberCount, and _totalSubscribed (also re-
    // issued by ReSubscribeAll after every reconnect), and orphaned the adapter
    // monitored item. After the fix, the stale completion is logged + dropped, and
    // _adapter.UnsubscribeAsync is fired for each successful subscription id.

    // RunContinuationsAsynchronously keeps the two sides decoupled: signalling
    // "subscribe started" must not resume the test inline inside the mock
    // callback, and releasing the subscribe must not run the adapter-side
    // continuation inline on the test thread.
    var subscribeStarted = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    var releaseSubscribe = new TaskCompletionSource<string>(TaskCreationOptions.RunContinuationsAsynchronously);

    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter.SubscribeAsync("stale/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            subscribeStarted.TrySetResult();
            return releaseSubscribe.Task;
        });
    _mockAdapter.UnsubscribeAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl021-mid-flight");
    await Task.Delay(300);

    // Subscribe instance A — block the adapter call so unsubscribe arrives first.
    actor.Tell(new SubscribeTagsRequest(
        "c1", "instA", "dcl021-mid-flight", ["stale/tag"], DateTimeOffset.UtcNow));
    await subscribeStarted.Task.WaitAsync(TimeSpan.FromSeconds(5));

    // Unsubscribe instance A while the subscribe I/O is still parked.
    actor.Tell(new UnsubscribeTagsRequest("unsub-c1", "instA", "dcl021-mid-flight", DateTimeOffset.UtcNow));
    await Task.Delay(100);

    // Release the subscribe — SubscribeCompleted is now stale.
    releaseSubscribe.SetResult("sub-orphan");

    // Wait for SubscribeTagsResponse OR a quiescence interval. The fix may skip
    // the response (instance is gone); allow either outcome but require the
    // adapter UnsubscribeAsync call to have fired.
    await Task.Delay(500);
    await _mockAdapter.Received(1).UnsubscribeAsync(
        Arg.Is<string>(s => s == "sub-orphan"), Arg.Any<CancellationToken>());

    actor.Tell(new DataConnectionActor.GetHealthReport());
    var report = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(5));

    // Total / resolved must reflect the post-unsubscribe state: 0 tags subscribed.
    Assert.Equal(0, report.TotalSubscribedTags);
    Assert.Equal(0, report.ResolvedTags);
}
// ── DataConnectionLayer-022: tag-resolution retry timer must not reset on every failure ──
[Fact]
public async Task DCL022_BurstedFailedSubscribes_DoNotResetRetryTimer()
{
    // Regression test for DataConnectionLayer-022. Both HandleSubscribeCompleted
    // and HandleTagResolutionFailed previously called Timers.StartPeriodicTimer
    // unconditionally — StartPeriodicTimer with an existing key cancels and
    // replaces the prior timer, so a burst of SubscribeTagsRequests arriving
    // faster than TagResolutionRetryInterval would re-arm the timer every time
    // and starve the retry indefinitely. After the fix, IsTimerActive gates
    // the StartPeriodicTimer call so the first failure starts the timer and
    // subsequent failures just append to _unresolvedTags.
    _options.TagResolutionRetryInterval = TimeSpan.FromMilliseconds(300);

    var subscribeCalls = 0;
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);

    // All subscribes fail at the resolution layer (KeyNotFoundException is a
    // non-connection failure → marks the tag unresolved and starts the timer).
    _mockAdapter.SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(_ =>
        {
            Interlocked.Increment(ref subscribeCalls);
            return Task.FromException<string>(new KeyNotFoundException("not found"));
        });
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl022-retry-gate");
    await Task.Delay(300); // reach Connected

    // Fire 5 SubscribeTagsRequests with distinct tags, each ~50ms apart so the
    // total burst (~250ms) is well under the 300ms retry interval. Pre-fix,
    // every failure called StartPeriodicTimer — the 5th call would cancel the
    // running timer and re-arm a fresh 300ms wait, pushing the first retry to
    // ~550ms after the first failure. Post-fix: timer starts once, fires at
    // ~300ms after the first failure regardless of subsequent failures.
    //
    // Elapsed time is taken from a Stopwatch rather than by subtracting
    // wall-clock timestamps: this test lives on a ~150ms margin, and the system
    // clock is not guaranteed to be monotonic.
    var sinceBurstStart = System.Diagnostics.Stopwatch.StartNew();
    for (var i = 0; i < 5; i++)
    {
        actor.Tell(new SubscribeTagsRequest(
            $"c{i}", $"inst{i}", "dcl022-retry-gate", [$"burst/tag{i}"], DateTimeOffset.UtcNow));
        await Task.Delay(50);
    }

    // Drain the 5 ack messages and the 5 bad-quality TagValueUpdates so we
    // don't accidentally compare them against the retry-induced subscribe count.
    for (var i = 0; i < 5; i++)
    {
        // Order: bad-quality TagValueUpdate fires first, then SubscribeTagsResponse.
        ExpectMsg<TagValueUpdate>(TimeSpan.FromSeconds(5));
        ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));
    }

    var initialFailureCalls = Volatile.Read(ref subscribeCalls);
    Assert.Equal(5, initialFailureCalls);

    // Wait until 450ms after the burst began — 1.5 retry intervals, i.e. past the
    // gated timer's ~300ms firing point but before the ~550ms a reset-on-every-
    // failure timer would need. If the timer was reset by each subsequent
    // failure, no retry has fired yet and subscribeCalls is still 5. With the
    // gate, the timer has fired and re-attempted every unresolved tag.
    var waitForRetryFire = TimeSpan.FromMilliseconds(450) - sinceBurstStart.Elapsed;
    if (waitForRetryFire > TimeSpan.Zero)
    {
        await Task.Delay(waitForRetryFire);
    }

    var afterFirstInterval = Volatile.Read(ref subscribeCalls);
    Assert.True(afterFirstInterval > initialFailureCalls,
        $"Retry timer should have fired within ~300ms of the first failure, " +
        $"but subscribeCalls stayed at {afterFirstInterval} (initial: {initialFailureCalls}). " +
        $"This is the DCL-022 reset-on-every-call starvation regression.");
}
// ── DataConnectionLayer-011: stale callbacks from a disposed adapter must be dropped ──
[Fact]
@@ -39,6 +39,42 @@ public class RealOpcUaClientThreadSafetyTests
}
}
/// <summary>
/// DataConnectionLayer-019 guard. <see cref="OpcUaDataConnection"/> used to carry a
/// dead <c>Dictionary&lt;string,string&gt; _subscriptionHandles</c> field: it was
/// written and removed from thread-pool continuations but never read. Unsynchronised
/// Dictionary writes from concurrent post-await continuations are racy, so the field
/// was a latent bug for any future reader. The fix removed the field instead of
/// switching it to ConcurrentDictionary, because the bookkeeping already lives in
/// <c>RealOpcUaClient._monitoredItems/_callbacks</c> and
/// <c>DataConnectionActor._subscriptionIds</c>. The test below fails if anyone
/// re-introduces a non-concurrent shared dictionary on the adapter, forcing an
/// explicit justification.
/// </summary>
public class OpcUaDataConnectionThreadSafetyTests
{
    [Fact]
    public void DCL019_OpcUaDataConnection_HasNoNonConcurrentSharedDictionary()
    {
        // Inspect every instance field on the adapter, public or not. A field typed
        // as System.Collections.Generic.Dictionary<,> is a regression either way:
        // dead state should be removed, and live state mutated from continuations
        // should be a ConcurrentDictionary.
        const System.Reflection.BindingFlags everyInstanceField =
            System.Reflection.BindingFlags.Instance |
            System.Reflection.BindingFlags.NonPublic |
            System.Reflection.BindingFlags.Public;

        var offendingFieldNames = new List<string>();
        foreach (var field in typeof(OpcUaDataConnection).GetFields(everyInstanceField))
        {
            var declaredType = field.FieldType;
            if (!declaredType.IsGenericType)
            {
                continue;
            }

            if (declaredType.GetGenericTypeDefinition() == typeof(Dictionary<,>))
            {
                offendingFieldNames.Add(field.Name);
            }
        }

        Assert.True(offendingFieldNames.Count == 0,
            $"OpcUaDataConnection must not hold a non-concurrent Dictionary<,> field; " +
            $"found: {string.Join(", ", offendingFieldNames)}. See DCL-019.");
    }
}
/// <summary>
/// DataConnectionLayer-012: secure-by-default certificate handling.
/// </summary>
@@ -125,6 +125,65 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable
Assert.True(disable.Success);
}
[Fact]
public async Task SR020_ThreeRapidDeploys_DoNotThrowInvalidActorNameException_LatestWins()
{
    // SiteRuntime-020 regression. Pending redeploys used to be tracked only by
    // IActorRef (_pendingRedeploys) with no name-keyed shadow. A third
    // DeployInstanceCommand arriving WHILE the first redeploy's predecessor was
    // still terminating therefore saw _instanceActors.TryGetValue==false, fell
    // through ApplyDeployment → CreateInstanceActor → Context.ActorOf, and hit
    // InvalidActorNameException because the child name stays registered until
    // Terminated fires. The supervisor's Stop directive then dropped the deploy
    // silently: the deployer waited forever and the persistence Task.Run was
    // left dangling. With the fix, _terminatingActorsByName tracks the in-flight
    // terminator by name; the third deploy replaces the buffered pending command
    // (last-write-wins) and the displaced sender is told it was superseded.
    const string instanceName = "RapidPump";

    // Builds a deploy command for the instance under test; only id and hash vary.
    DeployInstanceCommand DeployCommand(string deploymentId, string revisionHash) =>
        new(deploymentId, instanceName, revisionHash, MakeConfigJson(instanceName), "admin", DateTimeOffset.UtcNow);

    var manager = CreateDeploymentManager();
    await Task.Delay(500);

    // Initial deploy — establishes the running instance.
    manager.Tell(DeployCommand("dep-1", "h1"));
    var initial = ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(5));
    Assert.Equal(DeploymentStatus.Success, initial.Status);
    await Task.Delay(200);

    // Two back-to-back redeploys, each with its own probe as sender, issued
    // before the predecessor can finish terminating. dep-2 stops the actor
    // (watching it) and buffers itself; dep-3 lands almost immediately and must
    // NOT crash — it overwrites the buffered command and displaces dep-2.
    var displacedSender = CreateTestProbe();
    var latestSender = CreateTestProbe();
    manager.Tell(DeployCommand("dep-2", "h2"), displacedSender.Ref);
    manager.Tell(DeployCommand("dep-3", "h3"), latestSender.Ref);

    // dep-2: told it lost, with a Failed status and a "superseded" message.
    var displaced = displacedSender.ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(10));
    Assert.Equal("dep-2", displaced.DeploymentId);
    Assert.Equal(DeploymentStatus.Failed, displaced.Status);
    Assert.NotNull(displaced.ErrorMessage);
    Assert.Contains("superseded", displaced.ErrorMessage!, StringComparison.OrdinalIgnoreCase);

    // dep-3: succeeds once the predecessor has finished terminating.
    var latest = latestSender.ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(10));
    Assert.Equal("dep-3", latest.DeploymentId);
    Assert.Equal(DeploymentStatus.Success, latest.Status);

    // The instance must still respond to lifecycle commands — proves there is
    // no orphaned actor and no half-created child holding the name.
    manager.Tell(new DisableInstanceCommand("cmd-1", instanceName, DateTimeOffset.UtcNow));
    var disabled = ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5));
    Assert.True(disabled.Success);
}
[Fact]
public async Task Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances()
{
@@ -438,4 +438,115 @@ public class OperationTrackingStoreTests
Assert.NotNull(await store.GetStatusAsync(bId)); // kept (recent terminal)
Assert.NotNull(await store.GetStatusAsync(cId)); // kept (non-terminal)
}
// ── SiteRuntime-024: read/write split + sync-safe Dispose ──────────────
[Fact]
public async Task SR024_ConcurrentReads_DoNotBlockOnInFlightWrite()
{
    // SiteRuntime-024 regression (perf half). Before the fix every
    // GetStatusAsync took the same _gate as RecordTerminalAsync, so one slow
    // write queued every concurrent status query behind it. After the fix a
    // read opens its own SqliteConnection and never touches the write gate, so
    // reads should proceed alongside a write.
    //
    // Shape of the test: seed one row, then start a batch of reads and a write
    // together and require all of them to finish with a non-null status. A
    // regression would hang the runner or run far slower than the gate allowed.
    // No timing comparison is asserted because it would be flaky in CI; only
    // correctness and forward progress are checked.
    var (store, _) = CreateStore(nameof(SR024_ConcurrentReads_DoNotBlockOnInFlightWrite));
    await using (store)
    {
        var operationId = TrackedOperationId.New();
        await store.RecordEnqueueAsync(
            operationId,
            kind: "ApiCallCached",
            targetSummary: "ERP.GetOrder",
            sourceInstanceId: null,
            sourceScript: null,
            sourceNode: "node-a");

        // Start 10 reads, then one write, before awaiting any of them.
        var pendingReads = Enumerable.Range(0, 10)
            .Select(_ => store.GetStatusAsync(operationId))
            .ToList();
        var pendingWrite = store.RecordAttemptAsync(
            operationId, status: "Retrying", retryCount: 1, lastError: "transient", httpStatus: 503);

        var statuses = await Task.WhenAll(pendingReads);
        await pendingWrite;

        // Every read must have found the seeded row.
        foreach (var status in statuses)
        {
            Assert.NotNull(status);
        }
    }
}
[Fact]
public async Task SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread()
{
    // Regression test for SiteRuntime-024 (deadlock half). Pre-fix, Dispose
    // bridged to async via DisposeAsyncCore().AsTask().GetAwaiter().GetResult()
    // — sync-over-async on a SemaphoreSlim can deadlock under a non-reentrant
    // SyncContext (host shutdown continuations). Post-fix, Dispose runs
    // synchronously without acquiring the gate.
    var (store, _) = CreateStore(nameof(SR024_SyncDispose_DoesNotDeadlock_WhenInvokedFromFreshThread));

    // Seed a row so the store has live state when disposed.
    await store.RecordEnqueueAsync(
        TrackedOperationId.New(),
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // RunContinuationsAsynchronously: SetResult/SetException are called on the
    // dispose thread, which must not end up running the rest of this test inline.
    var disposeReturned = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

    // Dedicated background thread: a hung Dispose then cannot block the test
    // thread, and cannot keep the process alive after the run.
    var disposeThread = new Thread(() =>
    {
        try
        {
            store.Dispose();
            disposeReturned.SetResult(true);
        }
        catch (Exception ex)
        {
            disposeReturned.SetException(ex);
        }
    }) { IsBackground = true };
    disposeThread.Start();

    // 5s ceiling — if Dispose deadlocks, WaitAsync throws TimeoutException and the
    // test fails; if Dispose threw, that exception is rethrown here instead.
    Assert.True(await disposeReturned.Task.WaitAsync(TimeSpan.FromSeconds(5)));
}
[Fact]
public async Task SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent()
{
    // Host shutdown commonly disposes asynchronously and then calls Dispose()
    // again; the async path has to tolerate that. The _disposeState exchange is
    // expected to turn every call after the first into a no-op.
    var (trackingStore, _) = CreateStore(nameof(SR024_AsyncDispose_DoesNotDeadlock_AndIsIdempotent));

    // Give the store some live state before tearing it down.
    var seededId = TrackedOperationId.New();
    await trackingStore.RecordEnqueueAsync(
        seededId,
        kind: "ApiCallCached",
        targetSummary: "ERP.GetOrder",
        sourceInstanceId: null,
        sourceScript: null,
        sourceNode: "node-a");

    // 1st: the real async disposal.
    await trackingStore.DisposeAsync();

    // 2nd: sync disposal afterwards — must return quietly rather than throw.
    trackingStore.Dispose();

    // 3rd: async once more — likewise a no-op.
    await trackingStore.DisposeAsync();
}
}