fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR

CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL
IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the
at-least-once livelock on the site→central notification handoff.

DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent
same-tag subscribes don't orphan an adapter handle; delete the latent
dead _subscriptionHandles dictionary; stop double-counting
_totalSubscribed when an unresolved tag is promoted via another instance;
release adapter handles on mid-flight unsubscribe; gate the
tag-resolution retry timer with IsTimerActive so subscribe bursts don't
reset it into starvation.

SR-020: add _terminatingActorsByName shadow so a third deploy arriving
during a pending redeploy doesn't crash on InvalidActorNameException —
displaced senders get a Failed/superseded response and the latest
command wins on Terminated.

SR-024: split OperationTrackingStore reads from writes (fresh
SqliteConnection per GetStatusAsync) so long writes don't block status
queries; rewrite Dispose to drop the sync-over-async bridge that could
deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the
dispose-once flag race-safe across both paths.
This commit is contained in:
Joseph Doherty
2026-05-28 05:20:13 -04:00
parent 5d2386cc9d
commit f936f55f51
15 changed files with 1152 additions and 170 deletions
@@ -39,6 +39,42 @@ public class RealOpcUaClientThreadSafetyTests
}
}
/// <summary>
/// Regression guard for DataConnectionLayer-019.
/// <para>
/// <see cref="OpcUaDataConnection"/> used to carry a
/// <c>Dictionary&lt;string,string&gt; _subscriptionHandles</c> field that was
/// written to and removed from across thread-pool continuations, yet never read.
/// Unsynchronised writes to a plain Dictionary from concurrent post-await
/// continuations are racy, so the field was a latent bug for any future reader.
/// </para>
/// <para>
/// The field was deleted outright instead of being turned into a
/// ConcurrentDictionary, because the bookkeeping already lives in
/// <c>RealOpcUaClient._monitoredItems/_callbacks</c> and
/// <c>DataConnectionActor._subscriptionIds</c>. Anyone who re-introduces a
/// non-concurrent shared dictionary on the adapter has to justify it explicitly,
/// because this test will fail.
/// </para>
/// </summary>
public class OpcUaDataConnectionThreadSafetyTests
{
    /// <summary>
    /// Fails when reflection reports any instance field on
    /// <see cref="OpcUaDataConnection"/> whose type is a constructed
    /// <c>System.Collections.Generic.Dictionary&lt;,&gt;</c>.
    /// </summary>
    [Fact]
    public void DCL019_OpcUaDataConnection_HasNoNonConcurrentSharedDictionary()
    {
        // Look at instance fields of every visibility, not just public ones.
        const System.Reflection.BindingFlags instanceFieldFlags =
            System.Reflection.BindingFlags.Instance |
            System.Reflection.BindingFlags.NonPublic |
            System.Reflection.BindingFlags.Public;

        // A Dictionary<,> field on the adapter is a regression in either case:
        // if it is dead state it should be deleted, and if it is live state
        // mutated from continuations it should be a ConcurrentDictionary.
        var offendingFieldNames = new List<string>();
        foreach (var field in typeof(OpcUaDataConnection).GetFields(instanceFieldFlags))
        {
            var declaredType = field.FieldType;
            if (!declaredType.IsGenericType)
            {
                continue;
            }

            if (declaredType.GetGenericTypeDefinition() == typeof(Dictionary<,>))
            {
                offendingFieldNames.Add(field.Name);
            }
        }

        // Name the offending fields in the failure message so the culprit is obvious.
        var offenders = string.Join(", ", offendingFieldNames);
        Assert.True(offendingFieldNames.Count == 0,
            $"OpcUaDataConnection must not hold a non-concurrent Dictionary<,> field; " +
            $"found: {offenders}. See DCL-019.");
    }
}
/// <summary>
/// DataConnectionLayer-012: secure-by-default certificate handling.
/// </summary>