fix(data-connection-layer): resolve DataConnectionLayer-002/003/004/005 — Resume supervision, concurrent dicts, subscribe-failure classification, write timeout

This commit is contained in:
Joseph Doherty
2026-05-16 19:40:40 -04:00
parent d7630d80fe
commit fccd3274d3
7 changed files with 350 additions and 25 deletions

View File

@@ -512,6 +512,106 @@ public class DataConnectionActorTests : TestKit
Assert.Equal(instances * tagsPerInstance, report.ResolvedTags);
}
// ── DataConnectionLayer-004: subscribe-time failure classification ──
[Fact]
public async Task DCL004_GenuineTagResolutionFailure_PushesBadQualityToSubscriber()
{
    // DataConnectionLayer-004 regression guard.
    // Per the design doc (Tag Path Resolution, step 2) a tag that genuinely cannot
    // be resolved at subscribe time must be surfaced with quality `bad`. Before the
    // fix the failure was only logged and parked in _unresolvedTags, so the Instance
    // Actor was never told. The fixed code pushes a bad-quality TagValueUpdate.
    const string connectionName = "dcl004-bad-quality";
    const string unresolvableTag = "missing/tag";
    var messageTimeout = TimeSpan.FromSeconds(5);

    _mockAdapter.Status.Returns(ConnectionHealth.Connected);
    _mockAdapter
        .ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter
        .ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    // A non-connection exception models a genuine "node not found" failure.
    var nodeNotFound = Task.FromException<string>(new KeyNotFoundException("node not found"));
    _mockAdapter
        .SubscribeAsync(unresolvableTag, Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(nodeNotFound);

    var connectionActor = CreateConnectionActor(connectionName);
    await Task.Delay(300);

    var request = new SubscribeTagsRequest(
        "c1", "inst1", connectionName, [unresolvableTag], DateTimeOffset.UtcNow);
    connectionActor.Tell(request);

    // Two messages are expected, in this order: first the bad-quality update for
    // the unresolved tag, then the subscribe acknowledgement.
    var badUpdate = ExpectMsg<TagValueUpdate>(messageTimeout);
    Assert.Equal(unresolvableTag, badUpdate.TagPath);
    Assert.Equal(QualityCode.Bad, badUpdate.Quality);

    var subscribeAck = ExpectMsg<SubscribeTagsResponse>(messageTimeout);
    Assert.True(subscribeAck.Success);
}
[Fact]
public async Task DCL004_ConnectionLevelSubscribeFailure_TriggersReconnect_NotTagRetry()
{
    // Regression test for DataConnectionLayer-004. A subscribe failing because the
    // adapter is not connected (InvalidOperationException from EnsureConnected) is
    // a connection problem, not a bad tag path. The pre-fix code misclassified it
    // as an unresolved tag and retried it on the 10s tag-resolution timer. After
    // the fix it drives the reconnection state machine instead.
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    _mockAdapter.Status.Returns(ConnectionHealth.Connected);

    // Every subscribe attempt fails with the connection-level exception.
    _mockAdapter.SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns(Task.FromException<string>(
            new InvalidOperationException("OPC UA client is not connected.")));
    _mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    var actor = CreateConnectionActor("dcl004-conn-level");
    await Task.Delay(300);

    actor.Tell(new SubscribeTagsRequest(
        "c1", "inst1", "dcl004-conn-level", ["some/tag"], DateTimeOffset.UtcNow));

    // The connection-level failure must drive the actor into Reconnecting, which
    // re-attempts ConnectAsync. Pre-fix the actor stayed Connected and only armed
    // the tag-resolution timer, so ConnectAsync was called exactly once.
    // nameof keeps the method-name filter in sync with the adapter interface: a
    // rename becomes a compile error instead of a condition that can never be met.
    AwaitCondition(() =>
        _mockAdapter.ReceivedCalls()
            .Count(c => c.GetMethodInfo().Name == nameof(_mockAdapter.ConnectAsync)) >= 2,
        TimeSpan.FromSeconds(5));
}
// ── DataConnectionLayer-005: WriteTimeout must bound a hung write ──
[Fact]
public async Task DCL005_Write_ThatHangs_TimesOutAndReturnsFailureSynchronously()
{
    // Regression test for DataConnectionLayer-005. HandleWrite called WriteAsync
    // with no CancellationToken and no timeout, so a hung device write never
    // produced a WriteTagResponse. The calling script would block until its own
    // Ask-timeout with no DCL-level error. After the fix, _options.WriteTimeout
    // bounds the write and a timeout is surfaced as a failed WriteTagResponse.
    _options.WriteTimeout = TimeSpan.FromMilliseconds(300);
    _mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);

    // WriteAsync never completes unless its cancellation token fires.
    _mockAdapter.WriteAsync("tag1", 42, Arg.Any<CancellationToken>())
        .Returns(ci =>
        {
            var ct = ci.Arg<CancellationToken>();

            // RunContinuationsAsynchronously stops the awaiting code from running
            // inline inside the cancellation callback (i.e. on whichever thread
            // cancels the token), which would otherwise couple the code under
            // test to the canceller's call stack.
            var tcs = new TaskCompletionSource<WriteResult>(
                TaskCreationOptions.RunContinuationsAsynchronously);
            ct.Register(() => tcs.TrySetCanceled(ct));
            return tcs.Task;
        });

    var actor = CreateConnectionActor("dcl005-write-timeout");
    await Task.Delay(300); // reach Connected state

    actor.Tell(new WriteTagRequest("corr1", "dcl005-write-timeout", "tag1", 42, DateTimeOffset.UtcNow));

    // The 3s receive window is well above the 300ms WriteTimeout configured above.
    var response = ExpectMsg<WriteTagResponse>(TimeSpan.FromSeconds(3));
    Assert.False(response.Success);
    Assert.Contains("timeout", response.ErrorMessage, StringComparison.OrdinalIgnoreCase);
}
[Fact]
public async Task DCL001_SubscribeWithFailedTags_CountsResolvedAndUnresolvedSeparately()
{
@@ -533,7 +633,11 @@ public class DataConnectionActorTests : TestKit
actor.Tell(new SubscribeTagsRequest(
"c1", "inst1", "dcl001-failed-tags",
["good/a", "good/b", "good/c", "bad/x", "bad/y"], DateTimeOffset.UtcNow));
ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));
// Two genuine resolution failures now also push a bad-quality TagValueUpdate
// to the subscriber (DataConnectionLayer-004); skip past those to the ack.
var ack = FishForMessage<SubscribeTagsResponse>(_ => true, TimeSpan.FromSeconds(5));
Assert.True(ack.Success);
actor.Tell(new DataConnectionActor.GetHealthReport());
var report = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(3));

View File

@@ -3,6 +3,7 @@ using Akka.TestKit.Xunit2;
using NSubstitute;
using ScadaLink.Commons.Interfaces.Protocol;
using ScadaLink.Commons.Messages.DataConnection;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.DataConnectionLayer.Actors;
using ScadaLink.HealthMonitoring;
@@ -57,6 +58,52 @@ public class DataConnectionManagerActorTests : TestKit
Assert.Contains("Unknown connection", response.ErrorMessage);
}
[Fact]
public async Task DCL002_ConnectionActorCrash_PreservesSubscriptionState()
{
    // DataConnectionLayer-002 regression guard.
    // With Directive.Restart the supervisor threw away the connection actor's
    // in-memory subscription registry, which broke the design doc's "transparent
    // re-subscribe" guarantee: subscribers were never re-subscribed and stayed at
    // stale quality forever. The fix switches the supervisor to Resume, so the
    // same actor instance (and its state) survives a transient exception.
    const string connectionName = "conn1";
    const string protocol = "OpcUa";

    var adapter = Substitute.For<IDataConnection>();
    adapter.Status.Returns(ConnectionHealth.Connected);
    adapter
        .ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
        .Returns(Task.CompletedTask);
    adapter
        .SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
        .Returns("sub-001");
    adapter
        .ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
        .Returns(new ReadResult(false, null, null));

    // The write throws synchronously so the exception escapes the message
    // handler and crashes the connection actor, exercising the supervisor strategy.
    adapter
        .WriteAsync(Arg.Any<string>(), Arg.Any<object?>(), Arg.Any<CancellationToken>())
        .Returns<Task<WriteResult>>(_ => throw new InvalidOperationException("boom"));

    _mockFactory.Create(protocol, Arg.Any<IDictionary<string, string>>()).Returns(adapter);

    var managerProps = Props.Create(() =>
        new DataConnectionManagerActor(_mockFactory, _options, _mockHealthCollector));
    var managerActor = Sys.ActorOf(managerProps);

    managerActor.Tell(
        new CreateConnectionCommand(connectionName, protocol, new Dictionary<string, string>(), null, 3));
    await Task.Delay(300); // give the connection actor time to reach Connected

    // Step 1: register one subscription and wait for its acknowledgement.
    managerActor.Tell(
        new SubscribeTagsRequest("c1", "inst1", connectionName, ["tag1"], DateTimeOffset.UtcNow));
    ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(3));

    // Step 2: crash the connection actor through the synchronously-throwing write.
    managerActor.Tell(new WriteTagRequest("c2", connectionName, "tag1", 42, DateTimeOffset.UtcNow));
    await Task.Delay(300); // give the supervisor time to handle the failure

    // Step 3: the subscription must still be visible in the health report.
    // Under Restart both counters would have dropped back to 0.
    managerActor.Tell(new GetAllHealthReports());
    var healthReport = ExpectMsg<DataConnectionHealthReport>(TimeSpan.FromSeconds(3));
    Assert.Equal(1, healthReport.TotalSubscribedTags);
    Assert.Equal(1, healthReport.ResolvedTags);
}
[Fact]
public void CreateConnection_UsesFactory()
{

View File

@@ -6,6 +6,37 @@ using ScadaLink.DataConnectionLayer.Adapters;
namespace ScadaLink.DataConnectionLayer.Tests;
/// <summary>
/// Structural regression guard for DataConnectionLayer-003. The monitored-item and
/// callback maps held by RealOpcUaClient are read on the OPC UA SDK's publish
/// threads while subscribe/disconnect calls mutate them from other threads, so
/// they have to be concurrent collections rather than plain Dictionary instances.
/// The check is done via reflection because RealOpcUaClient wraps concrete
/// OPC Foundation SDK types and cannot be driven without a live OPC UA server.
/// </summary>
public class RealOpcUaClientThreadSafetyTests
{
    /// <summary>
    /// Asserts that the named private instance field of RealOpcUaClient exists and
    /// is declared as a closed ConcurrentDictionary&lt;,&gt; type.
    /// </summary>
    /// <param name="fieldName">Name of the private instance field to inspect.</param>
    [Theory]
    [InlineData("_callbacks")]
    [InlineData("_monitoredItems")]
    public void DCL003_SharedDictionaryFields_AreConcurrentCollections(string fieldName)
    {
        const System.Reflection.BindingFlags privateInstance =
            System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic;

        var field = typeof(RealOpcUaClient).GetField(fieldName, privateInstance);
        Assert.NotNull(field);

        // Compare the open generic definition so any key/value type arguments pass.
        var fieldType = field!.FieldType;
        var isConcurrentDictionary =
            fieldType.IsGenericType &&
            fieldType.GetGenericTypeDefinition() == typeof(System.Collections.Concurrent.ConcurrentDictionary<,>);

        Assert.True(
            isConcurrentDictionary,
            $"RealOpcUaClient.{fieldName} must be a ConcurrentDictionary<,> for thread safety, but was {fieldType.Name}.");
    }
}
/// <summary>
/// WP-7: Tests for OPC UA adapter.
/// </summary>