fix(opcuaclient): re-resolve session inside _gate in history/read paths (stale-session race)
This commit is contained in:
@@ -82,6 +82,15 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
/// <summary>Per-connection gate. PRs 67+ serialize read/write/browse on this.</summary>
|
||||
internal SemaphoreSlim Gate => _gate;
|
||||
|
||||
/// <summary>
|
||||
/// Test-only seam to swap the active <see cref="Session"/> the way
|
||||
/// <see cref="OnReconnectComplete"/> does at runtime, so the stale-session race the
|
||||
/// read/history paths guard against (re-read Session inside <c>_gate</c>, not before)
|
||||
/// can be exercised deterministically without standing up a live reconnect handler.
|
||||
/// Production code never calls this — the SDK reconnect handler owns the real swap.
|
||||
/// </summary>
|
||||
internal void SetSessionForTest(ISession? session) => Session = session;
|
||||
|
||||
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
||||
private bool _disposed;
|
||||
/// <summary>URL of the endpoint the driver actually connected to. Exposed via <see cref="HostName"/>.</summary>
|
||||
@@ -1131,7 +1140,10 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
public async Task<ISubscriptionHandle> SubscribeAsync(
|
||||
IReadOnlyList<string> fullReferences, TimeSpan publishingInterval, CancellationToken cancellationToken)
|
||||
{
|
||||
var session = RequireSession();
|
||||
// Fast-fail before queuing if the driver isn't connected, but do NOT bind the
|
||||
// subscription to this reference — a reconnect can swap Session while we wait on _gate.
|
||||
// The session actually used is re-read inside the gate (Driver.OpcUaClient-001/-006).
|
||||
_ = RequireSession();
|
||||
var id = Interlocked.Increment(ref _nextSubscriptionId);
|
||||
var handle = new OpcUaSubscriptionHandle(id);
|
||||
|
||||
@@ -1157,6 +1169,11 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
// Re-read Session inside the critical section: a reconnect completing while we were
|
||||
// blocked on _gate has already swapped in the new session via OnReconnectComplete.
|
||||
var session = Session
|
||||
?? throw new InvalidOperationException(
|
||||
"OpcUaClient session was lost before the subscription could be created.");
|
||||
session.AddSubscription(subscription);
|
||||
await subscription.CreateAsync(cancellationToken).ConfigureAwait(false);
|
||||
|
||||
@@ -1296,7 +1313,10 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
public async Task<IAlarmSubscriptionHandle> SubscribeAlarmsAsync(
|
||||
IReadOnlyList<string> sourceNodeIds, CancellationToken cancellationToken)
|
||||
{
|
||||
var session = RequireSession();
|
||||
// Fast-fail before queuing if the driver isn't connected, but do NOT bind the alarm
|
||||
// subscription to this reference — a reconnect can swap Session while we wait on _gate.
|
||||
// The session actually used is re-read inside the gate (Driver.OpcUaClient-001/-006).
|
||||
_ = RequireSession();
|
||||
var id = Interlocked.Increment(ref _nextAlarmSubscriptionId);
|
||||
var handle = new OpcUaAlarmSubscriptionHandle(id);
|
||||
|
||||
@@ -1345,6 +1365,11 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
// Re-read Session inside the critical section: a reconnect completing while we were
|
||||
// blocked on _gate has already swapped in the new session via OnReconnectComplete.
|
||||
var session = Session
|
||||
?? throw new InvalidOperationException(
|
||||
"OpcUaClient session was lost before the alarm subscription could be created.");
|
||||
session.AddSubscription(subscription);
|
||||
await subscription.CreateAsync(cancellationToken).ConfigureAwait(false);
|
||||
|
||||
@@ -1410,31 +1435,42 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
// list without guarding the size themselves — e.g. a bulk-ack UI that built an empty
|
||||
// list because the filter matched nothing.
|
||||
if (acknowledgements.Count == 0) return;
|
||||
var session = RequireSession();
|
||||
|
||||
// OPC UA A&C: call the AcknowledgeableConditionType.Acknowledge method on each
|
||||
// condition node with EventId + Comment arguments. CallAsync accepts a batch —
|
||||
// one CallMethodRequest per ack.
|
||||
var callRequests = new CallMethodRequestCollection();
|
||||
foreach (var ack in acknowledgements)
|
||||
{
|
||||
if (!TryParseNodeId(session, ack.ConditionId, out var conditionId)) continue;
|
||||
callRequests.Add(new CallMethodRequest
|
||||
{
|
||||
ObjectId = conditionId,
|
||||
MethodId = MethodIds.AcknowledgeableConditionType_Acknowledge,
|
||||
InputArguments = [
|
||||
new Variant(Array.Empty<byte>()), // EventId — server-side best-effort; empty resolves to 'most recent'
|
||||
new Variant(new LocalizedText(ack.Comment ?? string.Empty)),
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
if (callRequests.Count == 0) return;
|
||||
// Fast-fail before queuing if the driver isn't connected, but do NOT bind the ack calls
|
||||
// (or the namespace-relative ConditionId parse) to this reference — a reconnect can swap
|
||||
// Session while we wait on _gate. The session actually used is re-read inside the gate
|
||||
// (Driver.OpcUaClient-001/-006).
|
||||
_ = RequireSession();
|
||||
|
||||
await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
// Re-read Session inside the critical section: a reconnect completing while we were
|
||||
// blocked on _gate has already swapped in the new session via OnReconnectComplete.
|
||||
// ConditionId parsing is namespace-relative, so it must use the current session's
|
||||
// namespace table too.
|
||||
var session = Session;
|
||||
if (session is null) return;
|
||||
|
||||
// OPC UA A&C: call the AcknowledgeableConditionType.Acknowledge method on each
|
||||
// condition node with EventId + Comment arguments. CallAsync accepts a batch —
|
||||
// one CallMethodRequest per ack.
|
||||
var callRequests = new CallMethodRequestCollection();
|
||||
foreach (var ack in acknowledgements)
|
||||
{
|
||||
if (!TryParseNodeId(session, ack.ConditionId, out var conditionId)) continue;
|
||||
callRequests.Add(new CallMethodRequest
|
||||
{
|
||||
ObjectId = conditionId,
|
||||
MethodId = MethodIds.AcknowledgeableConditionType_Acknowledge,
|
||||
InputArguments = [
|
||||
new Variant(Array.Empty<byte>()), // EventId — server-side best-effort; empty resolves to 'most recent'
|
||||
new Variant(new LocalizedText(ack.Comment ?? string.Empty)),
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
if (callRequests.Count == 0) return;
|
||||
|
||||
try
|
||||
{
|
||||
var resp = await session.CallAsync(
|
||||
@@ -1615,20 +1651,30 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
private async Task<Core.Abstractions.HistoryReadResult> ExecuteHistoryReadAsync(
|
||||
string fullReference, ExtensionObject historyReadDetails, CancellationToken ct)
|
||||
{
|
||||
var session = RequireSession();
|
||||
if (!TryParseNodeId(session, fullReference, out var nodeId))
|
||||
{
|
||||
return new Core.Abstractions.HistoryReadResult([], null);
|
||||
}
|
||||
|
||||
var nodesToRead = new HistoryReadValueIdCollection
|
||||
{
|
||||
new HistoryReadValueId { NodeId = nodeId },
|
||||
};
|
||||
// Make sure a session exists before queuing on the gate, but do NOT bind the wire call
|
||||
// (or the namespace-relative NodeId parse) to this reference — a reconnect can swap
|
||||
// Session while we wait on _gate. The session actually used is re-read inside the gate
|
||||
// (Driver.OpcUaClient-001/-006).
|
||||
_ = RequireSession();
|
||||
|
||||
await _gate.WaitAsync(ct).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
// Re-read Session inside the critical section: if a reconnect completed while we
|
||||
// were blocked on _gate, OnReconnectComplete has already swapped in the new session.
|
||||
// NodeId parsing is namespace-relative, so it must also use the current session's
|
||||
// namespace table.
|
||||
var session = Session;
|
||||
if (session is null || !TryParseNodeId(session, fullReference, out var nodeId))
|
||||
{
|
||||
return new Core.Abstractions.HistoryReadResult([], null);
|
||||
}
|
||||
|
||||
var nodesToRead = new HistoryReadValueIdCollection
|
||||
{
|
||||
new HistoryReadValueId { NodeId = nodeId },
|
||||
};
|
||||
|
||||
var resp = await session.HistoryReadAsync(
|
||||
requestHeader: null,
|
||||
historyReadDetails: historyReadDetails,
|
||||
@@ -1785,38 +1831,47 @@ public sealed class OpcUaClientDriver : IDriver, ITagDiscovery, IReadable, IWrit
|
||||
string? sourceName, DateTime startUtc, DateTime endUtc, int maxEvents,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var session = RequireSession();
|
||||
|
||||
NodeId notifierNodeId;
|
||||
if (string.IsNullOrEmpty(sourceName))
|
||||
{
|
||||
notifierNodeId = ObjectIds.Server;
|
||||
}
|
||||
else if (!TryParseNodeId(session, sourceName, out var parsed))
|
||||
{
|
||||
return new Core.Abstractions.HistoricalEventsResult([], null);
|
||||
}
|
||||
else
|
||||
{
|
||||
notifierNodeId = parsed;
|
||||
}
|
||||
|
||||
var details = new ReadEventDetails
|
||||
{
|
||||
StartTime = startUtc,
|
||||
EndTime = endUtc,
|
||||
NumValuesPerNode = maxEvents <= 0 ? 0u : (uint)maxEvents,
|
||||
Filter = BuildBaseEventFilter(),
|
||||
};
|
||||
|
||||
var nodesToRead = new HistoryReadValueIdCollection
|
||||
{
|
||||
new HistoryReadValueId { NodeId = notifierNodeId },
|
||||
};
|
||||
// Confirm a session exists before queuing; the session actually used — and the
|
||||
// namespace-relative source-NodeId parse — are re-read inside the gate so a reconnect
|
||||
// mid-wait can't leave us reading from a closed session (Driver.OpcUaClient-001/-006).
|
||||
_ = RequireSession();
|
||||
|
||||
await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
// Re-read Session inside the critical section: OnReconnectComplete may have swapped
|
||||
// in a new session while we were blocked on _gate, and the source-NodeId parse is
|
||||
// namespace-relative so it must bind against the current session's namespace table.
|
||||
var session = Session;
|
||||
if (session is null) return new Core.Abstractions.HistoricalEventsResult([], null);
|
||||
|
||||
NodeId notifierNodeId;
|
||||
if (string.IsNullOrEmpty(sourceName))
|
||||
{
|
||||
notifierNodeId = ObjectIds.Server;
|
||||
}
|
||||
else if (!TryParseNodeId(session, sourceName, out var parsed))
|
||||
{
|
||||
return new Core.Abstractions.HistoricalEventsResult([], null);
|
||||
}
|
||||
else
|
||||
{
|
||||
notifierNodeId = parsed;
|
||||
}
|
||||
|
||||
var details = new ReadEventDetails
|
||||
{
|
||||
StartTime = startUtc,
|
||||
EndTime = endUtc,
|
||||
NumValuesPerNode = maxEvents <= 0 ? 0u : (uint)maxEvents,
|
||||
Filter = BuildBaseEventFilter(),
|
||||
};
|
||||
|
||||
var nodesToRead = new HistoryReadValueIdCollection
|
||||
{
|
||||
new HistoryReadValueId { NodeId = notifierNodeId },
|
||||
};
|
||||
|
||||
var resp = await session.HistoryReadAsync(
|
||||
requestHeader: null,
|
||||
historyReadDetails: new ExtensionObject(details),
|
||||
|
||||
+161
@@ -0,0 +1,161 @@
|
||||
using Moq;
|
||||
using Opc.Ua;
|
||||
using Opc.Ua.Client;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.OpcUaClient.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Regression coverage for the stale-session concurrency race in the history/read wire
|
||||
/// paths. A background reconnect handler can swap <see cref="OpcUaClientDriver.Session"/>
|
||||
/// while a caller is parked on <c>_gate.WaitAsync</c>. If a method captures the session
|
||||
/// reference <i>before</i> the gate and uses it afterwards, it issues the wire call against
|
||||
/// the closed/stale session. The correct idiom — already used by ReadAsync/WriteAsync — is
|
||||
/// a fast-fail guard outside the gate plus an authoritative re-read of <c>Session</c>
|
||||
/// <i>inside</i> the gate. These tests prove the history/read funnels follow it.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The swap is staged so the race is exercised, not raced against:
|
||||
/// <list type="number">
|
||||
/// <item>The test acquires <c>_gate</c> first, so the method-under-test physically
|
||||
/// cannot enter the critical section (read Session-in-gate / issue the wire call) until
|
||||
/// the test releases the gate. Whatever the implementation does <i>after</i> the gate
|
||||
/// therefore always sees the post-swap session.</item>
|
||||
/// <item>The only thing that must be ordered is the <b>buggy</b> capture-before-gate:
|
||||
/// its pre-gate NodeId parse reads the OLD session's <c>MessageContext</c>, which
|
||||
/// completes a <see cref="TaskCompletionSource"/>. We wait on that signal so the swap
|
||||
/// lands <i>after</i> the buggy capture has grabbed the OLD session — making the RED
|
||||
/// outcome deterministic, not timing-dependent.</item>
|
||||
/// <item>The fixed implementation reads no session member before the gate, so the
|
||||
/// signal never fires; a bounded fallback then swaps+releases. That is correct for the
|
||||
/// fixed path because its session read happens only after we release the gate, so the
|
||||
/// swap is already visible regardless of ordering.</item>
|
||||
/// </list>
|
||||
/// A correct implementation re-reads <c>Session</c> inside the gate and lands the wire call
|
||||
/// on the NEW session; the buggy capture-before-gate code lands it on the OLD one.
|
||||
/// </remarks>
|
||||
[Trait("Category", "Unit")]
|
||||
public sealed class OpcUaClientStaleSessionRaceTests
|
||||
{
|
||||
private static Mock<ISession> NewSessionMock(TaskCompletionSource? prologueReached = null)
|
||||
{
|
||||
var resp = new HistoryReadResponse
|
||||
{
|
||||
ResponseHeader = new ResponseHeader(),
|
||||
Results = new HistoryReadResultCollection(),
|
||||
DiagnosticInfos = new DiagnosticInfoCollection(),
|
||||
};
|
||||
|
||||
var mock = new Mock<ISession>(MockBehavior.Loose);
|
||||
// NodeId parsing inside the driver resolves the reference against the session's
|
||||
// MessageContext. This getter is read in the driver's synchronous prologue (before the
|
||||
// gate), so signalling here proves the pre-gate capture path has run.
|
||||
mock.SetupGet(s => s.MessageContext)
|
||||
.Returns(new ServiceMessageContext())
|
||||
.Callback(() => prologueReached?.TrySetResult());
|
||||
mock.SetupGet(s => s.NamespaceUris).Returns(new NamespaceTable());
|
||||
mock.Setup(s => s.HistoryReadAsync(
|
||||
It.IsAny<RequestHeader>(),
|
||||
It.IsAny<ExtensionObject>(),
|
||||
It.IsAny<TimestampsToReturn>(),
|
||||
It.IsAny<bool>(),
|
||||
It.IsAny<HistoryReadValueIdCollection>(),
|
||||
It.IsAny<CancellationToken>()))
|
||||
.ReturnsAsync(resp);
|
||||
return mock;
|
||||
}
|
||||
|
||||
private static void VerifyWireHitNewSessionNotOld(Mock<ISession> newSession, Mock<ISession> oldSession)
|
||||
{
|
||||
newSession.Verify(s => s.HistoryReadAsync(
|
||||
It.IsAny<RequestHeader>(), It.IsAny<ExtensionObject>(), It.IsAny<TimestampsToReturn>(),
|
||||
It.IsAny<bool>(), It.IsAny<HistoryReadValueIdCollection>(), It.IsAny<CancellationToken>()),
|
||||
Times.Once);
|
||||
oldSession.Verify(s => s.HistoryReadAsync(
|
||||
It.IsAny<RequestHeader>(), It.IsAny<ExtensionObject>(), It.IsAny<TimestampsToReturn>(),
|
||||
It.IsAny<bool>(), It.IsAny<HistoryReadValueIdCollection>(), It.IsAny<CancellationToken>()),
|
||||
Times.Never);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// ExecuteHistoryReadAsync (the ReadRaw/ReadProcessed/ReadAtTime funnel) must issue the
|
||||
/// HistoryRead against the session current <i>after</i> the gate, not a reference
|
||||
/// captured before WaitAsync.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task ReadRawAsync_uses_session_swapped_in_across_the_gate_not_the_captured_one()
|
||||
{
|
||||
var ct = TestContext.Current.CancellationToken;
|
||||
using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-stale-raw");
|
||||
|
||||
var prologueReached = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
|
||||
var oldSession = NewSessionMock(prologueReached);
|
||||
var newSession = NewSessionMock();
|
||||
drv.SetSessionForTest(oldSession.Object);
|
||||
|
||||
// Hold the gate so the method parks on WaitAsync after running its synchronous prologue.
|
||||
await drv.Gate.WaitAsync(ct);
|
||||
|
||||
var call = drv.ReadRawAsync(
|
||||
"ns=2;s=Counter", DateTime.UtcNow.AddMinutes(-5), DateTime.UtcNow, 1000, ct);
|
||||
|
||||
// Wait until the buggy capture-before-gate has grabbed the OLD session (signalled via
|
||||
// its MessageContext read), then stage the reconnect swap and release the gate. The
|
||||
// fixed path reads no pre-gate session member, so the signal never fires and the
|
||||
// bounded fallback elapses — harmless, because the fixed read happens only after the
|
||||
// gate is released and therefore always sees the swapped session.
|
||||
await WaitForPreGateCaptureOrFallbackAsync(prologueReached.Task, ct);
|
||||
drv.SetSessionForTest(newSession.Object);
|
||||
drv.Gate.Release();
|
||||
|
||||
await call;
|
||||
|
||||
VerifyWireHitNewSessionNotOld(newSession, oldSession);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// ReadEventsAsync is a separate HistoryRead site with the same hazard — it must also
|
||||
/// re-read Session inside the gate rather than use the pre-gate capture.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task ReadEventsAsync_uses_session_swapped_in_across_the_gate_not_the_captured_one()
|
||||
{
|
||||
var ct = TestContext.Current.CancellationToken;
|
||||
using var drv = new OpcUaClientDriver(new OpcUaClientDriverOptions(), "opcua-stale-events");
|
||||
|
||||
var prologueReached = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
|
||||
// ReadEventsAsync only parses the source NodeId when sourceName is non-empty, so pass a
|
||||
// concrete source id to force the prologue's MessageContext read (and its signal).
|
||||
var oldSession = NewSessionMock(prologueReached);
|
||||
var newSession = NewSessionMock();
|
||||
drv.SetSessionForTest(oldSession.Object);
|
||||
|
||||
await drv.Gate.WaitAsync(ct);
|
||||
|
||||
var call = drv.ReadEventsAsync(
|
||||
sourceName: "ns=2;s=Pump17", startUtc: DateTime.UtcNow.AddMinutes(-5),
|
||||
endUtc: DateTime.UtcNow, maxEvents: 100, cancellationToken: ct);
|
||||
|
||||
await WaitForPreGateCaptureOrFallbackAsync(prologueReached.Task, ct);
|
||||
drv.SetSessionForTest(newSession.Object);
|
||||
drv.Gate.Release();
|
||||
|
||||
await call;
|
||||
|
||||
VerifyWireHitNewSessionNotOld(newSession, oldSession);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Awaits the buggy path's pre-gate-capture signal, or a bounded fallback when it never
|
||||
/// fires (the fixed path performs no pre-gate session member read). Generous so the
|
||||
/// buggy-path signal effectively always wins first, keeping the RED outcome
|
||||
/// deterministic; the fallback only matters for the GREEN (fixed) path where swap
|
||||
/// ordering is irrelevant.
|
||||
/// </summary>
|
||||
private static async Task WaitForPreGateCaptureOrFallbackAsync(Task signal, CancellationToken ct)
|
||||
{
|
||||
var fallback = Task.Delay(TimeSpan.FromSeconds(2), ct);
|
||||
await Task.WhenAny(signal, fallback);
|
||||
}
|
||||
}
|
||||
+1
@@ -12,6 +12,7 @@
|
||||
<ItemGroup>
|
||||
<PackageReference Include="xunit.v3"/>
|
||||
<PackageReference Include="Shouldly"/>
|
||||
<PackageReference Include="Moq"/>
|
||||
<PackageReference Include="Microsoft.NET.Test.Sdk"/>
|
||||
<PackageReference Include="xunit.runner.visualstudio">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
|
||||
Reference in New Issue
Block a user