fix(lmxproxy): resolve subscribe/unsubscribe race condition on client reconnect

Three fixes for the SubscriptionManager/MxAccessClient subscription pipeline:

1. Serialize Subscribe and UnsubscribeClient with a SemaphoreSlim gate to prevent a
   race in which an old session's unsubscribe removes the new session's COM
   subscriptions. CreateMxAccessSubscriptionsAsync is now awaited instead of
   being fire-and-forget.

2. Fix dual VTQ delivery in MxAccessClient.OnDataChange — each update was delivered
   twice (once via stored callback, once via OnTagValueChanged property). Now uses
   stored callback as the single delivery path.

3. Store pending tag addresses when CreateMxAccessSubscriptionsAsync fails (MxAccess
   down) and retry them on reconnect via NotifyReconnection/RetryPendingSubscriptionsAsync.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-03-22 16:33:19 -04:00
parent bc4fc97652
commit fa33e1acf1
5 changed files with 309 additions and 110 deletions

View File

@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Channels;
using System.Threading.Tasks;
@@ -32,11 +33,34 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
Task.FromResult((false, 0));
/// <summary>
/// Fake probe: ignores its arguments and always reports a healthy connection
/// with good quality, timestamped with the current UTC time.
/// </summary>
public Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs, CancellationToken ct = default)
{
    var healthy = ProbeResult.Healthy(Quality.Good, DateTime.UtcNow);
    return Task.FromResult(healthy);
}
public Task UnsubscribeByAddressAsync(IEnumerable<string> addresses) => Task.CompletedTask;
public Task<IAsyncDisposable> SubscribeAsync(IEnumerable<string> addresses, Action<string, Vtq> callback, CancellationToken ct = default) =>
Task.FromResult<IAsyncDisposable>(new FakeSubscriptionHandle());
/// <summary>Nothing to release in the fake; returns a default (already completed) <see cref="ValueTask"/>.</summary>
public ValueTask DisposeAsync()
{
    return default(ValueTask);
}
// Track subscribe/unsubscribe calls for assertions.
// One entry per SubscribeAsync call, holding a copy of the addresses passed in.
// The entry is recorded before the FailSubscriptions check, so failed calls are counted too.
public List<List<string>> SubscribeCalls { get; } = new List<List<string>>();
// One entry per UnsubscribeByAddressAsync call, holding a copy of the addresses passed in.
public List<List<string>> UnsubscribeCalls { get; } = new List<List<string>>();
// The callback handed to each SubscribeAsync call, in call order (same indexing as SubscribeCalls).
public List<Action<string, Vtq>> StoredCallbacks { get; } = new List<Action<string, Vtq>>();
// When true, SubscribeAsync throws InvalidOperationException to simulate MxAccess being down.
public bool FailSubscriptions { get; set; }
/// <summary>
/// Fake unsubscribe: stores a copy of the requested addresses in
/// <see cref="UnsubscribeCalls"/> so tests can assert on them, then completes immediately.
/// </summary>
/// <param name="addresses">Tag addresses the caller wants to unsubscribe.</param>
/// <returns>An already-completed task.</returns>
public Task UnsubscribeByAddressAsync(IEnumerable<string> addresses)
{
    List<string> snapshot = addresses.ToList();
    UnsubscribeCalls.Add(snapshot);
    return Task.CompletedTask;
}
/// <summary>
/// Fake subscribe: records the requested addresses and the callback, then either
/// returns a <see cref="FakeSubscriptionHandle"/> or throws when
/// <see cref="FailSubscriptions"/> is set.
/// </summary>
/// <param name="addresses">Tag addresses the caller wants to subscribe to.</param>
/// <param name="callback">Value-change callback; stored in <see cref="StoredCallbacks"/>.</param>
/// <param name="ct">Unused by the fake.</param>
/// <returns>A completed task wrapping a new fake subscription handle.</returns>
/// <exception cref="InvalidOperationException">Thrown synchronously when <see cref="FailSubscriptions"/> is true.</exception>
public Task<IAsyncDisposable> SubscribeAsync(IEnumerable<string> addresses, Action<string, Vtq> callback, CancellationToken ct = default)
{
    // Record the attempt first so that failed calls are still visible to assertions.
    List<string> requested = addresses.ToList();
    SubscribeCalls.Add(requested);
    StoredCallbacks.Add(callback);

    if (!FailSubscriptions)
    {
        IAsyncDisposable handle = new FakeSubscriptionHandle();
        return Task.FromResult(handle);
    }

    throw new InvalidOperationException("Not connected to MxAccess");
}
// Exists so the ConnectionStateChanged event has a use site (suppresses the unused-event warning).
// Raises the event with this instance as sender and null event args.
internal void FireEvent()
{
    ConnectionStateChanged?.Invoke(this, null!);
}
@@ -47,11 +71,11 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void Subscribe_ReturnsChannelReader()
public async Task Subscribe_ReturnsChannelReader()
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
var reader = sm.Subscribe("client1", new[] { "Tag1", "Tag2" }, cts.Token);
var reader = await sm.SubscribeAsync("client1", new[] { "Tag1", "Tag2" }, cts.Token);
reader.Should().NotBeNull();
}
@@ -60,7 +84,7 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
var reader = sm.Subscribe("client1", new[] { "Motor.Speed" }, cts.Token);
var reader = await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
var vtq = Vtq.Good(42.0);
sm.OnTagValueChanged("Motor.Speed", vtq);
@@ -76,8 +100,8 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
var reader1 = sm.Subscribe("client1", new[] { "Motor.Speed" }, cts.Token);
var reader2 = sm.Subscribe("client2", new[] { "Motor.Speed" }, cts.Token);
var reader1 = await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
var reader2 = await sm.SubscribeAsync("client2", new[] { "Motor.Speed" }, cts.Token);
sm.OnTagValueChanged("Motor.Speed", Vtq.Good(99.0));
@@ -88,11 +112,11 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void OnTagValueChanged_NonSubscribedTag_NoDelivery()
public async Task OnTagValueChanged_NonSubscribedTag_NoDelivery()
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
var reader = sm.Subscribe("client1", new[] { "Motor.Speed" }, cts.Token);
var reader = await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
sm.OnTagValueChanged("Motor.Torque", Vtq.Good(10.0));
@@ -101,11 +125,11 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void UnsubscribeClient_CompletesChannel()
public async Task UnsubscribeClient_CompletesChannel()
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
var reader = sm.Subscribe("client1", new[] { "Motor.Speed" }, cts.Token);
var reader = await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
sm.UnsubscribeClient("client1");
@@ -114,11 +138,11 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void UnsubscribeClient_RemovesFromTagSubscriptions()
public async Task UnsubscribeClient_RemovesFromTagSubscriptions()
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
sm.Subscribe("client1", new[] { "Motor.Speed" }, cts.Token);
await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
sm.UnsubscribeClient("client1");
@@ -128,12 +152,12 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void RefCounting_LastClientUnsubscribeRemovesTag()
public async Task RefCounting_LastClientUnsubscribeRemovesTag()
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
sm.Subscribe("client1", new[] { "Motor.Speed" }, cts.Token);
sm.Subscribe("client2", new[] { "Motor.Speed" }, cts.Token);
await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
await sm.SubscribeAsync("client2", new[] { "Motor.Speed" }, cts.Token);
sm.GetStats().TotalTags.Should().Be(1);
@@ -145,11 +169,11 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void NotifyDisconnection_SendsBadQualityToAll()
public async Task NotifyDisconnection_SendsBadQualityToAll()
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
var reader = sm.Subscribe("client1", new[] { "Motor.Speed", "Motor.Torque" }, cts.Token);
var reader = await sm.SubscribeAsync("client1", new[] { "Motor.Speed", "Motor.Torque" }, cts.Token);
sm.NotifyDisconnection();
@@ -161,11 +185,11 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void Backpressure_DropOldest_DropsWhenFull()
public async Task Backpressure_DropOldest_DropsWhenFull()
{
using var sm = new SubscriptionManager(new FakeScadaClient(), channelCapacity: 3);
using var cts = new CancellationTokenSource();
var reader = sm.Subscribe("client1", new[] { "Motor.Speed" }, cts.Token);
var reader = await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
// Fill the channel beyond capacity
for (int i = 0; i < 10; i++)
@@ -180,17 +204,123 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
}
[Fact]
public void GetStats_ReturnsCorrectCounts()
public async Task GetStats_ReturnsCorrectCounts()
{
using var sm = new SubscriptionManager(new FakeScadaClient());
using var cts = new CancellationTokenSource();
sm.Subscribe("c1", new[] { "Tag1", "Tag2" }, cts.Token);
sm.Subscribe("c2", new[] { "Tag2", "Tag3" }, cts.Token);
await sm.SubscribeAsync("c1", new[] { "Tag1", "Tag2" }, cts.Token);
await sm.SubscribeAsync("c2", new[] { "Tag2", "Tag3" }, cts.Token);
var stats = sm.GetStats();
stats.TotalClients.Should().Be(2);
stats.TotalTags.Should().Be(3); // Tag1, Tag2, Tag3
stats.ActiveSubscriptions.Should().Be(4); // c1:Tag1, c1:Tag2, c2:Tag2, c2:Tag3
}
// ── New tests for race condition fix ──────────────────────────
[Fact]
public async Task SubscribeAfterUnsubscribe_CreatesMxAccessSubscriptions()
{
    // FIX 1 regression test: a client that drops and then reconnects with the same
    // tag must trigger a second subscribe call on the SCADA client; the manager
    // must not skip it on the assumption that the earlier handle is still live.
    const string tag = "Motor.Speed";
    var scada = new FakeScadaClient();
    using var manager = new SubscriptionManager(scada);
    using var cancellation = new CancellationTokenSource();

    // Initial subscription reaches the SCADA client.
    await manager.SubscribeAsync("client1", new[] { tag }, cancellation.Token);
    scada.SubscribeCalls.Should().HaveCount(1);
    scada.SubscribeCalls[0].Should().Contain(tag);

    // Dropping the only subscriber is expected to unsubscribe the tag downstream.
    manager.UnsubscribeClient("client1");
    scada.UnsubscribeCalls.Should().HaveCount(1);
    scada.UnsubscribeCalls[0].Should().Contain(tag);

    // Reconnecting with the same client id and tag must subscribe downstream again.
    await manager.SubscribeAsync("client1", new[] { tag }, cancellation.Token);
    scada.SubscribeCalls.Should().HaveCount(2, "new subscribe must create fresh MxAccess subscription");
    scada.SubscribeCalls[1].Should().Contain(tag);
}
[Fact]
public async Task SubscribeAfterUnsubscribe_SerializedByGate()
{
    // Verifies FIX 1: a subscribe -> unsubscribe -> subscribe sequence over the same
    // tags leaves the re-subscribed session with working subscriptions.
    // NOTE(review): the calls run sequentially here, so this checks the ordered
    // outcome rather than exercising true concurrent interleaving.
    var fake = new FakeScadaClient();
    using var sm = new SubscriptionManager(fake);
    using var cts = new CancellationTokenSource();
    var tags = new[] { "Tag.A", "Tag.B", "Tag.C" };

    // Subscribe, unsubscribe, re-subscribe in sequence. Keep the reader of the
    // re-subscribed session: it is the one whose data flow this test is about.
    await sm.SubscribeAsync("session1", tags, cts.Token);
    sm.UnsubscribeClient("session1");
    var reader = await sm.SubscribeAsync("session2", tags, cts.Token);

    // Both subscribes should have called SubscribeAsync on the scada client
    fake.SubscribeCalls.Should().HaveCount(2);
    // The unsubscribe in between should have cleaned up
    fake.UnsubscribeCalls.Should().HaveCount(1);

    // Data should flow to the new session. TryRead is used instead of ReadAsync so a
    // missing delivery fails the assertion immediately rather than blocking on a
    // token that is never cancelled.
    sm.OnTagValueChanged("Tag.A", Vtq.Good(1.0));
    reader.TryRead(out var result).Should().BeTrue("data should flow to the re-subscribed session");
    result.vtq.Value.Should().Be(1.0);
}
[Fact]
public async Task OnTagValueChanged_NoDuplicateDelivery()
{
    // FIX 2 regression test: one OnTagValueChanged call must put exactly one
    // message on a subscribed client's channel, never two.
    var scada = new FakeScadaClient();
    using var manager = new SubscriptionManager(scada);
    using var cancellation = new CancellationTokenSource();
    var updates = await manager.SubscribeAsync("client1", new[] { "Motor.Speed" }, cancellation.Token);

    // Push a single value change through the manager.
    manager.OnTagValueChanged("Motor.Speed", Vtq.Good(42.0));

    // The first read yields the value that was pushed...
    var gotFirst = updates.TryRead(out var first);
    gotFirst.Should().BeTrue();
    first.vtq.Value.Should().Be(42.0);

    // ...and the channel must then be empty.
    var gotSecond = updates.TryRead(out _);
    gotSecond.Should().BeFalse("each update should be delivered exactly once");
}
[Fact]
public async Task FailedSubscription_StoredAsPending_RetriedOnReconnect()
{
    // Verifies FIX 3: when MxAccess is down during subscribe, tags are stored
    // as pending and retried when NotifyReconnection is called.
    var fake = new FakeScadaClient();
    fake.FailSubscriptions = true;
    using var sm = new SubscriptionManager(fake);
    using var cts = new CancellationTokenSource();

    // Subscribe while MxAccess is "down" — should not throw (errors are logged)
    var reader = await sm.SubscribeAsync("client1", new[] { "Motor.Speed" }, cts.Token);
    reader.Should().NotBeNull();
    fake.SubscribeCalls.Should().HaveCount(1);

    // MxAccess comes back up
    fake.FailSubscriptions = false;
    sm.NotifyReconnection();

    // The retry triggered by NotifyReconnection is not awaited here, so poll for its
    // effect with an upper bound (100 x 20 ms = 2 s) instead of sleeping a fixed
    // interval: the test finishes as soon as the retry lands and does not become
    // flaky when the machine is slow.
    for (var attempt = 0; attempt < 100 && fake.SubscribeCalls.Count < 2; attempt++)
    {
        await Task.Delay(20);
    }

    // Should have retried the subscription
    fake.SubscribeCalls.Should().HaveCount(2, "pending subscriptions should be retried on reconnect");
    fake.SubscribeCalls[1].Should().Contain("Motor.Speed");
}
}
}