fix(communication): resolve Communication-002/003 — gRPC reconnect stream cleanup and subscription map safety
This commit is contained in:
@@ -183,6 +183,15 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
return;
|
||||
}
|
||||
|
||||
// Unsubscribe the failed stream on the *previous* endpoint before reconnecting.
|
||||
// This cancels the local subscription CTS and -- where the channel is still
|
||||
// alive -- propagates gRPC cancellation to the site so its SiteStreamGrpcServer
|
||||
// stops the StreamRelayActor for this correlation ID, rather than leaving a
|
||||
// zombie relay actor until TCP RST / keepalive eventually detects the loss.
|
||||
var previousEndpoint = _useNodeA ? _grpcNodeAAddress : _grpcNodeBAddress;
|
||||
var previousClient = _grpcFactory.GetOrCreate(_siteIdentifier, previousEndpoint);
|
||||
previousClient.Unsubscribe(_correlationId);
|
||||
|
||||
// Flip to the other node
|
||||
_useNodeA = !_useNodeA;
|
||||
|
||||
|
||||
@@ -57,6 +57,32 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
||||
_subscriptions[correlationId] = cts;
|
||||
}
|
||||
|
||||
/// <summary>
/// Registers a subscription's CancellationTokenSource for a correlation ID.
/// If a different CTS is already stored for that correlation ID (a reconnect race where
/// two <see cref="SubscribeAsync"/> calls briefly share an ID), the stored CTS is
/// atomically swapped out and then cancelled and disposed so it cannot leak.
/// Registering the CTS that is already stored is a no-op. Internal for testability.
/// </summary>
/// <param name="correlationId">Key under which the subscription is tracked.</param>
/// <param name="cts">Token source that should own the slot once this call returns.</param>
internal void RegisterSubscription(string correlationId, CancellationTokenSource cts)
{
    // A plain "TryGetValue then indexer set" is a check-then-act race: two concurrent
    // registrations could both observe the same prior CTS (the second Cancel() then hits
    // an already-disposed source), or one could silently overwrite the other's fresh CTS
    // without cancelling it. Compare-and-swap guarantees that every displaced CTS is
    // cleaned up by exactly one caller.
    while (true)
    {
        if (!_subscriptions.TryGetValue(correlationId, out var prior))
        {
            // Slot looked empty; claim it unless another thread got there first.
            if (_subscriptions.TryAdd(correlationId, cts))
            {
                return;
            }

            continue;
        }

        if (ReferenceEquals(prior, cts))
        {
            // Already registered; never cancel our own CTS.
            return;
        }

        // Replace only if the slot still holds the CTS we just observed.
        if (_subscriptions.TryUpdate(correlationId, cts, prior))
        {
            prior.Cancel();
            prior.Dispose();
            return;
        }

        // The slot changed between the read and the swap; re-read and retry.
    }
}
|
||||
|
||||
/// <summary>
/// Drops the tracked subscription for a correlation ID, but only when the CTS currently
/// stored under that ID is the very one passed in. If a racing replacement stream has
/// already taken over the slot, nothing is removed. Internal for testability.
/// </summary>
/// <param name="correlationId">Key of the subscription entry to drop.</param>
/// <param name="cts">Token source the caller owns; the entry is removed only if it matches.</param>
internal void RemoveSubscription(string correlationId, CancellationTokenSource cts)
{
    // Key-and-value removal: succeeds only when both the ID and the stored CTS match.
    var ownedEntry = KeyValuePair.Create(correlationId, cts);
    _ = _subscriptions.TryRemove(ownedEntry);
}
|
||||
|
||||
/// <summary>
|
||||
/// Opens a server-streaming subscription for a specific instance.
|
||||
/// This is a long-running async method; the caller launches it as a background task.
|
||||
@@ -74,7 +100,7 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
||||
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
|
||||
|
||||
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_subscriptions[correlationId] = cts;
|
||||
RegisterSubscription(correlationId, cts);
|
||||
|
||||
var request = new InstanceStreamRequest
|
||||
{
|
||||
@@ -103,7 +129,8 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
||||
}
|
||||
finally
|
||||
{
|
||||
_subscriptions.TryRemove(correlationId, out _);
|
||||
// Remove only our own entry -- a racing reconnect may already own the slot.
|
||||
RemoveSubscription(correlationId, cts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user