fix(communication): resolve Communication-012..015 — endpoint-aware gRPC client cache, address-change recreation, correlation-id validation, node-flip tests

This commit is contained in:
Joseph Doherty
2026-05-17 03:18:17 -04:00
parent a78c3bcb6f
commit a768135237
7 changed files with 289 additions and 14 deletions

View File

@@ -20,6 +20,14 @@ public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
private readonly ILogger? _logger;
private readonly ConcurrentDictionary<string, CancellationTokenSource> _subscriptions = new();
/// <summary>
/// The gRPC endpoint (site node address) this client is bound to. The
/// <see cref="SiteStreamGrpcClientFactory"/> compares this against the requested
/// endpoint so a NodeA→NodeB failover flip (or a site address edit) is honoured
/// rather than served stale from cache.
/// </summary>
/// <remarks>
/// Get-only: the value is fixed at construction. It defaults to the empty string
/// for instances whose constructor does not assign it.
/// </remarks>
public virtual string Endpoint { get; } = string.Empty;
/// <summary>
/// The HTTP/2 keepalive ping delay actually applied to this client's channel.
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
@@ -44,6 +52,7 @@ public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
/// </summary>
public SiteStreamGrpcClient(string endpoint, ILogger logger, CommunicationOptions options)
{
Endpoint = endpoint;
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay;
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout;
_channel = GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
@@ -67,6 +76,16 @@ public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
{
}
/// <summary>
/// Test-support constructor: stores <paramref name="endpoint"/> in
/// <see cref="Endpoint"/> and does nothing else. No gRPC channel is opened,
/// which lets test doubles exercise the factory's endpoint-aware behaviour.
/// </summary>
/// <param name="endpoint">The endpoint value this instance should report.</param>
protected SiteStreamGrpcClient(string endpoint) => Endpoint = endpoint;
/// <summary>
/// Creates a test-only instance that has no gRPC channel. Used to test
/// Unsubscribe and Dispose behavior without needing a real endpoint.

View File

@@ -32,13 +32,40 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
}
/// <summary>
/// Returns the cached client for the site, or creates a new one via
/// <see cref="CreateClient"/>. If a client is already cached but bound to a
/// *different* <paramref name="grpcEndpoint"/> — the NodeA→NodeB failover flip, or a
/// site whose gRPC address was edited — the stale client is disposed and replaced
/// with one bound to the requested endpoint.
/// Communication-012/013: keying purely by site identifier and ignoring the
/// endpoint on a cache hit defeated debug-stream node failover and meant a
/// corrected gRPC address never took effect without a central restart.
/// </summary>
/// <param name="siteIdentifier">Cache key identifying the site.</param>
/// <param name="grpcEndpoint">Endpoint the returned client must be bound to.</param>
/// <returns>
/// The client stored in the cache for <paramref name="siteIdentifier"/> at the
/// moment this call completes its lookup, insert, or replacement.
/// </returns>
/// <remarks>
/// Endpoints are compared with <see cref="StringComparison.Ordinal"/>. A client
/// this call creates but fails to install (because a concurrent caller changed the
/// entry first) is disposed immediately rather than left to leak.
/// </remarks>
public virtual SiteStreamGrpcClient GetOrCreate(string siteIdentifier, string grpcEndpoint)
{
    // Explicit compare-and-swap loop instead of AddOrUpdate: AddOrUpdate may invoke
    // its factories more than once under contention, which would leave clients that
    // were created but never installed undisposed, and would make it ambiguous
    // which prior client this call actually replaced.
    while (true)
    {
        if (_clients.TryGetValue(siteIdentifier, out var existing))
        {
            // Fast path: a client is cached and already bound to the requested endpoint.
            if (string.Equals(existing.Endpoint, grpcEndpoint, StringComparison.Ordinal))
                return existing;

            // Cached client is bound to a different endpoint: swap it out, but only
            // if the entry is still the instance we just inspected.
            var replacement = CreateClient(grpcEndpoint);
            if (_clients.TryUpdate(siteIdentifier, replacement, existing))
            {
                // We won the swap, so we own disposal of the client we displaced.
                existing.Dispose();
                return replacement;
            }

            // Another caller changed the entry first; discard ours and re-evaluate.
            replacement.Dispose();
        }
        else
        {
            var created = CreateClient(grpcEndpoint);
            if (_clients.TryAdd(siteIdentifier, created))
                return created;

            // Another caller inserted first; discard ours and re-evaluate.
            created.Dispose();
        }
    }
}
/// <summary>
@@ -53,7 +80,11 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
}
/// <summary>
/// Removes and disposes the client for the given site. Site *address changes* are
/// now handled transparently by <see cref="GetOrCreate"/> (it disposes and recreates
/// a client whose endpoint no longer matches). This method remains the disposal
/// path for full site *removal* — call it when a site record is deleted so its
/// cached gRPC client does not linger for the life of the process.
/// </summary>
public async Task RemoveSiteAsync(string siteIdentifier)
{

View File

@@ -95,6 +95,18 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
if (!_ready)
throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server not ready"));
// Communication-014: correlation_id arrives off the wire on a public gRPC
// endpoint and is used (below) to compose an Akka actor name. Akka actor names
// have a restricted character set — an id containing '/', whitespace, or other
// disallowed characters would make ActorOf throw InvalidActorNameException,
// escaping as an unhandled RPC fault. Reject unsafe ids cleanly up front.
if (string.IsNullOrEmpty(request.CorrelationId) ||
!ActorPath.IsValidPathElement(request.CorrelationId))
{
throw new RpcException(new GrpcStatus(
StatusCode.InvalidArgument, "correlation_id is missing or not a valid identifier"));
}
// Duplicate prevention -- cancel existing stream for this correlationId
if (_activeStreams.TryRemove(request.CorrelationId, out var existingEntry))
{