feat(audit): production gRPC IPullAuditEventsClient for site reconciliation

This commit is contained in:
Joseph Doherty
2026-06-15 09:41:13 -04:00
parent 9aa1259504
commit 2adc5767da
3 changed files with 488 additions and 0 deletions
@@ -0,0 +1,257 @@
using System.Collections.Concurrent;
using Google.Protobuf.WellKnownTypes;
using Grpc.Core;
using Grpc.Net.Client;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Communication;
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest;
using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse;
using PullAuditEventsResponse = ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration.PullAuditEventsResponse;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Production <see cref="IPullAuditEventsClient"/> (Audit Log #23, M6) that the
/// central <see cref="SiteAuditReconciliationActor"/> uses to pull the next
/// reconciliation batch from a site over the <c>PullAuditEvents</c> unary gRPC
/// RPC served by <c>SiteStreamGrpcServer</c>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Endpoint resolution.</b> The actor passes only a <c>siteId</c>; this
/// client resolves it to a gRPC authority via <see cref="ISiteEnumerator"/>
/// (<see cref="SiteEntry.GrpcEndpoint"/>) on every call so a NodeA→NodeB
/// failover flip or an edited site address takes effect on the next tick — the
/// same liveness guarantee <c>SiteStreamGrpcClientFactory</c> gives the
/// real-time stream. A site with no registered endpoint yields an empty
/// response (no dial); reconciliation simply has nothing to pull from it.
/// </para>
/// <para>
/// <b>Fault tolerance.</b> Per the <see cref="IPullAuditEventsClient"/>
/// contract, tolerable transport faults (connection refused / site offline =
/// <see cref="StatusCode.Unavailable"/>, slow site = <see cref="StatusCode.DeadlineExceeded"/>,
/// shutdown = <see cref="StatusCode.Cancelled"/>, plus bare
/// <see cref="HttpRequestException"/> / <c>SocketException</c> before a gRPC
/// status is established) are caught and collapsed to an empty response — one
/// offline site must never sink the rest of the reconciliation tick. Any other
/// fault (e.g. a malformed reply that fails DTO mapping) is also swallowed to
/// empty: audit reconciliation is best-effort and a throw would only get
/// re-caught by the actor's own per-site guard.
/// </para>
/// <para>
/// <b>Testability.</b> The unary call is reached through the
/// <see cref="IPullAuditEventsInvoker"/> seam. Production binds
/// <see cref="GrpcPullAuditEventsInvoker"/> (one cached <see cref="GrpcChannel"/>
/// per endpoint, keepalive from <see cref="CommunicationOptions"/>); unit tests
/// inject a fake invoker so no real HTTP/2 endpoint is required.
/// </para>
/// </remarks>
public sealed class GrpcPullAuditEventsClient : IPullAuditEventsClient
{
// Collaborators captured once at construction; none of them may be null.
private readonly ISiteEnumerator _sites;
private readonly IPullAuditEventsInvoker _invoker;
private readonly ILogger<GrpcPullAuditEventsClient> _logger;

/// <summary>
/// Builds the client from the site enumerator used for endpoint lookup, the
/// invoker that performs the unary call, and a logger.
/// </summary>
/// <param name="sites">Source of the site list that a <c>siteId</c> is matched against to find its gRPC endpoint.</param>
/// <param name="invoker">Seam that performs the <c>PullAuditEvents</c> unary RPC against an already-resolved endpoint.</param>
/// <param name="logger">Receives the debug / warning entries written when a pull is skipped or a fault is swallowed.</param>
/// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
public GrpcPullAuditEventsClient(
    ISiteEnumerator sites,
    IPullAuditEventsInvoker invoker,
    ILogger<GrpcPullAuditEventsClient> logger)
{
    // Validate in declaration order so the first offending parameter is the one reported.
    ArgumentNullException.ThrowIfNull(sites);
    ArgumentNullException.ThrowIfNull(invoker);
    ArgumentNullException.ThrowIfNull(logger);

    _sites = sites;
    _invoker = invoker;
    _logger = logger;
}
/// <inheritdoc />
/// <remarks>
/// The RPC call and the mapping of its reply both run inside one guarded
/// region: any exception raised by either is logged and converted into the
/// shared empty response. Endpoint resolution and request construction run
/// before that region, so an exception thrown by the site enumerator is not
/// swallowed by this method.
/// </remarks>
public async Task<PullAuditEventsResponse> PullAsync(
    string siteId,
    DateTime sinceUtc,
    int batchSize,
    CancellationToken ct)
{
    var endpoint = await ResolveEndpointAsync(siteId, ct).ConfigureAwait(false);
    if (endpoint is null)
    {
        // No gRPC address registered for the site — absence of an address is
        // a configuration decision (mirrors ISiteEnumerator's own contract),
        // not a runtime error, so there is simply nothing to pull.
        _logger.LogDebug(
            "PullAuditEvents skipped: no gRPC endpoint registered for site {SiteId}.", siteId);
        return Empty;
    }

    var request = new ProtoPullRequest
    {
        // ReadPendingSinceAsync treats DateTime.MinValue as "from the start";
        // EnsureUtc keeps Timestamp.FromDateTime happy (it requires UTC kind).
        SinceUtc = Timestamp.FromDateTime(EnsureUtc(sinceUtc)),
        BatchSize = batchSize,
    };

    try
    {
        var reply = await _invoker.InvokeAsync(endpoint, request, ct).ConfigureAwait(false);

        // Map proto DTOs to canonical AuditEvent records and order oldest-first
        // (the wire is already ordered by the site queue, but the
        // IPullAuditEventsClient contract is explicit, so sort defensively).
        // This deliberately lives inside the try: a reply that cannot be
        // mapped must collapse to an empty batch like any other fault
        // instead of escaping to the caller.
        var events = reply.Events
            .Select(AuditEventDtoMapper.FromDto)
            .OrderBy(e => e.OccurredAtUtc)
            .ToList();
        return new PullAuditEventsResponse(events, reply.MoreAvailable);
    }
    catch (RpcException ex) when (IsTolerable(ex.StatusCode))
    {
        _logger.LogDebug(ex,
            "PullAuditEvents tolerable transport fault for site {SiteId} ({Endpoint}): {Status}. Returning empty batch.",
            siteId, endpoint, ex.StatusCode);
        return Empty;
    }
    catch (Exception ex) when (ex is HttpRequestException or System.Net.Sockets.SocketException)
    {
        _logger.LogDebug(ex,
            "PullAuditEvents connection-layer fault for site {SiteId} ({Endpoint}). Returning empty batch.",
            siteId, endpoint);
        return Empty;
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // Reconciliation tick was cancelled (host shutdown / scope dispose).
        return Empty;
    }
    catch (Exception ex)
    {
        // Everything else: an RpcException with a non-tolerable status, a
        // non-RpcException transport wrapper, or a reply that failed DTO
        // mapping above. Audit reconciliation is best-effort; swallow to
        // empty rather than throw — the actor's per-site guard would only
        // re-catch it. Logged at Warning so these stay visible.
        _logger.LogWarning(ex,
            "PullAuditEvents unexpected fault for site {SiteId} ({Endpoint}). Returning empty batch.",
            siteId, endpoint);
        return Empty;
    }
}
/// <summary>
/// Scans the enumerated sites for the first entry whose <c>SiteId</c> equals
/// <paramref name="siteId"/> (ordinal comparison) and whose
/// <c>GrpcEndpoint</c> is not blank.
/// </summary>
/// <param name="siteId">Identifier of the site to look up.</param>
/// <param name="ct">Cancellation token forwarded to the site enumerator.</param>
/// <returns>The matching entry's endpoint, or <see langword="null"/> when no entry qualifies.</returns>
private async Task<string?> ResolveEndpointAsync(string siteId, CancellationToken ct)
{
    foreach (var candidate in await _sites.EnumerateAsync(ct).ConfigureAwait(false))
    {
        var isRequestedSite = string.Equals(candidate.SiteId, siteId, StringComparison.Ordinal);

        // A matching entry with a blank endpoint does not end the search;
        // keep scanning in case a later entry carries a usable address.
        if (!isRequestedSite || string.IsNullOrWhiteSpace(candidate.GrpcEndpoint))
        {
            continue;
        }

        return candidate.GrpcEndpoint;
    }

    return null;
}
// Shared "nothing to pull" result (zero events, MoreAvailable = false) handed
// back on every skip and swallowed-fault path.
private static readonly PullAuditEventsResponse Empty = new PullAuditEventsResponse(
    Array.Empty<ZB.MOM.WW.Audit.AuditEvent>(),
    MoreAvailable: false);
/// <summary>
/// Tells whether a gRPC status is one of the transport outcomes that is
/// swallowed quietly: <see cref="StatusCode.Unavailable"/>,
/// <see cref="StatusCode.DeadlineExceeded"/> or <see cref="StatusCode.Cancelled"/>.
/// </summary>
/// <param name="code">Status carried by the <see cref="RpcException"/>.</param>
/// <returns><see langword="true"/> for the three statuses above; otherwise <see langword="false"/>.</returns>
private static bool IsTolerable(StatusCode code) => code switch
{
    StatusCode.Unavailable => true,
    StatusCode.DeadlineExceeded => true,
    StatusCode.Cancelled => true,
    _ => false,
};
/// <summary>
/// Returns <paramref name="value"/> as a <see cref="DateTimeKind.Utc"/>
/// instant.
/// </summary>
/// <remarks>
/// <see cref="DateTimeKind.Unspecified"/> values are stamped as UTC without
/// shifting the clock reading. <see cref="DateTime.ToUniversalTime"/> would
/// treat them as local time and move them by the host's UTC offset, which
/// would also displace <see cref="DateTime.MinValue"/> (itself Unspecified).
/// Only values explicitly marked <see cref="DateTimeKind.Local"/> are converted.
/// </remarks>
/// <param name="value">The timestamp to normalise.</param>
/// <returns>A <see cref="DateTime"/> whose <see cref="DateTime.Kind"/> is <see cref="DateTimeKind.Utc"/>.</returns>
private static DateTime EnsureUtc(DateTime value) => value.Kind switch
{
    DateTimeKind.Utc => value,
    DateTimeKind.Local => value.ToUniversalTime(),
    _ => DateTime.SpecifyKind(value, DateTimeKind.Utc),
};
/// <summary>
/// Seam over the <c>PullAuditEvents</c> unary gRPC call against a resolved
/// site endpoint. Extracted so <see cref="GrpcPullAuditEventsClient"/> can
/// be unit-tested without a real <see cref="GrpcChannel"/>. Production binds
/// <see cref="GrpcPullAuditEventsInvoker"/>.
/// </summary>
/// <remarks>
/// This is a nested type, so implementers and DI registrations refer to it as
/// <c>GrpcPullAuditEventsClient.IPullAuditEventsInvoker</c>.
/// </remarks>
public interface IPullAuditEventsInvoker
{
/// <summary>
/// Issues the <c>PullAuditEvents</c> unary RPC against <paramref name="endpoint"/>.
/// May throw <see cref="RpcException"/> / <see cref="HttpRequestException"/>
/// on transport faults — the caller classifies and swallows tolerable ones.
/// </summary>
/// <param name="endpoint">The site gRPC authority (e.g. <c>http://site-a:8083</c>).</param>
/// <param name="request">The wire-format pull request.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>The wire-format pull response.</returns>
/// <exception cref="RpcException">An implementation may surface a failed call this way.</exception>
/// <exception cref="HttpRequestException">An implementation may surface a connection-layer failure this way.</exception>
Task<ProtoPullResponse> InvokeAsync(string endpoint, ProtoPullRequest request, CancellationToken ct);
}
}
/// <summary>
/// Production <see cref="GrpcPullAuditEventsClient.IPullAuditEventsInvoker"/>:
/// caches one <see cref="GrpcChannel"/> per endpoint (keepalive from
/// <see cref="CommunicationOptions"/>, mirroring <c>SiteStreamGrpcClient</c>)
/// and issues the unary <c>PullAuditEventsAsync</c> call.
/// </summary>
/// <remarks>
/// <para>
/// The cache is keyed by the endpoint string alone. When a site's address
/// changes (NodeA→NodeB failover / address edit) the caller passes the new
/// endpoint and a channel is opened for it on first use; the channel for the
/// previous endpoint is NOT evicted and stays cached until
/// <see cref="Dispose"/> runs.
/// </para>
/// <para>
/// Each channel owns the <see cref="SocketsHttpHandler"/> created for it:
/// <c>DisposeHttpClient</c> is enabled so disposing the channel also disposes
/// the handler.
/// </para>
/// </remarks>
public sealed class GrpcPullAuditEventsInvoker
    : GrpcPullAuditEventsClient.IPullAuditEventsInvoker, IDisposable
{
    // Endpoint string -> channel. Ordinal comparison: endpoints are treated as opaque keys.
    private readonly ConcurrentDictionary<string, GrpcChannel> _channels = new(StringComparer.Ordinal);
    private readonly CommunicationOptions _options;

    /// <summary>
    /// Creates the invoker using default <see cref="CommunicationOptions"/>.
    /// </summary>
    public GrpcPullAuditEventsInvoker()
        : this(new CommunicationOptions())
    {
    }

    /// <summary>
    /// Creates the invoker, applying the configured gRPC keepalive settings to
    /// every channel it opens.
    /// </summary>
    /// <param name="options">Communication options supplying gRPC keepalive timings.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <see langword="null"/>.</exception>
    public GrpcPullAuditEventsInvoker(CommunicationOptions options)
    {
        _options = options ?? throw new ArgumentNullException(nameof(options));
    }

    /// <inheritdoc />
    public async Task<ProtoPullResponse> InvokeAsync(
        string endpoint, ProtoPullRequest request, CancellationToken ct)
    {
        // Reuse the cached channel for this endpoint, opening one on first use.
        var channel = _channels.GetOrAdd(endpoint, CreateChannel);
        var client = new SiteStreamService.SiteStreamServiceClient(channel);
        using var call = client.PullAuditEventsAsync(request, cancellationToken: ct);
        return await call.ResponseAsync.ConfigureAwait(false);
    }

    // Builds a channel for the endpoint with the configured HTTP/2 keepalive pings.
    private GrpcChannel CreateChannel(string endpoint) =>
        GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
        {
            HttpHandler = new SocketsHttpHandler
            {
                KeepAlivePingDelay = _options.GrpcKeepAlivePingDelay,
                KeepAlivePingTimeout = _options.GrpcKeepAlivePingTimeout,
                KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always,
            },
            // A caller-supplied handler is left undisposed by default; this
            // handler is private to the channel, so tie its lifetime to it.
            DisposeHttpClient = true,
        });

    /// <summary>Disposes all cached channels.</summary>
    public void Dispose()
    {
        // Remove-then-dispose one key at a time so a channel added while this
        // loop runs is either drained here or left in the cache, never
        // dropped without being disposed.
        foreach (var endpoint in _channels.Keys)
        {
            if (_channels.TryRemove(endpoint, out var channel))
            {
                channel.Dispose();
            }
        }
    }
}
@@ -362,4 +362,69 @@ public static class ServiceCollectionExtensions
return services;
}
/// <summary>
/// Audit Log (#23) M6 — central-only registration of the production
/// <see cref="IPullAuditEventsClient"/> (<see cref="GrpcPullAuditEventsClient"/>)
/// and its unary-call invoker (<see cref="GrpcPullAuditEventsInvoker"/>) used
/// by <see cref="SiteAuditReconciliationActor"/> to pull reconciliation
/// batches from each site over the <c>PullAuditEvents</c> gRPC RPC.
/// </summary>
/// <remarks>
/// <para>
/// Kept out of <see cref="AddAuditLog"/> — which also runs on site
/// composition roots — because the client dials sites and resolves
/// <see cref="ISiteEnumerator"/> (a central-only collaborator wired
/// alongside the reconciliation singleton). Folding it into
/// <see cref="AddAuditLog"/> would register a site-dialing client on every
/// site host, violating the "every <c>Add*</c> call is safe from any
/// composition root" invariant. This helper is the central analogue of
/// <see cref="AddAuditLogCentralMaintenance"/>.
/// </para>
/// <para>
/// The <see cref="GrpcPullAuditEventsInvoker"/> binds with default
/// <see cref="ZB.MOM.WW.ScadaBridge.Communication.CommunicationOptions"/>
/// keepalive unless an <c>IOptions&lt;CommunicationOptions&gt;</c> is
/// already registered, in which case the configured timings flow through —
/// matching how <c>SiteStreamGrpcClientFactory</c> takes its keepalive from
/// the same options.
/// </para>
/// <para>
/// <see cref="ISiteEnumerator"/> is NOT registered here: its production
/// implementation (wrapping <c>ISiteRepository</c>) ships with the
/// reconciliation-singleton wiring in the Host. The client resolves the
/// enumerator lazily at actor-construction time, so this binding is safe to
/// issue before the enumerator binding lands.
/// </para>
/// </remarks>
/// <param name="services">The service collection to register into.</param>
/// <returns>The same <see cref="IServiceCollection"/> for chaining.</returns>
public static IServiceCollection AddAuditLogCentralReconciliationClient(
    this IServiceCollection services)
{
    ArgumentNullException.ThrowIfNull(services);

    // Concrete invoker: a singleton because it holds the per-endpoint
    // GrpcChannel cache — one invoker per resolution would leak channels.
    // Uses the registered CommunicationOptions when the container can supply
    // them; otherwise the invoker's own defaults, so this helper works standalone.
    services.TryAddSingleton<GrpcPullAuditEventsInvoker>(sp =>
    {
        var configured = sp.GetService<
            Microsoft.Extensions.Options.IOptions<
                ZB.MOM.WW.ScadaBridge.Communication.CommunicationOptions>>();

        if (configured is null)
        {
            return new GrpcPullAuditEventsInvoker();
        }

        return new GrpcPullAuditEventsInvoker(configured.Value);
    });

    // The seam interface forwards to the same singleton instance.
    services.TryAddSingleton<GrpcPullAuditEventsClient.IPullAuditEventsInvoker>(
        sp => sp.GetRequiredService<GrpcPullAuditEventsInvoker>());

    // Dependencies are resolved in constructor-parameter order when the
    // client is first requested, not at registration time.
    services.TryAddSingleton<IPullAuditEventsClient>(sp =>
    {
        var sites = sp.GetRequiredService<ISiteEnumerator>();
        var invoker = sp.GetRequiredService<GrpcPullAuditEventsClient.IPullAuditEventsInvoker>();
        var logger = sp.GetRequiredService<ILogger<GrpcPullAuditEventsClient>>();
        return new GrpcPullAuditEventsClient(sites, invoker, logger);
    });

    return services;
}
}