// Source: ScadaBridge/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs

using System.Collections.Concurrent;
using Google.Protobuf.WellKnownTypes;
using Grpc.Core;
using Grpc.Net.Client;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Communication;
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest;
using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse;
using PullAuditEventsResponse = ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration.PullAuditEventsResponse;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Production <see cref="IPullAuditEventsClient"/> (Audit Log #23, M6) that the
/// central <see cref="SiteAuditReconciliationActor"/> uses to pull the next
/// reconciliation batch from a site over the <c>PullAuditEvents</c> unary gRPC
/// RPC served by <c>SiteStreamGrpcServer</c>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Endpoint resolution.</b> The actor passes only a <c>siteId</c>; this
/// client resolves it to a gRPC authority via <see cref="ISiteEnumerator"/>
/// (<see cref="SiteEntry.GrpcEndpoint"/>) on every call so a NodeA→NodeB
/// failover flip or an edited site address takes effect on the next tick — the
/// same liveness guarantee <c>SiteStreamGrpcClientFactory</c> gives the
/// real-time stream. A site with no registered endpoint yields an empty
/// response (no dial); reconciliation simply has nothing to pull from it.
/// A fault thrown by the enumerator itself is not site-specific and is left
/// to propagate to the caller.
/// </para>
/// <para>
/// <b>Fault tolerance.</b> Per the <see cref="IPullAuditEventsClient"/>
/// contract, tolerable transport faults (connection refused / site offline =
/// <see cref="StatusCode.Unavailable"/>, slow site = <see cref="StatusCode.DeadlineExceeded"/>,
/// shutdown = <see cref="StatusCode.Cancelled"/>, plus bare
/// <see cref="HttpRequestException"/> / <c>SocketException</c> before a gRPC
/// status is established) are caught and collapsed to an empty response — one
/// offline site must never sink the rest of the reconciliation tick. Any other
/// fault raised by the call or by mapping its reply (e.g. a malformed reply
/// that fails DTO mapping) is also swallowed to empty and logged at warning
/// level: audit reconciliation is best-effort and a throw would only get
/// re-caught by the actor's own per-site guard.
/// </para>
/// <para>
/// <b>Testability.</b> The unary call is reached through the
/// <see cref="IPullAuditEventsInvoker"/> seam. Production binds
/// <see cref="GrpcPullAuditEventsInvoker"/> (one cached <see cref="GrpcChannel"/>
/// per endpoint, keepalive from <see cref="CommunicationOptions"/>); unit tests
/// inject a fake invoker so no real HTTP/2 endpoint is required.
/// </para>
/// </remarks>
public sealed class GrpcPullAuditEventsClient : IPullAuditEventsClient
{
    /// <summary>Shared "nothing to pull" result returned on every skipped or faulted call.</summary>
    private static readonly PullAuditEventsResponse Empty =
        new(Array.Empty<ZB.MOM.WW.Audit.AuditEvent>(), MoreAvailable: false);

    private readonly ISiteEnumerator _sites;
    private readonly IPullAuditEventsInvoker _invoker;
    private readonly ILogger<GrpcPullAuditEventsClient> _logger;

    /// <summary>
    /// Creates the client over the given site enumerator and unary-call invoker.
    /// </summary>
    /// <param name="sites">Resolves a <c>siteId</c> to its gRPC endpoint.</param>
    /// <param name="invoker">Seam that issues the <c>PullAuditEvents</c> unary RPC against a resolved endpoint.</param>
    /// <param name="logger">Logger for transport-fault diagnostics.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public GrpcPullAuditEventsClient(
        ISiteEnumerator sites,
        IPullAuditEventsInvoker invoker,
        ILogger<GrpcPullAuditEventsClient> logger)
    {
        _sites = sites ?? throw new ArgumentNullException(nameof(sites));
        _invoker = invoker ?? throw new ArgumentNullException(nameof(invoker));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public async Task<PullAuditEventsResponse> PullAsync(
        string siteId,
        DateTime sinceUtc,
        int batchSize,
        CancellationToken ct)
    {
        var endpoint = await ResolveEndpointAsync(siteId, ct).ConfigureAwait(false);
        if (endpoint is null)
        {
            // No gRPC address registered for the site — absence of an address is
            // a configuration decision (mirrors ISiteEnumerator's own contract),
            // not a runtime error, so there is simply nothing to pull.
            _logger.LogDebug(
                "PullAuditEvents skipped: no gRPC endpoint registered for site {SiteId}.", siteId);
            return Empty;
        }

        var request = new ProtoPullRequest
        {
            // ReadPendingSinceAsync treats DateTime.MinValue as "from the start";
            // EnsureUtc keeps Timestamp.FromDateTime happy (it requires UTC kind)
            // without shifting that sentinel by the local UTC offset.
            SinceUtc = Timestamp.FromDateTime(EnsureUtc(sinceUtc)),
            BatchSize = batchSize,
        };

        try
        {
            var reply = await _invoker.InvokeAsync(endpoint, request, ct).ConfigureAwait(false);

            // Map proto DTOs to canonical AuditEvent records and order oldest-first
            // (the wire is already ordered by the site queue, but the
            // IPullAuditEventsClient contract is explicit, so sort defensively).
            // The mapping is deliberately inside the guarded region so that a
            // malformed reply is swallowed to empty as the class contract states.
            var events = reply.Events
                .Select(AuditEventDtoMapper.FromDto)
                .OrderBy(e => e.OccurredAtUtc)
                .ToList();
            return new PullAuditEventsResponse(events, reply.MoreAvailable);
        }
        catch (RpcException ex) when (IsTolerable(ex.StatusCode))
        {
            _logger.LogDebug(ex,
                "PullAuditEvents tolerable transport fault for site {SiteId} ({Endpoint}): {Status}. Returning empty batch.",
                siteId, endpoint, ex.StatusCode);
            return Empty;
        }
        catch (Exception ex) when (ex is HttpRequestException or System.Net.Sockets.SocketException)
        {
            _logger.LogDebug(ex,
                "PullAuditEvents connection-layer fault for site {SiteId} ({Endpoint}). Returning empty batch.",
                siteId, endpoint);
            return Empty;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Reconciliation tick was cancelled (host shutdown / scope dispose).
            return Empty;
        }
        catch (Exception ex)
        {
            // Any other fault: a non-tolerable RpcException status, a
            // non-RpcException transport fault wrapper, or a reply that fails
            // DTO mapping. Audit reconciliation is best-effort; swallow to empty
            // rather than throw — the actor's per-site guard would only re-catch it.
            _logger.LogWarning(ex,
                "PullAuditEvents unexpected fault for site {SiteId} ({Endpoint}). Returning empty batch.",
                siteId, endpoint);
            return Empty;
        }
    }

    /// <summary>
    /// Looks up the gRPC endpoint registered for <paramref name="siteId"/>.
    /// </summary>
    /// <param name="siteId">Site identifier, matched with ordinal (case-sensitive) comparison.</param>
    /// <param name="ct">Cancellation token forwarded to the site enumerator.</param>
    /// <returns>
    /// The first non-blank endpoint of a matching site, or <c>null</c> when no
    /// site matches or every match has a null/whitespace endpoint.
    /// </returns>
    private async Task<string?> ResolveEndpointAsync(string siteId, CancellationToken ct)
    {
        var sites = await _sites.EnumerateAsync(ct).ConfigureAwait(false);
        foreach (var site in sites)
        {
            if (string.Equals(site.SiteId, siteId, StringComparison.Ordinal) &&
                !string.IsNullOrWhiteSpace(site.GrpcEndpoint))
            {
                return site.GrpcEndpoint;
            }
        }

        return null;
    }

    /// <summary>
    /// Returns <c>true</c> for the gRPC status codes that are treated as an
    /// expected, non-alarming transport outcome and logged at debug level.
    /// </summary>
    private static bool IsTolerable(StatusCode code) => code is
        StatusCode.Unavailable or
        StatusCode.DeadlineExceeded or
        StatusCode.Cancelled;

    /// <summary>
    /// Normalises <paramref name="value"/> to <see cref="DateTimeKind.Utc"/>.
    /// </summary>
    /// <remarks>
    /// A <see cref="DateTimeKind.Local"/> value is converted to UTC. A
    /// <see cref="DateTimeKind.Unspecified"/> value (which includes
    /// <see cref="DateTime.MinValue"/>) is stamped as UTC without changing its
    /// ticks: the caller's parameter is named <c>sinceUtc</c>, and running it
    /// through <see cref="DateTime.ToUniversalTime"/> would treat it as local
    /// time and shift it by the host's UTC offset.
    /// </remarks>
    private static DateTime EnsureUtc(DateTime value) => value.Kind switch
    {
        DateTimeKind.Utc => value,
        DateTimeKind.Local => value.ToUniversalTime(),
        _ => DateTime.SpecifyKind(value, DateTimeKind.Utc),
    };

    /// <summary>
    /// Seam over the <c>PullAuditEvents</c> unary gRPC call against a resolved
    /// site endpoint. Extracted so <see cref="GrpcPullAuditEventsClient"/> can
    /// be unit-tested without a real <see cref="GrpcChannel"/>. Production binds
    /// <see cref="GrpcPullAuditEventsInvoker"/>.
    /// </summary>
    public interface IPullAuditEventsInvoker
    {
        /// <summary>
        /// Issues the <c>PullAuditEvents</c> unary RPC against <paramref name="endpoint"/>.
        /// May throw <see cref="RpcException"/> / <see cref="HttpRequestException"/>
        /// on transport faults — the caller classifies and swallows tolerable ones.
        /// </summary>
        /// <param name="endpoint">The site gRPC authority (e.g. <c>http://site-a:8083</c>).</param>
        /// <param name="request">The wire-format pull request.</param>
        /// <param name="ct">Cancellation token.</param>
        /// <returns>The wire-format pull response.</returns>
        Task<ProtoPullResponse> InvokeAsync(string endpoint, ProtoPullRequest request, CancellationToken ct);
    }
}
/// <summary>
/// Production <see cref="GrpcPullAuditEventsClient.IPullAuditEventsInvoker"/>:
/// caches one <see cref="GrpcChannel"/> per distinct endpoint string (keepalive
/// from <see cref="CommunicationOptions"/>, mirroring <c>SiteStreamGrpcClient</c>)
/// and issues the unary <c>PullAuditEventsAsync</c> call.
/// </summary>
/// <remarks>
/// The cache is keyed by the endpoint string, so a re-keyed endpoint
/// (NodeA→NodeB failover / address edit) gets a fresh channel on its first use.
/// The channel for the previous address is not evicted; it stays cached until
/// <see cref="Dispose"/> is called.
/// </remarks>
public sealed class GrpcPullAuditEventsInvoker
    : GrpcPullAuditEventsClient.IPullAuditEventsInvoker, IDisposable
{
    private readonly ConcurrentDictionary<string, GrpcChannel> _channels = new(StringComparer.Ordinal);
    private readonly CommunicationOptions _options;

    /// <summary>
    /// Creates the invoker using default <see cref="CommunicationOptions"/>.
    /// </summary>
    public GrpcPullAuditEventsInvoker()
        : this(new CommunicationOptions())
    {
    }

    /// <summary>
    /// Creates the invoker, applying the configured gRPC keepalive settings to
    /// every channel it opens.
    /// </summary>
    /// <param name="options">Communication options supplying gRPC keepalive timings.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
    public GrpcPullAuditEventsInvoker(CommunicationOptions options)
    {
        _options = options ?? throw new ArgumentNullException(nameof(options));
    }

    /// <inheritdoc />
    public async Task<ProtoPullResponse> InvokeAsync(
        string endpoint, ProtoPullRequest request, CancellationToken ct)
    {
        var channel = GetOrCreateChannel(endpoint);
        var client = new SiteStreamService.SiteStreamServiceClient(channel);
        using var call = client.PullAuditEventsAsync(request, cancellationToken: ct);
        return await call.ResponseAsync.ConfigureAwait(false);
    }

    /// <summary>
    /// Returns the cached channel for <paramref name="endpoint"/>, creating and
    /// publishing one on first use.
    /// </summary>
    /// <remarks>
    /// <c>ConcurrentDictionary.GetOrAdd</c> with a value factory may run the
    /// factory more than once when callers race, which would orphan an
    /// undisposed channel. The channel is therefore created up front, and the
    /// instance that loses the publication race is disposed here.
    /// </remarks>
    private GrpcChannel GetOrCreateChannel(string endpoint)
    {
        if (_channels.TryGetValue(endpoint, out var cached))
        {
            return cached;
        }

        var created = CreateChannel(endpoint);
        var published = _channels.GetOrAdd(endpoint, created);
        if (!ReferenceEquals(published, created))
        {
            created.Dispose();
        }

        return published;
    }

    /// <summary>
    /// Builds a channel for <paramref name="endpoint"/> over a dedicated
    /// <see cref="SocketsHttpHandler"/> carrying the configured keepalive timings.
    /// </summary>
    private GrpcChannel CreateChannel(string endpoint) =>
        GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
        {
            HttpHandler = new SocketsHttpHandler
            {
                KeepAlivePingDelay = _options.GrpcKeepAlivePingDelay,
                KeepAlivePingTimeout = _options.GrpcKeepAlivePingTimeout,
                KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always,
            },
            // The handler is owned exclusively by this channel. Without this
            // flag a caller-supplied handler is left undisposed when the
            // channel is disposed, leaking its pooled connections.
            DisposeHttpClient = true,
        });

    /// <summary>Disposes all cached channels and empties the cache.</summary>
    public void Dispose()
    {
        // Remove-then-dispose per key so a channel is only dropped from the
        // cache once this method actually holds it (enumerate-then-Clear could
        // discard a channel added in between without disposing it).
        foreach (var endpoint in _channels.Keys)
        {
            if (_channels.TryRemove(endpoint, out var channel))
            {
                channel.Dispose();
            }
        }
    }
}