From 2adc5767da550ffec6db037d420b782e8df9ba38 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 09:41:13 -0400 Subject: [PATCH 01/14] feat(audit): production gRPC IPullAuditEventsClient for site reconciliation --- .../Central/GrpcPullAuditEventsClient.cs | 257 ++++++++++++++++++ .../ServiceCollectionExtensions.cs | 65 +++++ .../Central/GrpcPullAuditEventsClientTests.cs | 166 +++++++++++ 3 files changed, 488 insertions(+) create mode 100644 src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs create mode 100644 tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs new file mode 100644 index 00000000..f836945b --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs @@ -0,0 +1,257 @@ +using System.Collections.Concurrent; +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Grpc.Net.Client; +using Microsoft.Extensions.Logging; +using ZB.MOM.WW.ScadaBridge.Communication; +using ZB.MOM.WW.ScadaBridge.Communication.Grpc; +using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest; +using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse; +using PullAuditEventsResponse = ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration.PullAuditEventsResponse; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; + +/// +/// Production (Audit Log #23, M6) that the +/// central uses to pull the next +/// reconciliation batch from a site over the PullAuditEvents unary gRPC +/// RPC served by SiteStreamGrpcServer. +/// +/// +/// +/// Endpoint resolution. 
The actor passes only a siteId; this +/// client resolves it to a gRPC authority via +/// () on every call so a NodeA→NodeB +/// failover flip or an edited site address takes effect on the next tick — the +/// same liveness guarantee SiteStreamGrpcClientFactory gives the +/// real-time stream. A site with no registered endpoint yields an empty +/// response (no dial); reconciliation simply has nothing to pull from it. +/// +/// +/// Fault tolerance. Per the +/// contract, tolerable transport faults (connection refused / site offline = +/// , slow site = , +/// shutdown = , plus bare +/// / SocketException before a gRPC +/// status is established) are caught and collapsed to an empty response — one +/// offline site must never sink the rest of the reconciliation tick. Any other +/// fault (e.g. a malformed reply that fails DTO mapping) is also swallowed to +/// empty: audit reconciliation is best-effort and a throw would only get +/// re-caught by the actor's own per-site guard. +/// +/// +/// Testability. The unary call is reached through the +/// seam. Production binds +/// (one cached +/// per endpoint, keepalive from ); unit tests +/// inject a fake invoker so no real HTTP/2 endpoint is required. +/// +/// +public sealed class GrpcPullAuditEventsClient : IPullAuditEventsClient +{ + private readonly ISiteEnumerator _sites; + private readonly IPullAuditEventsInvoker _invoker; + private readonly ILogger _logger; + + /// + /// Creates the client over the given site enumerator and unary-call invoker. + /// + /// Resolves a siteId to its gRPC endpoint. + /// Seam that issues the PullAuditEvents unary RPC against a resolved endpoint. + /// Logger for transport-fault diagnostics. + public GrpcPullAuditEventsClient( + ISiteEnumerator sites, + IPullAuditEventsInvoker invoker, + ILogger logger) + { + _sites = sites ?? throw new ArgumentNullException(nameof(sites)); + _invoker = invoker ?? throw new ArgumentNullException(nameof(invoker)); + _logger = logger ?? 
throw new ArgumentNullException(nameof(logger)); + } + + /// + public async Task PullAsync( + string siteId, + DateTime sinceUtc, + int batchSize, + CancellationToken ct) + { + var endpoint = await ResolveEndpointAsync(siteId, ct).ConfigureAwait(false); + if (endpoint is null) + { + // No gRPC address registered for the site — absence of an address is + // a configuration decision (mirrors ISiteEnumerator's own contract), + // not a runtime error, so there is simply nothing to pull. + _logger.LogDebug( + "PullAuditEvents skipped: no gRPC endpoint registered for site {SiteId}.", siteId); + return Empty; + } + + var request = new ProtoPullRequest + { + // ReadPendingSinceAsync treats DateTime.MinValue as "from the start"; + // EnsureUtc keeps Timestamp.FromDateTime happy (it requires UTC kind). + SinceUtc = Timestamp.FromDateTime(EnsureUtc(sinceUtc)), + BatchSize = batchSize, + }; + + ProtoPullResponse reply; + try + { + reply = await _invoker.InvokeAsync(endpoint, request, ct).ConfigureAwait(false); + } + catch (RpcException ex) when (IsTolerable(ex.StatusCode)) + { + _logger.LogDebug(ex, + "PullAuditEvents tolerable transport fault for site {SiteId} ({Endpoint}): {Status}. Returning empty batch.", + siteId, endpoint, ex.StatusCode); + return Empty; + } + catch (Exception ex) when (ex is HttpRequestException or System.Net.Sockets.SocketException) + { + _logger.LogDebug(ex, + "PullAuditEvents connection-layer fault for site {SiteId} ({Endpoint}). Returning empty batch.", + siteId, endpoint); + return Empty; + } + catch (OperationCanceledException) when (ct.IsCancellationRequested) + { + // Reconciliation tick was cancelled (host shutdown / scope dispose). + return Empty; + } + catch (Exception ex) + { + // Any other fault (e.g. a malformed reply that fails DTO mapping + // below would actually surface here only if mapping moved inline, + // but a non-RpcException transport fault wrapper lands here too). 
+ // Audit reconciliation is best-effort; swallow to empty rather than + // throw — the actor's per-site guard would only re-catch it. + _logger.LogWarning(ex, + "PullAuditEvents unexpected fault for site {SiteId} ({Endpoint}). Returning empty batch.", + siteId, endpoint); + return Empty; + } + + // Map proto DTOs to canonical AuditEvent records and order oldest-first + // (the wire is already ordered by the site queue, but the + // IPullAuditEventsClient contract is explicit, so sort defensively). + var events = reply.Events + .Select(AuditEventDtoMapper.FromDto) + .OrderBy(e => e.OccurredAtUtc) + .ToList(); + + return new PullAuditEventsResponse(events, reply.MoreAvailable); + } + + private async Task ResolveEndpointAsync(string siteId, CancellationToken ct) + { + var sites = await _sites.EnumerateAsync(ct).ConfigureAwait(false); + foreach (var site in sites) + { + if (string.Equals(site.SiteId, siteId, StringComparison.Ordinal) && + !string.IsNullOrWhiteSpace(site.GrpcEndpoint)) + { + return site.GrpcEndpoint; + } + } + return null; + } + + private static readonly PullAuditEventsResponse Empty = + new(Array.Empty(), MoreAvailable: false); + + private static bool IsTolerable(StatusCode code) => code is + StatusCode.Unavailable or + StatusCode.DeadlineExceeded or + StatusCode.Cancelled; + + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc + ? value + : DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc); + + /// + /// Seam over the PullAuditEvents unary gRPC call against a resolved + /// site endpoint. Extracted so can + /// be unit-tested without a real . Production binds + /// . + /// + public interface IPullAuditEventsInvoker + { + /// + /// Issues the PullAuditEvents unary RPC against . + /// May throw / + /// on transport faults — the caller classifies and swallows tolerable ones. + /// + /// The site gRPC authority (e.g. http://site-a:8083). + /// The wire-format pull request. + /// Cancellation token. 
+ /// The wire-format pull response. + Task InvokeAsync(string endpoint, ProtoPullRequest request, CancellationToken ct); + } +} + +/// +/// Production : +/// caches one per endpoint (keepalive from +/// , mirroring SiteStreamGrpcClient) +/// and issues the unary PullAuditEventsAsync call. The cache flushes a +/// stale channel when an endpoint is re-keyed (NodeA→NodeB failover / address +/// edit), the same liveness guarantee SiteStreamGrpcClientFactory gives +/// the streaming client. +/// +public sealed class GrpcPullAuditEventsInvoker + : GrpcPullAuditEventsClient.IPullAuditEventsInvoker, IDisposable +{ + private readonly ConcurrentDictionary _channels = new(StringComparer.Ordinal); + private readonly CommunicationOptions _options; + + /// + /// Creates the invoker using default . + /// + public GrpcPullAuditEventsInvoker() + : this(new CommunicationOptions()) + { + } + + /// + /// Creates the invoker, applying the configured gRPC keepalive settings to + /// every channel it opens. + /// + /// Communication options supplying gRPC keepalive timings. + public GrpcPullAuditEventsInvoker(CommunicationOptions options) + { + _options = options ?? throw new ArgumentNullException(nameof(options)); + } + + /// + public async Task InvokeAsync( + string endpoint, ProtoPullRequest request, CancellationToken ct) + { + var channel = _channels.GetOrAdd(endpoint, CreateChannel); + var client = new SiteStreamService.SiteStreamServiceClient(channel); + using var call = client.PullAuditEventsAsync(request, cancellationToken: ct); + return await call.ResponseAsync.ConfigureAwait(false); + } + + private GrpcChannel CreateChannel(string endpoint) => + GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions + { + HttpHandler = new SocketsHttpHandler + { + KeepAlivePingDelay = _options.GrpcKeepAlivePingDelay, + KeepAlivePingTimeout = _options.GrpcKeepAlivePingTimeout, + KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always, + }, + }); + + /// Disposes all cached channels. 
+ public void Dispose() + { + foreach (var channel in _channels.Values) + { + channel.Dispose(); + } + _channels.Clear(); + } +} diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs index 6b1e0255..764ee91a 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs @@ -362,4 +362,69 @@ public static class ServiceCollectionExtensions return services; } + + /// + /// Audit Log (#23) M6 — central-only registration of the production + /// () + /// and its unary-call invoker () used + /// by to pull reconciliation + /// batches from each site over the PullAuditEvents gRPC RPC. + /// + /// + /// + /// Kept out of — which also runs on site + /// composition roots — because the client dials sites and resolves + /// (a central-only collaborator wired + /// alongside the reconciliation singleton). Folding it into + /// would register a site-dialing client on every + /// site host, violating the "every Add* call is safe from any + /// composition root" invariant. This helper is the central analogue of + /// . + /// + /// + /// The binds with default + /// + /// keepalive unless an IOptions<CommunicationOptions> is + /// already registered, in which case the configured timings flow through — + /// matching how SiteStreamGrpcClientFactory takes its keepalive from + /// the same options. + /// + /// + /// is NOT registered here: its production + /// implementation (wrapping ISiteRepository) ships with the + /// reconciliation-singleton wiring in the Host. The client resolves the + /// enumerator lazily at actor-construction time, so this binding is safe to + /// issue before the enumerator binding lands. + /// + /// + /// The service collection to register into. + /// The same for chaining. 
+ public static IServiceCollection AddAuditLogCentralReconciliationClient( + this IServiceCollection services) + { + ArgumentNullException.ThrowIfNull(services); + + // The invoker owns the per-endpoint GrpcChannel cache, so it must be a + // singleton — a fresh invoker per resolution would leak channels. + // Resolve CommunicationOptions if present (the central Host binds it), + // otherwise fall back to defaults so this helper stays standalone. + services.TryAddSingleton(sp => + { + var options = sp + .GetService>(); + return options is null + ? new GrpcPullAuditEventsInvoker() + : new GrpcPullAuditEventsInvoker(options.Value); + }); + services.TryAddSingleton( + sp => sp.GetRequiredService()); + + services.TryAddSingleton(sp => new GrpcPullAuditEventsClient( + sp.GetRequiredService(), + sp.GetRequiredService(), + sp.GetRequiredService>())); + + return services; + } } diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs new file mode 100644 index 00000000..c6a42b7d --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs @@ -0,0 +1,166 @@ +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.Audit; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Communication.Grpc; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; +using Google.Protobuf.WellKnownTypes; +using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest; +using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Tests.Central; + +/// +/// Bundle (M6) tests for — the +/// production that dials a site over gRPC +/// and issues the PullAuditEvents unary RPC for the reconciliation loop. 
+/// The real GrpcChannel is replaced by an injected +/// seam so the +/// client's mapping / ordering / fault-swallowing behaviour can be asserted +/// without standing up a Kestrel HTTP/2 endpoint. +/// +public class GrpcPullAuditEventsClientTests +{ + private static readonly DateTime BaseTime = + new(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + + /// Static enumerator returning a fixed site→endpoint map. + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + /// + /// Test invoker: records the endpoint + request it was asked to dial, then + /// returns a scripted proto response (or throws a scripted exception so the + /// fault-swallowing path can be exercised). + /// + private sealed class FakeInvoker : GrpcPullAuditEventsClient.IPullAuditEventsInvoker + { + public string? Endpoint { get; private set; } + public ProtoPullRequest? Request { get; private set; } + public int CallCount { get; private set; } + + private readonly ProtoPullResponse? _response; + private readonly Exception? _throw; + + private FakeInvoker(ProtoPullResponse? response, Exception? 
toThrow) + { + _response = response; + _throw = toThrow; + } + + public static FakeInvoker Returning(ProtoPullResponse response) => new(response, null); + public static FakeInvoker Throwing(Exception ex) => new(null, ex); + + public Task InvokeAsync( + string endpoint, ProtoPullRequest request, CancellationToken ct) + { + CallCount++; + Endpoint = endpoint; + Request = request; + if (_throw is not null) + { + throw _throw; + } + return Task.FromResult(_response!); + } + } + + private static AuditEventDto Dto(Guid id, DateTime occurredAtUtc) => + AuditEventDtoMapper.ToDto(ScadaBridgeAuditEventFactory.Create( + eventId: id, + occurredAtUtc: occurredAtUtc, + channel: AuditChannel.ApiOutbound, + kind: AuditKind.ApiCall, + status: AuditStatus.Delivered, + sourceSiteId: "site-a")); + + [Fact] + public async Task PullAsync_dials_the_resolved_endpoint_and_maps_events_oldest_first() + { + var older = Guid.NewGuid(); + var newer = Guid.NewGuid(); + + // Wire is delivered newest-first on purpose to prove the client sorts. + var proto = new ProtoPullResponse { MoreAvailable = true }; + proto.Events.Add(Dto(newer, BaseTime.AddMinutes(5))); + proto.Events.Add(Dto(older, BaseTime)); + + var invoker = FakeInvoker.Returning(proto); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + // Endpoint resolution + request shaping. + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + Assert.Equal(256, invoker.Request!.BatchSize); + Assert.Equal(BaseTime, invoker.Request.SinceUtc.ToDateTime()); + + // Mapping + ordering + MoreAvailable surface. 
+ Assert.True(result.MoreAvailable); + Assert.Equal(2, result.Events.Count); + Assert.Equal(older, result.Events[0].EventId); + Assert.Equal(newer, result.Events[1].EventId); + } + + [Fact] + public async Task PullAsync_returns_empty_when_site_endpoint_is_unknown() + { + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(), // no sites registered + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + Assert.Equal(0, invoker.CallCount); // never dialled — nothing to dial + } + + [Theory] + [InlineData(StatusCode.Unavailable)] // connection refused / site offline + [InlineData(StatusCode.DeadlineExceeded)] // slow site / network blip + [InlineData(StatusCode.Cancelled)] + public async Task PullAsync_swallows_tolerable_transport_faults_to_empty_response(StatusCode code) + { + var invoker = FakeInvoker.Throwing(new RpcException(new Status(code, "transport fault"))); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + // MUST NOT throw — per the IPullAuditEventsClient contract. + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_swallows_connection_layer_faults_to_empty_response() + { + // A bare HttpRequestException (e.g. DNS / refused socket before a gRPC + // status is established) is also tolerable. 
+ var invoker = FakeInvoker.Throwing(new HttpRequestException("connection refused")); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } +} From d03c2af9a102201268b182b01eff1f9cd5b50d76 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 09:49:43 -0400 Subject: [PATCH 02/14] fix(audit): race-safe channel cache + UTC-kind cursor handling in gRPC pull client (review) --- .../Central/GrpcPullAuditEventsClient.cs | 52 +++++++++++++++---- .../Central/GrpcPullAuditEventsClientTests.cs | 49 +++++++++++++++++ 2 files changed, 91 insertions(+), 10 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs index f836945b..bad3e88f 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs @@ -114,9 +114,13 @@ public sealed class GrpcPullAuditEventsClient : IPullAuditEventsClient siteId, endpoint); return Empty; } - catch (OperationCanceledException) when (ct.IsCancellationRequested) + catch (OperationCanceledException) { - // Reconciliation tick was cancelled (host shutdown / scope dispose). + // Reconciliation tick was cancelled — either the caller's token + // (host shutdown / scope dispose) or an internal gRPC deadline / + // linked-CTS cancellation. Both are tolerable for a best-effort + // pull; collapse to empty rather than letting an internal + // cancellation land noisily in the catch-all below. 
return Empty; } catch (Exception ex) @@ -165,10 +169,13 @@ public sealed class GrpcPullAuditEventsClient : IPullAuditEventsClient StatusCode.DeadlineExceeded or StatusCode.Cancelled; + // All ScadaBridge timestamps are UTC by invariant. A non-UTC cursor (the + // reconciliation cursor starts at DateTime.MinValue, Kind=Unspecified) is + // therefore treated AS UTC — never ToUniversalTime()-converted: on a host + // with a positive UTC offset MinValue.ToUniversalTime() underflows and + // Timestamp.FromDateTime throws, crashing the first pull for every site. private static DateTime EnsureUtc(DateTime value) => - value.Kind == DateTimeKind.Utc - ? value - : DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc); + value.Kind == DateTimeKind.Utc ? value : DateTime.SpecifyKind(value, DateTimeKind.Utc); /// /// Seam over the PullAuditEvents unary gRPC call against a resolved @@ -195,10 +202,15 @@ public sealed class GrpcPullAuditEventsClient : IPullAuditEventsClient /// Production : /// caches one per endpoint (keepalive from /// , mirroring SiteStreamGrpcClient) -/// and issues the unary PullAuditEventsAsync call. The cache flushes a -/// stale channel when an endpoint is re-keyed (NodeA→NodeB failover / address -/// edit), the same liveness guarantee SiteStreamGrpcClientFactory gives -/// the streaming client. +/// and issues the unary PullAuditEventsAsync call. The cache is keyed by +/// endpoint string, so a changed site address (NodeA→NodeB failover flip / an +/// edited gRPC address) is reached as soon as the resolver hands the new +/// endpoint to — it creates a fresh channel for the +/// new address. Unlike SiteStreamGrpcClientFactory (keyed by siteId, +/// which actively evicts a re-keyed client), the channel for the previous +/// address is NOT actively evicted here; it lingers idle until +/// . Idle channels hold no streams, so this is a minor +/// cache footprint cost, not a correctness or liveness gap. 
/// public sealed class GrpcPullAuditEventsInvoker : GrpcPullAuditEventsClient.IPullAuditEventsInvoker, IDisposable @@ -228,12 +240,32 @@ public sealed class GrpcPullAuditEventsInvoker public async Task InvokeAsync( string endpoint, ProtoPullRequest request, CancellationToken ct) { - var channel = _channels.GetOrAdd(endpoint, CreateChannel); + var channel = GetOrCreateChannel(endpoint); var client = new SiteStreamService.SiteStreamServiceClient(channel); using var call = client.PullAuditEventsAsync(request, cancellationToken: ct); return await call.ResponseAsync.ConfigureAwait(false); } + // Race-safe channel cache. ConcurrentDictionary.GetOrAdd(key, valueFactory) + // does NOT serialize the factory, so two concurrent first dials of the same + // endpoint can both build a GrpcChannel (each holds an HTTP/2 connection + // pool) and the loser would leak. Create-then-GetOrAdd-then-dispose-if-lost + // mirrors SiteStreamGrpcClientFactory: only the channel actually installed + // survives; a channel that lost the race is disposed immediately. 
+ private GrpcChannel GetOrCreateChannel(string endpoint) + { + if (!_channels.TryGetValue(endpoint, out var channel)) + { + var created = CreateChannel(endpoint); + channel = _channels.GetOrAdd(endpoint, created); + if (!ReferenceEquals(channel, created)) + { + created.Dispose(); + } + } + return channel; + } + private GrpcChannel CreateChannel(string endpoint) => GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions { diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs index c6a42b7d..7664b36c 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs @@ -163,4 +163,53 @@ public class GrpcPullAuditEventsClientTests Assert.Empty(result.Events); Assert.False(result.MoreAvailable); } + + [Fact] + public async Task PullAsync_swallows_unexpected_faults_to_empty_response() + { + // I3(a): the catch-all path. A non-transport fault (e.g. a mapping/ + // protocol error surfacing as InvalidOperationException) must still be + // swallowed to empty — audit reconciliation is best-effort and a throw + // would only get re-caught by the actor's per-site guard. + var invoker = FakeInvoker.Throwing(new InvalidOperationException("boom")); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_with_minvalue_unspecified_cursor_does_not_throw_and_dials() + { + // I3(b) / guards I2: the reconciliation cursor starts at DateTime.MinValue + // with Kind=Unspecified. 
EnsureUtc must treat it AS UTC (per the system-wide + // "all timestamps are UTC" invariant) and NOT call ToUniversalTime() — on a + // host with a positive UTC offset that underflows and Timestamp.FromDateTime + // throws ArgumentOutOfRangeException, crashing the FIRST pull for every site. + var minUnspecified = default(DateTime); // DateTime.MinValue, Kind=Unspecified + Assert.Equal(DateTimeKind.Unspecified, minUnspecified.Kind); + + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + // MUST NOT throw — must dial successfully. + var result = await sut.PullAsync("site-a", minUnspecified, batchSize: 256, CancellationToken.None); + + Assert.Equal(1, invoker.CallCount); + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + // The unspecified-MinValue cursor is carried through verbatim as UTC + // MinValue (no local-TZ conversion). 
+ Assert.Equal(DateTime.MinValue, invoker.Request!.SinceUtc.ToDateTime()); + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } } From 36a08a414530fa278881f1d2c835088433778d8c Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 10:00:44 -0400 Subject: [PATCH 03/14] feat(audit): start purge + reconciliation singletons; production ISiteEnumerator --- .../Central/SiteEnumerator.cs | 75 ++++++++++++ .../ServiceCollectionExtensions.cs | 43 ++++++- .../Actors/AkkaHostedService.cs | 111 ++++++++++++++++++ src/ZB.MOM.WW.ScadaBridge.Host/Program.cs | 7 ++ .../Central/SiteEnumeratorTests.cs | 91 ++++++++++++++ .../ActorPathTests.cs | 16 +++ 6 files changed, 337 insertions(+), 6 deletions(-) create mode 100644 src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs create mode 100644 tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/SiteEnumeratorTests.cs diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs new file mode 100644 index 00000000..159b4ae1 --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs @@ -0,0 +1,75 @@ +using Microsoft.Extensions.DependencyInjection; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; + +/// +/// Production backing the central +/// . Enumerates the configured sites +/// from the config DB via and +/// projects each site to a using the site's +/// SiteIdentifier as the cursor key and its GrpcNodeAAddress as +/// the dial target. +/// +/// +/// +/// Scope-per-call. is a SCOPED EF Core +/// service (registered by AddConfigurationDatabase); resolving it from +/// the root provider would fail DI scope validation. The enumerator therefore +/// takes the root and opens one +/// CreateAsyncScope per call — mirroring the +/// per-tick scope pattern in . +/// +/// +/// Blank-address skip. 
Sites with no GrpcNodeAAddress configured +/// are silently skipped: the reconciliation pull cannot dial them, but absence +/// of an address is a configuration decision, not a runtime error (per the +/// contract). +/// +/// +/// NodeA-only first cut. This implementation always uses NodeA's gRPC +/// address. NodeA/NodeB failover endpoint selection (dial NodeB when NodeA is +/// unreachable) is a follow-up — the shape already +/// carries a single endpoint, so failover will live in the puller/client, not +/// here. +/// +/// +public sealed class SiteEnumerator : ISiteEnumerator +{ + private readonly IServiceProvider _services; + + /// + /// Initializes the enumerator with the root service provider used to open a + /// fresh DI scope per enumeration call. + /// + /// Root service provider for resolving the scoped . + public SiteEnumerator(IServiceProvider services) + { + ArgumentNullException.ThrowIfNull(services); + _services = services; + } + + /// + public async Task> EnumerateAsync(CancellationToken ct = default) + { + await using var scope = _services.CreateAsyncScope(); + var repository = scope.ServiceProvider.GetRequiredService(); + + var sites = await repository.GetAllSitesAsync(ct).ConfigureAwait(false); + + var entries = new List(sites.Count); + foreach (var site in sites) + { + // First cut: NodeA's gRPC address is the dial target. NodeA/NodeB + // failover endpoint selection is a follow-up. 
+ if (string.IsNullOrWhiteSpace(site.GrpcNodeAAddress)) + { + continue; + } + + entries.Add(new SiteEntry(site.SiteIdentifier, site.GrpcNodeAAddress)); + } + + return entries; + } +} diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs index 764ee91a..c1114c1e 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs @@ -50,6 +50,12 @@ public static class ServiceCollectionExtensions /// Configuration section bound to . public const string PartitionMaintenanceSectionName = "AuditLog:PartitionMaintenance"; + /// Configuration section bound to . + public const string PurgeSectionName = "AuditLog:Purge"; + + /// Configuration section bound to . + public const string ReconciliationSectionName = "AuditLog:Reconciliation"; + /// /// Registers the Audit Log (#23) component services: options, the site /// SQLite writer chain (primary + ring fallback + failure-counter sink), @@ -390,19 +396,44 @@ public static class ServiceCollectionExtensions /// the same options. /// /// - /// is NOT registered here: its production - /// implementation (wrapping ISiteRepository) ships with the - /// reconciliation-singleton wiring in the Host. The client resolves the - /// enumerator lazily at actor-construction time, so this binding is safe to - /// issue before the enumerator binding lands. + /// The production (, + /// wrapping the scoped ISiteRepository) IS registered here, alongside + /// the + + /// bindings — so the two central singletons wired in the Host + /// ( + ) + /// can resolve their collaborators + options from the same central-only + /// helper. Keeping the enumerator + options on this central path preserves + /// the "every Add* call is safe from any composition root" invariant: + /// a site host never calls this helper, so it never registers a + /// site-dialing enumerator. 
/// /// /// The service collection to register into. + /// Application configuration used to bind the purge + reconciliation options sections. /// The same for chaining. public static IServiceCollection AddAuditLogCentralReconciliationClient( - this IServiceCollection services) + this IServiceCollection services, + IConfiguration config) { ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(config); + + // Production ISiteEnumerator: projects the config-DB Site rows into the + // reconciliation targets the SiteAuditReconciliationActor polls. Scoped + // ISiteRepository is resolved per call inside the enumerator, so the + // singleton takes the ROOT provider (mirrors the per-tick scope pattern + // in SiteAuditReconciliationActor / AuditLogPurgeActor). + services.TryAddSingleton(sp => new SiteEnumerator(sp)); + + // Bind the two central-singleton options to their config sections. + // Defaults are fine when the section is absent (24 h purge cadence / + // 5 min reconciliation tick); production exposes IntervalHours / + // ReconciliationIntervalSeconds only — the test-only *Override knobs + // are intentionally not bound. + services.AddOptions() + .Bind(config.GetSection(PurgeSectionName)); + services.AddOptions() + .Bind(config.GetSection(ReconciliationSectionName)); // The invoker owns the per-endpoint GrpcChannel cache, so it must be a // singleton — a fresh invoker per resolution would leak channels. 
diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs index ab93ce56..864f2e63 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs @@ -588,6 +588,117 @@ akka {{ _logger.LogInformation( "SiteCallAuditActor singleton created and registered with CentralCommunicationActor"); + // Audit Log (#23) M6 Bundle B/C — start the two central-only maintenance + // singletons that were fully implemented but never instantiated: the + // daily AuditLog partition-switch purge (AuditLogPurgeActor) and the + // periodic per-site audit-event reconciliation pull + // (SiteAuditReconciliationActor). Both mirror the SiteCallAudit / + // NotificationOutbox singleton pattern above: a ClusterSingletonManager + // pins the actor to the active central node, a ClusterSingletonProxy + // gives a stable address, and a PhaseClusterLeave graceful-stop task + // drains the in-flight tick before handover. Options + the production + // ISiteEnumerator + IPullAuditEventsClient come from + // AddAuditLogCentralReconciliationClient (central composition root only). + // Both actors take the root IServiceProvider and open their own per-tick + // DI scope because IAuditLogRepository / ISiteRepository are scoped EF + // Core services. + var auditPurgeLogger = _serviceProvider.GetRequiredService() + .CreateLogger(); + var auditPurgeOptions = _serviceProvider + .GetRequiredService>(); + var auditLogOptions = _serviceProvider + .GetRequiredService>(); + + var auditPurgeSingletonProps = ClusterSingletonManager.Props( + singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPurgeActor( + _serviceProvider, + auditPurgeOptions, + auditLogOptions, + auditPurgeLogger)), + terminationMessage: PoisonPill.Instance, + settings: ClusterSingletonManagerSettings.Create(_actorSystem!) 
+ .WithSingletonName("audit-log-purge")); + var auditPurgeSingletonManager = + _actorSystem!.ActorOf(auditPurgeSingletonProps, "audit-log-purge-singleton"); + + var auditPurgeShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem); + auditPurgeShutdown.AddTask( + Akka.Actor.CoordinatedShutdown.PhaseClusterLeave, + "drain-audit-log-purge-singleton", + async () => + { + try + { + await auditPurgeSingletonManager.GracefulStop(TimeSpan.FromSeconds(10)); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "AuditLogPurge singleton did not drain within the graceful-stop " + + "timeout; falling through to PoisonPill handover"); + } + return Akka.Done.Instance; + }); + + var auditPurgeProxyProps = ClusterSingletonProxy.Props( + singletonManagerPath: "/user/audit-log-purge-singleton", + settings: ClusterSingletonProxySettings.Create(_actorSystem) + .WithSingletonName("audit-log-purge")); + _actorSystem.ActorOf(auditPurgeProxyProps, "audit-log-purge-proxy"); + _logger.LogInformation("AuditLogPurgeActor singleton created"); + + // SiteAuditReconciliationActor — self-healing fallback puller. Resolves + // its production ISiteEnumerator (config-DB Site projection) and + // IPullAuditEventsClient (gRPC) from the central reconciliation-client + // helper registered in Program.cs. + var auditReconLogger = _serviceProvider.GetRequiredService() + .CreateLogger(); + var auditReconOptions = _serviceProvider + .GetRequiredService>(); + var auditReconSites = _serviceProvider + .GetRequiredService(); + var auditReconClient = _serviceProvider + .GetRequiredService(); + + var auditReconSingletonProps = ClusterSingletonManager.Props( + singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.SiteAuditReconciliationActor( + auditReconSites, + auditReconClient, + _serviceProvider, + auditReconOptions, + auditReconLogger)), + terminationMessage: PoisonPill.Instance, + settings: ClusterSingletonManagerSettings.Create(_actorSystem!) 
+ .WithSingletonName("site-audit-reconciliation")); + var auditReconSingletonManager = + _actorSystem!.ActorOf(auditReconSingletonProps, "site-audit-reconciliation-singleton"); + + var auditReconShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem); + auditReconShutdown.AddTask( + Akka.Actor.CoordinatedShutdown.PhaseClusterLeave, + "drain-site-audit-reconciliation-singleton", + async () => + { + try + { + await auditReconSingletonManager.GracefulStop(TimeSpan.FromSeconds(10)); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "SiteAuditReconciliation singleton did not drain within the graceful-stop " + + "timeout; falling through to PoisonPill handover"); + } + return Akka.Done.Instance; + }); + + var auditReconProxyProps = ClusterSingletonProxy.Props( + singletonManagerPath: "/user/site-audit-reconciliation-singleton", + settings: ClusterSingletonProxySettings.Create(_actorSystem) + .WithSingletonName("site-audit-reconciliation")); + _actorSystem.ActorOf(auditReconProxyProps, "site-audit-reconciliation-proxy"); + _logger.LogInformation("SiteAuditReconciliationActor singleton created"); + _logger.LogInformation("Central actors registered. CentralCommunicationActor created."); } diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs index 8d3f36aa..5af0f3b5 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs @@ -97,6 +97,13 @@ try // pf_AuditLog_Month forward monthly. Depends on IPartitionMaintenance // (registered below by AddConfigurationDatabase). builder.Services.AddAuditLogCentralMaintenance(builder.Configuration); + // #23 M6 Bundle B/C — central-only registration backing the two + // maintenance singletons started in AkkaHostedService: the production + // ISiteEnumerator + IPullAuditEventsClient (gRPC) used by the + // SiteAuditReconciliationActor, plus the AuditLogPurgeOptions / + // SiteAuditReconciliationOptions bindings consumed by both singletons. 
+ // Central-only by design (it dials sites), kept out of AddAuditLog. + builder.Services.AddAuditLogCentralReconciliationClient(builder.Configuration); // Site Call Audit (#22) — central node owns the SiteCallAuditActor // singleton (M3 Bundle F). The extension itself currently registers // nothing — actor Props are constructed inline in AkkaHostedService — diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/SiteEnumeratorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/SiteEnumeratorTests.cs new file mode 100644 index 00000000..d5a8951b --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/SiteEnumeratorTests.cs @@ -0,0 +1,91 @@ +using Microsoft.Extensions.DependencyInjection; +using NSubstitute; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; +using SiteEntity = ZB.MOM.WW.ScadaBridge.Commons.Entities.Sites.Site; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Tests.Central; + +/// +/// Unit tests for the production — the central +/// reconciliation-singleton collaborator that projects the config-DB +/// rows into the targets the +/// polls. +/// +/// +/// The enumerator opens a fresh DI scope per +/// call (mirroring the per-tick scope pattern in the reconciliation actor) +/// because is a SCOPED EF Core service. The tests +/// register a substituted repository as a scoped service so the enumerator's +/// CreateAsyncScope resolves it and the projection / blank-address +/// filtering can be exercised without an MSSQL container. +/// +public class SiteEnumeratorTests +{ + private static SiteEntity SiteWith(string identifier, string? grpcNodeA, string? 
grpcNodeB = null) + { + var site = new SiteEntity($"Display {identifier}", identifier) + { + GrpcNodeAAddress = grpcNodeA, + GrpcNodeBAddress = grpcNodeB, + }; + return site; + } + + private static IServiceProvider BuildProvider(ISiteRepository repository) + { + var services = new ServiceCollection(); + // Scoped to match the production lifetime (EF Core); the enumerator + // must open a scope to resolve it. + services.AddScoped(_ => repository); + return services.BuildServiceProvider(); + } + + [Fact] + public async Task EnumerateAsync_ProjectsSitesWithNodeAAddress_AndSkipsBlankOnes() + { + var repository = Substitute.For(); + repository.GetAllSitesAsync(Arg.Any()).Returns(new List + { + SiteWith("site-a", "http://site-a:8083"), + SiteWith("site-b", grpcNodeA: " "), // blank NodeA -> skipped + }); + + var enumerator = new SiteEnumerator(BuildProvider(repository)); + + var result = await enumerator.EnumerateAsync(); + + var entry = Assert.Single(result); + Assert.Equal("site-a", entry.SiteId); + Assert.Equal("http://site-a:8083", entry.GrpcEndpoint); + } + + [Fact] + public async Task EnumerateAsync_SkipsNullNodeAAddress() + { + var repository = Substitute.For(); + repository.GetAllSitesAsync(Arg.Any()).Returns(new List + { + SiteWith("site-null", grpcNodeA: null), + }); + + var enumerator = new SiteEnumerator(BuildProvider(repository)); + + var result = await enumerator.EnumerateAsync(); + + Assert.Empty(result); + } + + [Fact] + public async Task EnumerateAsync_ReturnsEmpty_WhenNoSites() + { + var repository = Substitute.For(); + repository.GetAllSitesAsync(Arg.Any()).Returns(new List()); + + var enumerator = new SiteEnumerator(BuildProvider(repository)); + + var result = await enumerator.EnumerateAsync(); + + Assert.Empty(result); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs index 83d32963..8d4bceb1 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs +++ 
b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs @@ -117,6 +117,22 @@ public class CentralActorPathTests : IAsyncLifetime public async Task CentralActors_NotificationOutboxProxy_Exists() => await AssertActorExists("/user/notification-outbox-proxy"); + [Fact] + public async Task CentralActors_AuditLogPurgeSingleton_Exists() + => await AssertActorExists("/user/audit-log-purge-singleton"); + + [Fact] + public async Task CentralActors_AuditLogPurgeProxy_Exists() + => await AssertActorExists("/user/audit-log-purge-proxy"); + + [Fact] + public async Task CentralActors_SiteAuditReconciliationSingleton_Exists() + => await AssertActorExists("/user/site-audit-reconciliation-singleton"); + + [Fact] + public async Task CentralActors_SiteAuditReconciliationProxy_Exists() + => await AssertActorExists("/user/site-audit-reconciliation-proxy"); + private async Task AssertActorExists(string path) { Assert.NotNull(_actorSystem); From c092e89fd1e5f754674a164da89d1ee7d57b1b6d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 10:11:49 -0400 Subject: [PATCH 04/14] fix(audit): robust central options binding + interval clamps + doc/contract fixes (review) --- .../Central/AuditLogPurgeOptions.cs | 47 +++++++++++++--- .../Central/ISiteEnumerator.cs | 11 ++-- .../Central/SiteAuditReconciliationActor.cs | 4 ++ .../Central/SiteAuditReconciliationOptions.cs | 41 +++++++++++--- .../Central/SiteEnumerator.cs | 4 +- .../ServiceCollectionExtensions.cs | 55 ++++++++++++------- .../AkkaHostedServiceAuditWiringTests.cs | 31 +++++++++++ 7 files changed, 153 insertions(+), 40 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs index 43d5bc74..0ba1bc00 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs @@ -17,8 +17,10 @@ namespace 
ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// /// /// exists for tests to drop the cadence to -/// milliseconds without polluting the production config surface; production -/// binds only. +/// milliseconds; production config is expected to set +/// only. Because this options class is Bind-ed wholesale, a config value +/// at AuditLog:Purge:IntervalOverride would bind if present (and would +/// bypass the minimum clamp) — operators must not set it. /// /// public sealed class AuditLogPurgeOptions @@ -29,15 +31,44 @@ public sealed class AuditLogPurgeOptions /// /// Test-only override for finer control over the tick cadence than /// whole-hour resolution allows. When non-null, takes precedence over - /// . Not bound from config — production - /// config exposes only. + /// AND bypasses the + /// minimum clamp (so tests can use millisecond cadences). Production + /// config exposes only and never sets this + /// knob — but because the options class is Bind-ed wholesale, a + /// config value at AuditLog:Purge:IntervalOverride WOULD bind if + /// present; operators must not set it. /// public TimeSpan? IntervalOverride { get; set; } /// - /// Resolves the effective tick interval, honouring the test override - /// when set. Falls back to . + /// Minimum interval the config-bound can + /// resolve to. Clamps a misconfigured IntervalHours: 0 (or a + /// negative value) away from — a zero + /// interval would make Akka's ScheduleTellRepeatedlyCancelable + /// spin, looping the partition drop/rebuild dance into a sustained SQL + /// outage. The test-only bypasses this + /// clamp so unit tests can still drop the cadence to milliseconds. /// - public TimeSpan Interval => - IntervalOverride ?? TimeSpan.FromHours(IntervalHours); + private static readonly TimeSpan MinConfiguredInterval = TimeSpan.FromMinutes(1); + + /// + /// Resolves the effective tick interval, honouring the test override + /// when set. 
Falls back to , clamped to at + /// least so a zero/negative config + /// value can never yield (which would spin + /// the scheduler). + /// + public TimeSpan Interval + { + get + { + if (IntervalOverride is { } overrideValue) + { + return overrideValue; + } + + var resolved = TimeSpan.FromHours(IntervalHours); + return resolved < MinConfiguredInterval ? MinConfiguredInterval : resolved; + } + } } diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs index cc8cae1f..25a5a4c7 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs @@ -9,11 +9,12 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// /// The production implementation wraps ISiteRepository.GetAllSitesAsync /// and projects each Site to a using the -/// site's configured GrpcNodeAAddress (falling back to -/// GrpcNodeBAddress when NodeA is unset). Sites with NO gRPC address -/// configured are silently skipped — the reconciliation pull cannot reach -/// them, but absence of an address is a configuration decision, not a runtime -/// error. +/// site's configured GrpcNodeAAddress. This is a NodeA-only first cut: +/// sites with a blank GrpcNodeAAddress are silently SKIPPED — the +/// reconciliation pull cannot reach them, but absence of an address is a +/// configuration decision, not a runtime error. NodeB-fallback endpoint +/// selection (dial NodeB when NodeA is unset/unreachable) is a follow-up +/// (mirrors the comment in SiteEnumerator.cs). 
/// public interface ISiteEnumerator { diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs index fb08bc57..8c6a297b 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs @@ -182,6 +182,10 @@ public class SiteAuditReconciliationActor : ReceiveActor IReadOnlyList sites; try { + // No ambient CancellationToken in a ReceiveActor message handler — + // CancellationToken.None (the EnumerateAsync default) is intentional. + // The work is bounded by the 5-min reconciliation tick plus the + // 10s graceful-stop drain on PhaseClusterLeave. sites = await _sites.EnumerateAsync().ConfigureAwait(false); } catch (Exception ex) diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs index 31796ad9..b58b3fb4 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs @@ -31,18 +31,45 @@ public sealed class SiteAuditReconciliationOptions /// /// Test-only override for finer control over the tick cadence than /// whole-second resolution allows. When non-null, takes precedence over - /// . Not bound from config — - /// production config exposes - /// only. + /// AND bypasses the + /// minimum clamp (so tests can use + /// millisecond cadences). Production config exposes + /// only and never sets this + /// knob — but because the options class is Bind-ed wholesale, a + /// config value at AuditLog:Reconciliation:ReconciliationIntervalOverride + /// WOULD bind if present; operators must not set it. /// public TimeSpan? 
ReconciliationIntervalOverride { get; set; } /// - /// Resolves the effective tick interval, honouring the test override when - /// set. Falls back to . + /// Minimum interval the config-bound + /// can resolve to. Clamps a misconfigured ReconciliationIntervalSeconds: 0 + /// (or a negative value) away from , which would make + /// Akka's ScheduleTellRepeatedlyCancelable spin. The test-only + /// bypasses this clamp so unit tests + /// can still drop the cadence to milliseconds. /// - public TimeSpan ReconciliationInterval => - ReconciliationIntervalOverride ?? TimeSpan.FromSeconds(ReconciliationIntervalSeconds); + private static readonly TimeSpan MinConfiguredInterval = TimeSpan.FromSeconds(1); + + /// + /// Resolves the effective tick interval, honouring the test override when + /// set. Falls back to , clamped to at + /// least so a zero/negative config value can + /// never yield (which would spin the scheduler). + /// + public TimeSpan ReconciliationInterval + { + get + { + if (ReconciliationIntervalOverride is { } overrideValue) + { + return overrideValue; + } + + var resolved = TimeSpan.FromSeconds(ReconciliationIntervalSeconds); + return resolved < MinConfiguredInterval ? MinConfiguredInterval : resolved; + } + } /// /// Maximum number of diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs index 159b4ae1..357ee4ee 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs @@ -67,7 +67,9 @@ public sealed class SiteEnumerator : ISiteEnumerator continue; } - entries.Add(new SiteEntry(site.SiteIdentifier, site.GrpcNodeAAddress)); + // The IsNullOrWhiteSpace guard above proves GrpcNodeAAddress is + // non-null here; explicit null-forgiving for clarity. 
+ entries.Add(new SiteEntry(site.SiteIdentifier, site.GrpcNodeAAddress!)); } return entries; diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs index c1114c1e..632e2317 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs @@ -333,6 +333,24 @@ public static class ServiceCollectionExtensions .Bind(config.GetSection(PartitionMaintenanceSectionName)); services.AddHostedService(); + // I1 (review): bind the two central-singleton options HERE rather than in + // AddAuditLogCentralReconciliationClient. AkkaHostedService.RegisterCentralActors + // resolves IOptions / + // via GetRequiredService when it wires the AuditLogPurgeActor + + // SiteAuditReconciliationActor singletons; AddAuditLogCentralMaintenance is + // ALWAYS called on the central path (the reconciliation-client helper is the + // one that could in principle be dropped), so binding the options here means + // the singletons get a valid IOptions even if the gRPC-client helper is not + // wired — instead of a cryptic InvalidOperationException at GetRequiredService. + // Defaults are fine when the section is absent (24 h purge cadence / + // 5 min reconciliation tick); production exposes IntervalHours / + // ReconciliationIntervalSeconds only — the test-only *Override knobs are + // not intended to be set from config (see the options classes' remarks). 
+ services.AddOptions() + .Bind(config.GetSection(PurgeSectionName)); + services.AddOptions() + .Bind(config.GetSection(ReconciliationSectionName)); + // M6 Bundle E (T8 + T9): central health snapshot — a single object // that owns the CentralAuditWriteFailures + AuditRedactionFailure // Interlocked counters AND surfaces them on @@ -397,19 +415,21 @@ public static class ServiceCollectionExtensions /// /// /// The production (, - /// wrapping the scoped ISiteRepository) IS registered here, alongside - /// the + - /// bindings — so the two central singletons wired in the Host - /// ( + ) - /// can resolve their collaborators + options from the same central-only - /// helper. Keeping the enumerator + options on this central path preserves - /// the "every Add* call is safe from any composition root" invariant: - /// a site host never calls this helper, so it never registers a - /// site-dialing enumerator. + /// wrapping the scoped ISiteRepository) IS registered here — so the + /// singleton wired in the Host can + /// resolve its enumerator + gRPC client from this central-only helper. Keeping + /// the enumerator on this central path preserves the "every Add* call is + /// safe from any composition root" invariant: a site host never calls this + /// helper, so it never registers a site-dialing enumerator. The + /// + + /// bindings live in instead (I1 + /// review fix) — that helper is unconditionally called on the central path, so + /// the two maintenance singletons get a valid IOptions even if this + /// gRPC-client helper is ever dropped. /// /// /// The service collection to register into. - /// Application configuration used to bind the purge + reconciliation options sections. + /// Application configuration used to bind the gRPC client's communication options (purge + reconciliation options are bound by ). /// The same for chaining. 
public static IServiceCollection AddAuditLogCentralReconciliationClient( this IServiceCollection services, @@ -425,15 +445,12 @@ public static class ServiceCollectionExtensions // in SiteAuditReconciliationActor / AuditLogPurgeActor). services.TryAddSingleton(sp => new SiteEnumerator(sp)); - // Bind the two central-singleton options to their config sections. - // Defaults are fine when the section is absent (24 h purge cadence / - // 5 min reconciliation tick); production exposes IntervalHours / - // ReconciliationIntervalSeconds only — the test-only *Override knobs - // are intentionally not bound. - services.AddOptions() - .Bind(config.GetSection(PurgeSectionName)); - services.AddOptions() - .Bind(config.GetSection(ReconciliationSectionName)); + // I1 (review): the AuditLogPurgeOptions / SiteAuditReconciliationOptions + // bindings moved to AddAuditLogCentralMaintenance — that helper is always + // called on the central path, so the two maintenance singletons resolve a + // valid IOptions even if this gRPC-client helper is ever dropped. Keep the + // ISiteEnumerator + gRPC client registrations here (they dial sites and are + // central-only by design). // The invoker owns the per-endpoint GrpcChannel cache, so it must be a // singleton — a fresh invoker per resolution would leak channels. 
diff --git a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs index f0f101f7..7855d876 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs @@ -7,6 +7,7 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Options; using ZB.MOM.WW.ScadaBridge.AuditLog; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; using ZB.MOM.WW.ScadaBridge.AuditLog.Site; using ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry; using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure; @@ -238,6 +239,36 @@ public class CentralAuditWiringTests : IDisposable Assert.NotNull(forwarder); Assert.IsType(forwarder); } + + /// + /// I4 (review): the central composition root must register the production + /// reconciliation collaborators via + /// AddAuditLogCentralReconciliationClient. Asserting the concrete + /// implementations resolve here is a faster, clearer signal than a runtime + /// "actor not found" / cryptic GetRequiredService throw in + /// AkkaHostedService.RegisterCentralActors if that helper is ever + /// dropped from Program.cs. + /// + [Fact] + public void Central_Resolves_ISiteEnumerator_AsSiteEnumerator() + { + var enumerator = _factory.Services.GetService(); + Assert.NotNull(enumerator); + Assert.IsType(enumerator); + } + + /// + /// I4 (review): companion to + /// — the production gRPC pull client must resolve on the central composition + /// root so the SiteAuditReconciliationActor singleton can dial sites. 
+ /// + [Fact] + public void Central_Resolves_IPullAuditEventsClient_AsGrpcClient() + { + var client = _factory.Services.GetService(); + Assert.NotNull(client); + Assert.IsType(client); + } } /// From 963e3427da903d597d925b131cbcdb02014745a6 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 10:39:06 -0400 Subject: [PATCH 05/14] feat(sitecallaudit): PullSiteCalls reconciliation plumbing (store read + RPC + site handler + central client) Site Call Audit (#22): build the documented periodic reconciliation PULL self-heal path for the eventually-consistent central SiteCalls mirror, as a dedicated PullSiteCalls gRPC RPC kept separate from the audit pull. This is the pull PLUMBING only; the central reconciliation tick is a separate follow-up. - IOperationTrackingStore.ReadChangedSinceAsync(sinceUtc, batchSize): inclusive UpdatedAtUtc cursor, oldest-first, batch-capped; SQLite impl projects tracking rows onto SiteCallOperational (Kind->Channel, TargetSummary->Target, SourceSite left empty - the store has no site-id column). - sitestream.proto: rpc PullSiteCalls + PullSiteCallsRequest/Response, mirroring PullAuditEvents; regenerated checked-in SiteStreamGrpc/*.cs. - SiteCallDtoMapper.ToDto(SiteCallOperational): inverse of FromDto for the handler. - SiteStreamGrpcServer.PullSiteCalls handler + SetOperationTrackingStore seam; Host wires the seam alongside SetSiteAuditQueue (site roles only). - Central IPullSiteCallsClient + GrpcPullSiteCallsClient (home: AuditLog/Central to reuse ISiteEnumerator; SiteCallAudit does not reference AuditLog). Re-stamps SourceSite from the dialed siteId; no-throw on tolerable transport faults; SpecifyKind (not ToUniversalTime) cursor handling. Central-only DI registration. Tests: ReadChangedSinceAsync (4), PullSiteCalls handler (6), GrpcPullSiteCallsClient (8). Full solution build 0 warnings/0 errors (TreatWarningsAsErrors). 
---
 .../Central/GrpcPullSiteCallsClient.cs        | 293 ++++++++++
 .../Central/IPullSiteCallsClient.cs           |  57 ++
 .../ServiceCollectionExtensions.cs            |  25 +
 .../Services/IOperationTrackingStore.cs       |  36 ++
 .../Integration/PullSiteCallsResponse.cs      |  17 +
 .../Grpc/SiteCallDtoMapper.cs                 |  64 ++-
 .../Grpc/SiteStreamGrpcServer.cs              |  88 +++
 .../Protos/sitestream.proto                   |  18 +
 .../SiteStreamGrpc/Sitestream.cs              | 522 +++++++++++++++++-
 .../SiteStreamGrpc/SitestreamGrpc.cs          |  42 +-
 .../Actors/AkkaHostedService.cs               |  12 +
 .../Tracking/OperationTrackingStore.cs        |  70 +++
 .../Central/GrpcPullSiteCallsClientTests.cs   | 215 ++++++++
 .../SiteStreamPullSiteCallsTests.cs           | 189 +++++++
 .../Tracking/OperationTrackingStoreTests.cs   | 132 +++++
 15 files changed, 1757 insertions(+), 23 deletions(-)
 create mode 100644 src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs
 create mode 100644 src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/IPullSiteCallsClient.cs
 create mode 100644 src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Integration/PullSiteCallsResponse.cs
 create mode 100644 tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs
 create mode 100644 tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs

diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs
new file mode 100644
index 00000000..b483b1a5
--- /dev/null
+++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs
@@ -0,0 +1,293 @@
+using System.Collections.Concurrent;
+using Google.Protobuf.WellKnownTypes;
+using Grpc.Core;
+using Grpc.Net.Client;
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Communication;
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest;
+using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse;
+using PullSiteCallsResponse = ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration.PullSiteCallsResponse;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Production <see cref="IPullSiteCallsClient"/> (Site Call Audit #22) that the
+/// central reconciliation tick (a separate follow-up component) uses to pull the
+/// next batch of cached-call operational rows from a site over the
+/// <c>PullSiteCalls</c> unary gRPC RPC served by <c>SiteStreamGrpcServer</c>.
+/// A near-exact sibling of <c>GrpcPullAuditEventsClient</c>.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Endpoint resolution.</b> The caller passes only a <c>siteId</c>; this
+/// client resolves it to a gRPC authority via
+/// <see cref="ISiteEnumerator"/> (<c>GrpcEndpoint</c>) on every call so a NodeA→NodeB
+/// failover flip or an edited site address takes effect on the next tick. A site
+/// with no registered endpoint yields an empty response (no dial).
+/// </para>
+/// <para>
+/// <b>SourceSite re-stamp.</b> The site leaves
+/// <c>SiteCallOperationalDto.SourceSite</c> empty (the tracking store has no
+/// site-id column). This client is the authority that knows which site it
+/// dialed, so it re-stamps the mapped <see cref="SiteCall.SourceSite"/> from
+/// <c>siteId</c> — the same "re-stamp from the forwarder's own id" pattern the
+/// site push path uses.
+/// </para>
+/// <para>
+/// <b>Fault tolerance.</b> Per the <see cref="IPullSiteCallsClient"/> contract,
+/// tolerable transport faults (<see cref="StatusCode.Unavailable"/>,
+/// <see cref="StatusCode.DeadlineExceeded"/>, <see cref="StatusCode.Cancelled"/>,
+/// bare <see cref="HttpRequestException"/> / <c>SocketException</c>) are caught
+/// and collapsed to an empty response so one offline site never sinks the rest
+/// of the reconciliation tick. Any other fault (e.g. a malformed reply that
+/// fails DTO mapping) is also swallowed to empty: reconciliation is best-effort.
+/// </para>
+/// <para>
+/// <b>Testability.</b> The unary call is reached through the
+/// <see cref="IPullSiteCallsInvoker"/> seam. Production binds
+/// <see cref="GrpcPullSiteCallsInvoker"/> (one cached
+/// <see cref="GrpcChannel"/> per endpoint, keepalive from <see cref="CommunicationOptions"/>); unit tests
+/// inject a fake invoker so no real HTTP/2 endpoint is required.
+/// </para>
+/// </remarks>
+public sealed class GrpcPullSiteCallsClient : IPullSiteCallsClient
+{
+    private readonly ISiteEnumerator _sites;
+    private readonly IPullSiteCallsInvoker _invoker;
+    private readonly ILogger<GrpcPullSiteCallsClient> _logger;
+
+    /// <summary>
+    /// Creates the client over the given site enumerator and unary-call invoker.
+    /// </summary>
+    /// <param name="sites">Resolves a siteId to its gRPC endpoint.</param>
+    /// <param name="invoker">Seam that issues the PullSiteCalls unary RPC against a resolved endpoint.</param>
+    /// <param name="logger">Logger for transport-fault diagnostics.</param>
+    public GrpcPullSiteCallsClient(
+        ISiteEnumerator sites,
+        IPullSiteCallsInvoker invoker,
+        ILogger<GrpcPullSiteCallsClient> logger)
+    {
+        _sites = sites ?? throw new ArgumentNullException(nameof(sites));
+        _invoker = invoker ?? throw new ArgumentNullException(nameof(invoker));
+        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+    }
+
+    /// <inheritdoc />
+    public async Task<PullSiteCallsResponse> PullAsync(
+        string siteId,
+        DateTime sinceUtc,
+        int batchSize,
+        CancellationToken ct)
+    {
+        var endpoint = await ResolveEndpointAsync(siteId, ct).ConfigureAwait(false);
+        if (endpoint is null)
+        {
+            // No gRPC address registered for the site — a configuration decision
+            // (mirrors ISiteEnumerator's own contract), not a runtime error, so
+            // there is simply nothing to pull.
+            _logger.LogDebug(
+                "PullSiteCalls skipped: no gRPC endpoint registered for site {SiteId}.", siteId);
+            return Empty;
+        }
+
+        var request = new ProtoPullRequest
+        {
+            // ReadChangedSinceAsync treats DateTime.MinValue as "from the start";
+            // EnsureUtc keeps Timestamp.FromDateTime happy (it requires UTC kind).
+            SinceUtc = Timestamp.FromDateTime(EnsureUtc(sinceUtc)),
+            BatchSize = batchSize,
+        };
+
+        try
+        {
+            var reply = await _invoker.InvokeAsync(endpoint, request, ct).ConfigureAwait(false);
+
+            // Map proto DTOs to central SiteCall entities, re-stamp SourceSite from
+            // the dialed siteId (the site leaves it empty), and order oldest-first by
+            // UpdatedAtUtc (the wire is already ordered by the site read, but the
+            // contract is explicit, so sort defensively). This runs INSIDE the
+            // guarded region on purpose: a malformed reply that fails DTO mapping
+            // must collapse to an empty batch like every other fault, not escape
+            // into the reconciliation tick.
+            var siteCalls = reply.Operationals
+                .Select(SiteCallDtoMapper.FromDto)
+                .Select(sc => sc with { SourceSite = siteId })
+                .OrderBy(sc => sc.UpdatedAtUtc)
+                .ToList();
+
+            return new PullSiteCallsResponse(siteCalls, reply.MoreAvailable);
+        }
+        catch (RpcException ex) when (IsTolerable(ex.StatusCode))
+        {
+            _logger.LogDebug(ex,
+                "PullSiteCalls tolerable transport fault for site {SiteId} ({Endpoint}): {Status}. Returning empty batch.",
+                siteId, endpoint, ex.StatusCode);
+            return Empty;
+        }
+        catch (Exception ex) when (ex is HttpRequestException or System.Net.Sockets.SocketException)
+        {
+            _logger.LogDebug(ex,
+                "PullSiteCalls connection-layer fault for site {SiteId} ({Endpoint}). Returning empty batch.",
+                siteId, endpoint);
+            return Empty;
+        }
+        catch (OperationCanceledException)
+        {
+            // Reconciliation tick cancelled — caller token (host shutdown) or an
+            // internal gRPC deadline / linked-CTS cancellation. Both tolerable for
+            // a best-effort pull; collapse to empty rather than landing noisily in
+            // the catch-all below.
+            return Empty;
+        }
+        catch (Exception ex)
+        {
+            // Any other fault, including a reply that fails DTO mapping above.
+            // Reconciliation is best-effort; swallow to empty rather than throw —
+            // the (future) actor's per-site guard would only re-catch it.
+            _logger.LogWarning(ex,
+                "PullSiteCalls unexpected fault for site {SiteId} ({Endpoint}). Returning empty batch.",
+                siteId, endpoint);
+            return Empty;
+        }
+    }
+
+    private async Task<string?> ResolveEndpointAsync(string siteId, CancellationToken ct)
+    {
+        var sites = await _sites.EnumerateAsync(ct).ConfigureAwait(false);
+        foreach (var site in sites)
+        {
+            if (string.Equals(site.SiteId, siteId, StringComparison.Ordinal) &&
+                !string.IsNullOrWhiteSpace(site.GrpcEndpoint))
+            {
+                return site.GrpcEndpoint;
+            }
+        }
+        return null;
+    }
+
+    private static readonly PullSiteCallsResponse Empty =
+        new(Array.Empty<SiteCall>(), MoreAvailable: false);
+
+    private static bool IsTolerable(StatusCode code) => code is
+        StatusCode.Unavailable or
+        StatusCode.DeadlineExceeded or
+        StatusCode.Cancelled;
+
+    // All ScadaBridge timestamps are UTC by invariant. A non-UTC cursor (the
+    // reconciliation cursor starts at DateTime.MinValue, Kind=Unspecified) is
+    // treated AS UTC — never ToUniversalTime()-converted: on a host with a
+    // positive UTC offset MinValue.ToUniversalTime() underflows and
+    // Timestamp.FromDateTime throws, crashing the first pull for every site.
+    private static DateTime EnsureUtc(DateTime value) =>
+        value.Kind == DateTimeKind.Utc ? value : DateTime.SpecifyKind(value, DateTimeKind.Utc);
+
+    /// <summary>
+    /// Seam over the <c>PullSiteCalls</c> unary gRPC call against a resolved site
+    /// endpoint. Extracted so <see cref="GrpcPullSiteCallsClient"/> can be
+    /// unit-tested without a real <see cref="GrpcChannel"/>. Production binds
+    /// <see cref="GrpcPullSiteCallsInvoker"/>.
+    /// </summary>
+    public interface IPullSiteCallsInvoker
+    {
+        /// <summary>
+        /// Issues the <c>PullSiteCalls</c> unary RPC against <paramref name="endpoint"/>.
+        /// May throw <see cref="RpcException"/> / <see cref="HttpRequestException"/>
+        /// on transport faults — the caller classifies and swallows tolerable ones.
+        /// </summary>
+        /// <param name="endpoint">The site gRPC authority (e.g. <c>http://site-a:8083</c>).</param>
+        /// <param name="request">The wire-format pull request.</param>
+        /// <param name="ct">Cancellation token.</param>
+        /// <returns>The wire-format pull response.</returns>
+        Task<ProtoPullResponse> InvokeAsync(string endpoint, ProtoPullRequest request, CancellationToken ct);
+    }
+}
+
+/// <summary>
+/// Production <see cref="GrpcPullSiteCallsClient.IPullSiteCallsInvoker"/>: caches
+/// one <see cref="GrpcChannel"/> per endpoint (keepalive from
+/// <see cref="CommunicationOptions"/>, mirroring <c>SiteStreamGrpcClient</c>) and
+/// issues the unary <c>PullSiteCallsAsync</c> call. The cache is keyed by
+/// endpoint string, so a changed site address (NodeA→NodeB failover flip / an
+/// edited gRPC address) is reached as soon as the resolver hands the new endpoint
+/// to <see cref="InvokeAsync"/>. The channel for a previous address lingers idle
+/// until <see cref="Dispose"/> (idle channels hold no streams — a minor cache
+/// footprint cost, not a correctness or liveness gap). Sibling of
+/// <c>GrpcPullAuditEventsInvoker</c>.
+/// </summary>
+public sealed class GrpcPullSiteCallsInvoker
+    : GrpcPullSiteCallsClient.IPullSiteCallsInvoker, IDisposable
+{
+    private readonly ConcurrentDictionary<string, GrpcChannel> _channels = new(StringComparer.Ordinal);
+    private readonly CommunicationOptions _options;
+
+    /// <summary>Creates the invoker using default <see cref="CommunicationOptions"/>.</summary>
+    public GrpcPullSiteCallsInvoker()
+        : this(new CommunicationOptions())
+    {
+    }
+
+    /// <summary>
+    /// Creates the invoker, applying the configured gRPC keepalive settings to
+    /// every channel it opens.
+    /// </summary>
+    /// <param name="options">Communication options supplying gRPC keepalive timings.</param>
+    public GrpcPullSiteCallsInvoker(CommunicationOptions options)
+    {
+        _options = options ?? throw new ArgumentNullException(nameof(options));
+    }
+
+    /// <inheritdoc />
+    public async Task<ProtoPullResponse> InvokeAsync(
+        string endpoint, ProtoPullRequest request, CancellationToken ct)
+    {
+        var channel = GetOrCreateChannel(endpoint);
+        var client = new SiteStreamService.SiteStreamServiceClient(channel);
+        using var call = client.PullSiteCallsAsync(request, cancellationToken: ct);
+        return await call.ResponseAsync.ConfigureAwait(false);
+    }
+
+    // Race-safe channel cache (create-then-GetOrAdd-then-dispose-if-lost): two
+    // concurrent first dials of the same endpoint can both build a GrpcChannel;
+    // only the channel actually installed survives, the loser is disposed.
+    // Mirrors SiteStreamGrpcClientFactory / GrpcPullAuditEventsInvoker.
+    private GrpcChannel GetOrCreateChannel(string endpoint)
+    {
+        if (!_channels.TryGetValue(endpoint, out var channel))
+        {
+            var created = CreateChannel(endpoint);
+            channel = _channels.GetOrAdd(endpoint, created);
+            if (!ReferenceEquals(channel, created))
+            {
+                created.Dispose();
+            }
+        }
+        return channel;
+    }
+
+    private GrpcChannel CreateChannel(string endpoint) =>
+        GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
+        {
+            HttpHandler = new SocketsHttpHandler
+            {
+                KeepAlivePingDelay = _options.GrpcKeepAlivePingDelay,
+                KeepAlivePingTimeout = _options.GrpcKeepAlivePingTimeout,
+                KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always,
+            },
+            // The channel owns the handler built above. Without this flag
+            // GrpcChannel.Dispose() leaves a caller-supplied handler (and its
+            // pooled HTTP/2 connections) alive — for race-losers and on Dispose().
+            DisposeHttpClient = true,
+        });
+
+    /// <summary>Disposes all cached channels.</summary>
+    public void Dispose()
+    {
+        foreach (var channel in _channels.Values)
+        {
+            channel.Dispose();
+        }
+        _channels.Clear();
+    }
+}
diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/IPullSiteCallsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/IPullSiteCallsClient.cs
new file mode 100644
index 00000000..c22d5706
--- /dev/null
+++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/IPullSiteCallsClient.cs
@@ -0,0 +1,57 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+///
+/// Mockable abstraction over the central-side PullSiteCalls gRPC client
+/// surface used by the Site Call Audit (#22) reconciliation tick to fetch the
+/// next batch of cached-call operational rows from a specific site — the
+/// documented periodic self-heal pull that backfills the eventually-consistent
+/// central SiteCalls mirror when best-effort push telemetry is lost.
+/// Extracted so the (separate, follow-up) reconciliation actor can be
+/// unit-tested against an in-memory stub without standing up a real
+/// GrpcChannel per site.
+///
+/// <remarks>
+/// <para>
+/// The home is <c>ZB.MOM.WW.ScadaBridge.AuditLog.Central</c> rather than the
+/// <c>ZB.MOM.WW.ScadaBridge.SiteCallAudit</c> project so it can reuse the
+/// <see cref="ISiteEnumerator"/> / site-entry endpoint-resolution
+/// abstraction that already lives here (and that the sibling
+/// <c>IPullAuditEventsClient</c> uses) — SiteCallAudit does not reference
+/// AuditLog, so hosting the client there would mean duplicating the enumerator.
+/// This mirrors the decision to keep <c>SiteCallDtoMapper</c> in
+/// <c>ZB.MOM.WW.ScadaBridge.Communication</c>.
+/// </para>
+/// <para>
+/// Implementations MUST NOT throw on transport faults the reconciliation tick
+/// can tolerate (connection refused, deadline exceeded, cancellation) — one
+/// offline site must never sink the rest of the tick. The
+/// <see cref="PullSiteCallsResponse.SiteCalls"/> are returned oldest-first by
+/// <c>UpdatedAtUtc</c> with the <c>SourceSite</c> re-stamped from the dialed
+/// site id (the site leaves it empty, being unaware of its own id), and a
+/// <c>MoreAvailable</c> flag the caller uses to decide whether to fire another
+/// pull immediately.
+/// </para>
+/// </remarks>
+public interface IPullSiteCallsClient
+{
+    /// <summary>
+    /// Issues a <c>PullSiteCalls</c> RPC against the site whose gRPC endpoint is
+    /// registered against <paramref name="siteId"/>. Returns the next batch of
+    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.SiteCall"/> rows
+    /// ordered oldest-first (with <c>SourceSite</c> re-stamped from
+    /// <paramref name="siteId"/>) AND a <c>MoreAvailable</c> flag the caller uses
+    /// to decide whether to fire another pull immediately.
+    /// </summary>
+    /// <param name="siteId">The identifier of the site to pull cached-call operational rows from.</param>
+    /// <param name="sinceUtc">Only rows with an <c>UpdatedAtUtc</c> at or after this cursor time are returned.</param>
+    /// <param name="batchSize">Maximum number of rows to return per call.</param>
+    /// <param name="ct">Cancellation token.</param>
+    /// <returns>A task that resolves to the next reconciliation batch with a <c>MoreAvailable</c> flag.</returns>
+    Task<PullSiteCallsResponse> PullAsync(
+        string siteId,
+        DateTime sinceUtc,
+        int batchSize,
+        CancellationToken ct);
+}
diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs
index 632e2317..631200a1 100644
--- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs
@@ -473,6 +473,31 @@ public static class ServiceCollectionExtensions
                 sp.GetRequiredService(),
                 sp.GetRequiredService>()));
 
+        // Site Call Audit (#22) reconciliation pull client — central-only, the
+        // sibling of the audit pull client above. Lives here (not in the
+        // SiteCallAudit project) so it can reuse the central-only
+        // ISiteEnumerator registered just above; SiteCallAudit does not
+        // reference AuditLog. The invoker owns the per-endpoint GrpcChannel
+        // cache, so it must be a singleton (a fresh invoker per resolution
+        // would leak channels). CommunicationOptions flow through when bound by
+        // the central Host, else defaults — mirrors the audit invoker.
+        services.TryAddSingleton<GrpcPullSiteCallsInvoker>(sp =>
+        {
+            var options = sp
+                .GetService<Microsoft.Extensions.Options.IOptions<CommunicationOptions>>();
+            return options is null
+                ? new GrpcPullSiteCallsInvoker()
+                : new GrpcPullSiteCallsInvoker(options.Value);
+        });
+        services.TryAddSingleton<GrpcPullSiteCallsClient.IPullSiteCallsInvoker>(
+            sp => sp.GetRequiredService<GrpcPullSiteCallsInvoker>());
+
+        services.TryAddSingleton<IPullSiteCallsClient>(sp => new GrpcPullSiteCallsClient(
+            sp.GetRequiredService<ISiteEnumerator>(),
+            sp.GetRequiredService<GrpcPullSiteCallsClient.IPullSiteCallsInvoker>(),
+            sp.GetRequiredService<ILogger<GrpcPullSiteCallsClient>>()));
+
         return services;
     }
 }
diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs
index b85bff5c..1d69023a 100644
--- a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs
@@ -118,4 +118,40 @@ public interface IOperationTrackingStore
     Task PurgeTerminalAsync(
         DateTime olderThanUtc,
         CancellationToken ct = default);
+
+    /// <summary>
+    /// Reconciliation read (Site Call Audit #22): return tracking rows whose
+    /// <c>UpdatedAtUtc</c> is at or after <paramref name="sinceUtc"/> as
+    /// <see cref="SiteCallOperational"/> projections, ordered by
+    /// <c>UpdatedAtUtc</c> ascending and capped at <paramref name="batchSize"/>.
+    /// This is the site-side feed for central's <c>PullSiteCalls</c> RPC — the
+    /// documented periodic self-heal pull that backfills the eventually-consistent
+    /// central <c>SiteCalls</c> mirror when best-effort push telemetry is lost.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// The lower bound is inclusive so a caller can resume from the last
+    /// returned <c>UpdatedAtUtc</c> without skipping a row that shares that
+    /// instant; central ingest is insert-if-not-exists then upsert-on-newer, so
+    /// re-reading the boundary row is a harmless no-op. The oldest-first cap lets
+    /// the caller advance the cursor monotonically across follow-up pulls.
+    /// </para>
+    /// <para>
+    /// <see cref="SiteCallOperational.SourceSite"/> is left as the empty string:
+    /// the site id is not a tracking-store column, and the central client re-stamps
+    /// it from the <c>siteId</c> it dialed (the only authority that knows which
+    /// site the rows came from). <see cref="SiteCallOperational.Channel"/> is
+    /// projected from the row's <c>Kind</c> (<c>DbWriteCached</c> → <c>DbOutbound</c>,
+    /// otherwise <c>ApiOutbound</c>) and <see cref="SiteCallOperational.Target"/>
+    /// from <c>TargetSummary</c>.
+    /// </para>
+    /// </remarks>
+    /// <param name="sinceUtc">Inclusive lower bound on <c>UpdatedAtUtc</c>; <see cref="DateTime.MinValue"/> reads from the start.</param>
+    /// <param name="batchSize">Maximum number of rows to return (oldest first).</param>
+    /// <param name="ct">Cancellation token.</param>
+    /// <returns>The matching rows projected to <see cref="SiteCallOperational"/>, oldest-first, capped at <paramref name="batchSize"/>.</returns>
+    Task<IReadOnlyList<SiteCallOperational>> ReadChangedSinceAsync(
+        DateTime sinceUtc,
+        int batchSize,
+        CancellationToken ct = default);
 }
diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Integration/PullSiteCallsResponse.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Integration/PullSiteCallsResponse.cs
new file mode 100644
index 00000000..fa3949cb
--- /dev/null
+++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Integration/PullSiteCallsResponse.cs
@@ -0,0 +1,17 @@
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+
+namespace ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
+
+/// <summary>
+/// Site Call Audit (#22) periodic reconciliation pull response: the next batch of
+/// site cached-call operational rows (the eventually-consistent <c>SiteCalls</c>
+/// mirror's self-heal feed) plus a <see cref="MoreAvailable"/> flag signalling
+/// the caller to advance the watermark and pull again. Mirrors
+/// <see cref="PullAuditEventsResponse"/>; carries the central <see cref="SiteCall"/>
+/// entity the ingest path upserts. See Component-SiteCallAudit.md.
+/// </summary>
+/// <param name="SiteCalls">The next batch of operational rows, ordered oldest-first by <see cref="SiteCall.UpdatedAtUtc"/>.</param>
+/// <param name="MoreAvailable">True when the site saturated the requested batch size — the caller should advance the cursor and pull again.</param>
+public sealed record PullSiteCallsResponse(
+    IReadOnlyList<SiteCall> SiteCalls,
+    bool MoreAvailable);
diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs
index ec7a0dbd..265a37eb 100644
--- a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs
@@ -1,5 +1,6 @@
 using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
 using ZB.MOM.WW.ScadaBridge.Commons.Types;
+using Timestamp = Google.Protobuf.WellKnownTypes.Timestamp;
 
 namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
 
@@ -20,10 +21,15 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
 /// Mirrors the sibling .
 ///
 ///
-/// Only the DTO→entity direction is provided: nothing in the system maps a
-/// back onto the wire (sites emit the operational state
-/// from SiteCallOperational, never from the central
-/// entity), so an entity→DTO method would be dead code.
+/// Two directions are provided. <see cref="FromDto"/> rehydrates the central
+/// <see cref="SiteCall"/> entity central writes into the SiteCalls table.
+/// <see cref="ToDto"/> projects a site-local
+/// <see cref="SiteCallOperational"/> onto the wire — used by the Site Call Audit (#22) PullSiteCalls
+/// reconciliation handler (the central→site self-heal pull). The
+/// <see cref="SiteCall"/> entity itself is never mapped back onto the wire:
+/// sites emit operational state from <see cref="SiteCallOperational"/>, never
+/// from the central <see cref="SiteCall"/>, so a SiteCall→DTO method
+/// would be dead code.
 ///
 ///
 /// String nullability convention: proto3 scalar strings cannot be absent, so the
@@ -70,4 +76,54 @@ public static class SiteCallDtoMapper
             IngestedAtUtc = DateTime.UtcNow, // overwritten by AuditLogIngestActor
         };
     }
+
+    /// <summary>
+    /// Projects a site-local <see cref="SiteCallOperational"/> onto its
+    /// wire-format DTO for the Site Call Audit (#22) <c>PullSiteCalls</c>
+    /// reconciliation RPC. The inverse of <see cref="FromDto"/>; null
+    /// <see cref="SiteCallOperational.SourceNode"/> / <see cref="SiteCallOperational.LastError"/>
+    /// collapse to empty strings (proto3 scalar strings cannot be absent), while
+    /// the nullable <c>HttpStatus</c> and <c>TerminalAtUtc</c> stay unset on the
+    /// wire so true-null semantics survive the round-trip back through
+    /// <see cref="FromDto"/>.
+    /// </summary>
+    /// <param name="operational">The site-local operational state to project to wire format.</param>
+    /// <returns>A populated <see cref="SiteCallOperationalDto"/> ready for transmission.</returns>
+    public static SiteCallOperationalDto ToDto(SiteCallOperational operational)
+    {
+        ArgumentNullException.ThrowIfNull(operational);
+
+        var dto = new SiteCallOperationalDto
+        {
+            TrackedOperationId = operational.TrackedOperationId.ToString(),
+            Channel = operational.Channel,
+            Target = operational.Target,
+            SourceSite = operational.SourceSite,
+            SourceNode = operational.SourceNode ?? string.Empty,
+            Status = operational.Status,
+            RetryCount = operational.RetryCount,
+            LastError = operational.LastError ?? string.Empty,
+            CreatedAtUtc = Timestamp.FromDateTime(EnsureUtc(operational.CreatedAtUtc)),
+            UpdatedAtUtc = Timestamp.FromDateTime(EnsureUtc(operational.UpdatedAtUtc)),
+        };
+
+        if (operational.HttpStatus.HasValue)
+        {
+            dto.HttpStatus = operational.HttpStatus.Value;
+        }
+
+        if (operational.TerminalAtUtc.HasValue)
+        {
+            dto.TerminalAtUtc = Timestamp.FromDateTime(EnsureUtc(operational.TerminalAtUtc.Value));
+        }
+
+        return dto;
+    }
+
+    // All ScadaBridge timestamps are UTC by invariant; Timestamp.FromDateTime
+    // requires UTC kind. Specify (never convert) so a row read back from SQLite
+    // with Kind=Utc passes through and a defensively-unspecified value is
+    // treated as the UTC it already is. Mirrors AuditEventDtoMapper.EnsureUtc.
+    private static DateTime EnsureUtc(DateTime value) =>
+        value.Kind == DateTimeKind.Utc ? value : DateTime.SpecifyKind(value, DateTimeKind.Utc);
 }
diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs
index 7cb82444..f9d9aec0 100644
--- a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs
@@ -5,7 +5,9 @@ using Grpc.Core;
 using Microsoft.Extensions.Logging;
 using Microsoft.Extensions.Options;
 using ZB.MOM.WW.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
 using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.Commons.Types;
 using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
 using ZB.MOM.WW.ScadaBridge.Commons.Observability;
 using GrpcStatus = Grpc.Core.Status;
@@ -48,6 +50,14 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
     // the missing queue as "nothing to ship" and returns an empty response so
     // central retries on its next reconciliation cycle.
     private ISiteAuditQueue? _siteAuditQueue;
+    // Site Call Audit (#22): site-local operation-tracking store handed in by
+    // AkkaHostedService on site roles so the central reconciliation puller's
+    // PullSiteCalls RPC can read tracking rows changed since a cursor. Null
+    // when not wired (central-only host or test composing the server in
+    // isolation) — the handler treats the missing store as "nothing to ship"
+    // and returns an empty response so central retries on its next cycle.
+    // Mirrors _siteAuditQueue.
+    private IOperationTrackingStore? _operationTrackingStore;
 
     ///
     /// Test-only constructor — kept internal so the DI container sees a
@@ -137,6 +147,21 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
         _siteAuditQueue = queue;
     }
 
+    /// <summary>
+    /// Hands the site-local <see cref="IOperationTrackingStore"/> (the same
+    /// <c>OperationTrackingStore</c> singleton that backs
+    /// <c>Tracking.Status(id)</c> on the script thread) to the gRPC server so
+    /// the Site Call Audit (#22) <see cref="PullSiteCalls"/> RPC can serve
+    /// central's reconciliation pulls. Mirrors the site-audit-queue setter:
+    /// wired post-construction because the store and the gRPC server are both
+    /// DI singletons brought up in independent orders on site startup.
+    /// </summary>
+    /// <param name="store">The site operation-tracking store for serving reconciliation pulls.</param>
+    public void SetOperationTrackingStore(IOperationTrackingStore store)
+    {
+        _operationTrackingStore = store;
+    }
+
     ///
     /// Host-017 / REQ-HOST-7: signals the gRPC server to begin its part of the
     /// site shutdown sequence — refuse new
@@ -488,6 +513,79 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
         return response;
     }
 
+    /// <inheritdoc />
+    public override async Task<PullSiteCallsResponse> PullSiteCalls(
+        PullSiteCallsRequest request,
+        ServerCallContext context)
+    {
+        var store = _operationTrackingStore;
+        if (store is null)
+        {
+            _logger.LogWarning(
+                "PullSiteCalls invoked before SetOperationTrackingStore was called; returning empty response.");
+            return new PullSiteCallsResponse();
+        }
+
+        if (request.BatchSize <= 0)
+        {
+            // Mirrors PullAuditEvents: reject malformed requests cleanly with
+            // InvalidArgument so the caller doesn't see a generic RpcException
+            // from the underlying SQLite parameter validation.
+            throw new RpcException(new GrpcStatus(
+                StatusCode.InvalidArgument, "batch_size must be > 0"));
+        }
+
+        // since_utc defaults to DateTime.MinValue when the wrapper is absent —
+        // i.e. "pull from the beginning of recorded history", the intended
+        // behaviour for the very first reconciliation cycle. ToUniversalTime
+        // is safe here (the wire value is always a real UTC Timestamp, never the
+        // unspecified-MinValue the central client guards against on its side).
+        var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue;
+
+        IReadOnlyList<SiteCallOperational> operationals;
+        try
+        {
+            operationals = await store.ReadChangedSinceAsync(
+                since, request.BatchSize, context.CancellationToken);
+        }
+        catch (OperationCanceledException) when (context.CancellationToken.IsCancellationRequested)
+        {
+            // The caller went away (client cancel / deadline): nobody is waiting
+            // for this reply, so this is not a store fault. Keep it out of the
+            // Error log and fall through to the same empty response as below.
+            _logger.LogDebug(
+                "PullSiteCalls cancelled by the caller for since={Since} batch={Batch}; returning empty response.",
+                since, request.BatchSize);
+            return new PullSiteCallsResponse();
+        }
+        catch (Exception ex)
+        {
+            // Best-effort, like PullAuditEvents: a read fault must never abort
+            // the reconciliation tick — central retries on its next cycle.
+            _logger.LogError(ex,
+                "ReadChangedSinceAsync failed for since={Since} batch={Batch}; returning empty response.",
+                since, request.BatchSize);
+            return new PullSiteCallsResponse();
+        }
+
+        var response = new PullSiteCallsResponse
+        {
+            // batch_size saturated → tell central to issue a follow-up pull with
+            // an advanced cursor. The site doesn't compute the cursor — central
+            // walks it forward from the last returned UpdatedAtUtc. Unlike
+            // PullAuditEvents there is no MarkReconciled step: the tracking store
+            // is the operational source of truth and the central SiteCalls mirror
+            // is upsert-on-newer, so re-reading rows is a harmless no-op.
+            MoreAvailable = operationals.Count >= request.BatchSize,
+        };
+        foreach (var op in operationals)
+        {
+            response.Operationals.Add(SiteCallDtoMapper.ToDto(op));
+        }
+
+        return response;
+    }
+
     ///
     /// Tracks a single active stream so cleanup only removes its own entry.
/// diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto b/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto index 6beae55c..df9ee7af 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto @@ -10,6 +10,7 @@ service SiteStreamService { rpc IngestAuditEvents(AuditEventBatch) returns (IngestAck); rpc IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck); rpc PullAuditEvents(PullAuditEventsRequest) returns (PullAuditEventsResponse); + rpc PullSiteCalls(PullSiteCallsRequest) returns (PullSiteCallsResponse); } message InstanceStreamRequest { @@ -157,3 +158,20 @@ message PullAuditEventsResponse { repeated AuditEventDto events = 1; bool more_available = 2; } + +// Site Call Audit (#22) reconciliation pull: central→site request for any +// site-local operation-tracking rows whose UpdatedAtUtc >= since_utc — the +// self-heal feed that backfills the eventually-consistent central SiteCalls +// mirror when best-effort push telemetry is lost. Mirrors PullAuditEvents +// but is a SEPARATE RPC (the tracking store is the operational source of +// truth, distinct from the site audit queue). more_available signals +// batch_size was saturated so the caller advances since_utc and pulls again. 
+message PullSiteCallsRequest { + google.protobuf.Timestamp since_utc = 1; + int32 batch_size = 2; +} + +message PullSiteCallsResponse { + repeated SiteCallOperationalDto operationals = 1; + bool more_available = 2; +} diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs index cebfccab..a0e79003 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs @@ -81,23 +81,30 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { "dWVzdBItCglzaW5jZV91dGMYASABKAsyGi5nb29nbGUucHJvdG9idWYuVGlt", "ZXN0YW1wEhIKCmJhdGNoX3NpemUYAiABKAUiXAoXUHVsbEF1ZGl0RXZlbnRz", "UmVzcG9uc2USKQoGZXZlbnRzGAEgAygLMhkuc2l0ZXN0cmVhbS5BdWRpdEV2", - "ZW50RHRvEhYKDm1vcmVfYXZhaWxhYmxlGAIgASgIKlwKB1F1YWxpdHkSFwoT", - "UVVBTElUWV9VTlNQRUNJRklFRBAAEhAKDFFVQUxJVFlfR09PRBABEhUKEVFV", - "QUxJVFlfVU5DRVJUQUlOEAISDwoLUVVBTElUWV9CQUQQAypdCg5BbGFybVN0", - "YXRlRW51bRIbChdBTEFSTV9TVEFURV9VTlNQRUNJRklFRBAAEhYKEkFMQVJN", - "X1NUQVRFX05PUk1BTBABEhYKEkFMQVJNX1NUQVRFX0FDVElWRRACKoUBCg5B", - "bGFybUxldmVsRW51bRIUChBBTEFSTV9MRVZFTF9OT05FEAASEwoPQUxBUk1f", - "TEVWRUxfTE9XEAESFwoTQUxBUk1fTEVWRUxfTE9XX0xPVxACEhQKEEFMQVJN", - "X0xFVkVMX0hJR0gQAxIZChVBTEFSTV9MRVZFTF9ISUdIX0hJR0gQBDLhAgoR", - "U2l0ZVN0cmVhbVNlcnZpY2USVQoRU3Vic2NyaWJlSW5zdGFuY2USIS5zaXRl", - "c3RyZWFtLkluc3RhbmNlU3RyZWFtUmVxdWVzdBobLnNpdGVzdHJlYW0uU2l0", - "ZVN0cmVhbUV2ZW50MAESRwoRSW5nZXN0QXVkaXRFdmVudHMSGy5zaXRlc3Ry", - "ZWFtLkF1ZGl0RXZlbnRCYXRjaBoVLnNpdGVzdHJlYW0uSW5nZXN0QWNrElAK", - "FUluZ2VzdENhY2hlZFRlbGVtZXRyeRIgLnNpdGVzdHJlYW0uQ2FjaGVkVGVs", - "ZW1ldHJ5QmF0Y2gaFS5zaXRlc3RyZWFtLkluZ2VzdEFjaxJaCg9QdWxsQXVk", - "aXRFdmVudHMSIi5zaXRlc3RyZWFtLlB1bGxBdWRpdEV2ZW50c1JlcXVlc3Qa", - "Iy5zaXRlc3RyZWFtLlB1bGxBdWRpdEV2ZW50c1Jlc3BvbnNlQiuqAihaQi5N", - "T00uV1cuU2NhZGFCcmlkZ2UuQ29tbXVuaWNhdGlvbi5HcnBjYgZwcm90bzM=")); + 
"ZW50RHRvEhYKDm1vcmVfYXZhaWxhYmxlGAIgASgIIlkKFFB1bGxTaXRlQ2Fs", + "bHNSZXF1ZXN0Ei0KCXNpbmNlX3V0YxgBIAEoCzIaLmdvb2dsZS5wcm90b2J1", + "Zi5UaW1lc3RhbXASEgoKYmF0Y2hfc2l6ZRgCIAEoBSJpChVQdWxsU2l0ZUNh", + "bGxzUmVzcG9uc2USOAoMb3BlcmF0aW9uYWxzGAEgAygLMiIuc2l0ZXN0cmVh", + "bS5TaXRlQ2FsbE9wZXJhdGlvbmFsRHRvEhYKDm1vcmVfYXZhaWxhYmxlGAIg", + "ASgIKlwKB1F1YWxpdHkSFwoTUVVBTElUWV9VTlNQRUNJRklFRBAAEhAKDFFV", + "QUxJVFlfR09PRBABEhUKEVFVQUxJVFlfVU5DRVJUQUlOEAISDwoLUVVBTElU", + "WV9CQUQQAypdCg5BbGFybVN0YXRlRW51bRIbChdBTEFSTV9TVEFURV9VTlNQ", + "RUNJRklFRBAAEhYKEkFMQVJNX1NUQVRFX05PUk1BTBABEhYKEkFMQVJNX1NU", + "QVRFX0FDVElWRRACKoUBCg5BbGFybUxldmVsRW51bRIUChBBTEFSTV9MRVZF", + "TF9OT05FEAASEwoPQUxBUk1fTEVWRUxfTE9XEAESFwoTQUxBUk1fTEVWRUxf", + "TE9XX0xPVxACEhQKEEFMQVJNX0xFVkVMX0hJR0gQAxIZChVBTEFSTV9MRVZF", + "TF9ISUdIX0hJR0gQBDK3AwoRU2l0ZVN0cmVhbVNlcnZpY2USVQoRU3Vic2Ny", + "aWJlSW5zdGFuY2USIS5zaXRlc3RyZWFtLkluc3RhbmNlU3RyZWFtUmVxdWVz", + "dBobLnNpdGVzdHJlYW0uU2l0ZVN0cmVhbUV2ZW50MAESRwoRSW5nZXN0QXVk", + "aXRFdmVudHMSGy5zaXRlc3RyZWFtLkF1ZGl0RXZlbnRCYXRjaBoVLnNpdGVz", + "dHJlYW0uSW5nZXN0QWNrElAKFUluZ2VzdENhY2hlZFRlbGVtZXRyeRIgLnNp", + "dGVzdHJlYW0uQ2FjaGVkVGVsZW1ldHJ5QmF0Y2gaFS5zaXRlc3RyZWFtLklu", + "Z2VzdEFjaxJaCg9QdWxsQXVkaXRFdmVudHMSIi5zaXRlc3RyZWFtLlB1bGxB", + "dWRpdEV2ZW50c1JlcXVlc3QaIy5zaXRlc3RyZWFtLlB1bGxBdWRpdEV2ZW50", + "c1Jlc3BvbnNlElQKDVB1bGxTaXRlQ2FsbHMSIC5zaXRlc3RyZWFtLlB1bGxT", + "aXRlQ2FsbHNSZXF1ZXN0GiEuc2l0ZXN0cmVhbS5QdWxsU2l0ZUNhbGxzUmVz", + "cG9uc2VCK6oCKFpCLk1PTS5XVy5TY2FkYUJyaWRnZS5Db21tdW5pY2F0aW9u", + "LkdycGNiBnByb3RvMw==")); descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData, new pbr::FileDescriptor[] { global::Google.Protobuf.WellKnownTypes.TimestampReflection.Descriptor, global::Google.Protobuf.WellKnownTypes.WrappersReflection.Descriptor, }, new pbr::GeneratedClrTypeInfo(new[] {typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.Quality), typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.AlarmStateEnum), 
typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.AlarmLevelEnum), }, null, new pbr::GeneratedClrTypeInfo[] { @@ -112,7 +119,9 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryPacket), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryPacket.Parser, new[]{ "AuditEvent", "Operational" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryBatch), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryBatch.Parser, new[]{ "Packets" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest.Parser, new[]{ "SinceUtc", "BatchSize" }, null, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse.Parser, new[]{ "Events", "MoreAvailable" }, null, null, null, null) + new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse.Parser, new[]{ "Events", "MoreAvailable" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest.Parser, new[]{ "SinceUtc", "BatchSize" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse.Parser, new[]{ "Operationals", "MoreAvailable" }, null, null, null, null) })); } #endregion @@ -5064,6 +5073,483 @@ namespace 
ZB.MOM.WW.ScadaBridge.Communication.Grpc { } + /// + /// Site Call Audit (#22) reconciliation pull: central→site request for any + /// site-local operation-tracking rows whose UpdatedAtUtc >= since_utc — the + /// self-heal feed that backfills the eventually-consistent central SiteCalls + /// mirror when best-effort push telemetry is lost. Mirrors PullAuditEvents + /// but is a SEPARATE RPC (the tracking store is the operational source of + /// truth, distinct from the site audit queue). more_available signals + /// batch_size was saturated so the caller advances since_utc and pulls again. + /// + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullSiteCallsRequest : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullSiteCallsRequest()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[12]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsRequest() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + 
[global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsRequest(PullSiteCallsRequest other) : this() { + sinceUtc_ = other.sinceUtc_ != null ? other.sinceUtc_.Clone() : null; + batchSize_ = other.batchSize_; + _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsRequest Clone() { + return new PullSiteCallsRequest(this); + } + + /// Field number for the "since_utc" field. + public const int SinceUtcFieldNumber = 1; + private global::Google.Protobuf.WellKnownTypes.Timestamp sinceUtc_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public global::Google.Protobuf.WellKnownTypes.Timestamp SinceUtc { + get { return sinceUtc_; } + set { + sinceUtc_ = value; + } + } + + /// Field number for the "batch_size" field. + public const int BatchSizeFieldNumber = 2; + private int batchSize_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int BatchSize { + get { return batchSize_; } + set { + batchSize_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullSiteCallsRequest); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullSiteCallsRequest other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if (!object.Equals(SinceUtc, other.SinceUtc)) return false; + if (BatchSize != other.BatchSize) return false; + return Equals(_unknownFields, other._unknownFields); + } + + 
[global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + if (sinceUtc_ != null) hash ^= SinceUtc.GetHashCode(); + if (BatchSize != 0) hash ^= BatchSize.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + if (sinceUtc_ != null) { + size += 1 + pb::CodedOutputStream.ComputeMessageSize(SinceUtc); + } + if (BatchSize != 0) { + size += 1 + 
pb::CodedOutputStream.ComputeInt32Size(BatchSize); + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullSiteCallsRequest other) { + if (other == null) { + return; + } + if (other.sinceUtc_ != null) { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + SinceUtc.MergeFrom(other.SinceUtc); + } + if (other.BatchSize != 0) { + BatchSize = other.BatchSize; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. 
+ return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + } + #endif + + } + + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullSiteCallsResponse : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullSiteCallsResponse()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[13]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsResponse() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsResponse(PullSiteCallsResponse other) : this() { + operationals_ = other.operationals_.Clone(); + moreAvailable_ = other.moreAvailable_; + _unknownFields = 
pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsResponse Clone() { + return new PullSiteCallsResponse(this); + } + + /// Field number for the "operationals" field. + public const int OperationalsFieldNumber = 1; + private static readonly pb::FieldCodec _repeated_operationals_codec + = pb::FieldCodec.ForMessage(10, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteCallOperationalDto.Parser); + private readonly pbc::RepeatedField operationals_ = new pbc::RepeatedField(); + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public pbc::RepeatedField Operationals { + get { return operationals_; } + } + + /// Field number for the "more_available" field. + public const int MoreAvailableFieldNumber = 2; + private bool moreAvailable_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool MoreAvailable { + get { return moreAvailable_; } + set { + moreAvailable_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullSiteCallsResponse); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullSiteCallsResponse other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if(!operationals_.Equals(other.operationals_)) return false; + if (MoreAvailable != other.MoreAvailable) return false; + return Equals(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + 
[global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + hash ^= operationals_.GetHashCode(); + if (MoreAvailable != false) hash ^= MoreAvailable.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + operationals_.WriteTo(output, _repeated_operationals_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + operationals_.WriteTo(ref output, _repeated_operationals_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + size += operationals_.CalculateSize(_repeated_operationals_codec); + if (MoreAvailable != false) { + size += 1 + 1; + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + 
[global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullSiteCallsResponse other) { + if (other == null) { + return; + } + operationals_.Add(other.operationals_); + if (other.MoreAvailable != false) { + MoreAvailable = other.MoreAvailable; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + operationals_.AddEntriesFrom(input, _repeated_operationals_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. 
+ return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + operationals_.AddEntriesFrom(ref input, _repeated_operationals_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + } + #endif + + } + #endregion } diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs index 8993b16a..b57de38e 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs @@ -59,6 +59,10 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsRequest = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsResponse = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullSiteCallsRequest = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullSiteCallsResponse = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, 
global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Method __Method_SubscribeInstance = new grpc::Method( @@ -92,6 +96,14 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { __Marshaller_sitestream_PullAuditEventsRequest, __Marshaller_sitestream_PullAuditEventsResponse); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Method __Method_PullSiteCalls = new grpc::Method( + grpc::MethodType.Unary, + __ServiceName, + "PullSiteCalls", + __Marshaller_sitestream_PullSiteCallsRequest, + __Marshaller_sitestream_PullSiteCallsResponse); + /// Service descriptor public static global::Google.Protobuf.Reflection.ServiceDescriptor Descriptor { @@ -126,6 +138,12 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::System.Threading.Tasks.Task PullSiteCalls(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::ServerCallContext context) + { + throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); + } + } /// Client for SiteStreamService @@ -225,6 +243,26 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { { return CallInvoker.AsyncUnaryCall(__Method_PullAuditEvents, null, options, request); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse PullSiteCalls(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::Metadata headers = null, global::System.DateTime? 
deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullSiteCalls(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse PullSiteCalls(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::CallOptions options) + { + return CallInvoker.BlockingUnaryCall(__Method_PullSiteCalls, null, options, request); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullSiteCallsAsync(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::Metadata headers = null, global::System.DateTime? deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullSiteCallsAsync(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullSiteCallsAsync(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::CallOptions options) + { + return CallInvoker.AsyncUnaryCall(__Method_PullSiteCalls, null, options, request); + } /// Creates a new instance of client from given ClientBaseConfiguration. 
[global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] protected override SiteStreamServiceClient NewInstance(ClientBaseConfiguration configuration) @@ -242,7 +280,8 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { .AddMethod(__Method_SubscribeInstance, serviceImpl.SubscribeInstance) .AddMethod(__Method_IngestAuditEvents, serviceImpl.IngestAuditEvents) .AddMethod(__Method_IngestCachedTelemetry, serviceImpl.IngestCachedTelemetry) - .AddMethod(__Method_PullAuditEvents, serviceImpl.PullAuditEvents).Build(); + .AddMethod(__Method_PullAuditEvents, serviceImpl.PullAuditEvents) + .AddMethod(__Method_PullSiteCalls, serviceImpl.PullSiteCalls).Build(); } /// Register service method with a service binder with or without implementation. Useful when customizing the service binding logic. @@ -256,6 +295,7 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { serviceBinder.AddMethod(__Method_IngestAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestAuditEvents)); serviceBinder.AddMethod(__Method_IngestCachedTelemetry, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestCachedTelemetry)); serviceBinder.AddMethod(__Method_PullAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.PullAuditEvents)); + serviceBinder.AddMethod(__Method_PullSiteCalls, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.PullSiteCalls)); } } diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs index 864f2e63..2c45478f 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs @@ -1009,6 +1009,18 @@ akka {{ // direction one-way (Host knows both; Communication doesn't reach back // into AuditLog). 
grpcServer?.SetSiteAuditQueue(siteAuditQueue);
+            // Site Call Audit (#22): hand the site-local OperationTrackingStore to
+            // the gRPC server so the PullSiteCalls reconciliation RPC can serve
+            // central's self-heal pulls. siteTrackingStore is resolved above with
+            // GetService — present on site composition roots, null on central — so
+            // wire the seam only when the store exists. Like SetSiteAuditQueue, both
+            // the store and the gRPC server are singletons; wiring here keeps the
+            // dependency direction one-way (Host knows both; Communication doesn't
+            // reach back into SiteRuntime).
+            if (siteTrackingStore is not null)
+            {
+                grpcServer?.SetOperationTrackingStore(siteTrackingStore);
+            }
             grpcServer?.SetReady(_actorSystem!);
         }
     }
diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs
index 8cec600b..b3e48c54 100644
--- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs
+++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs
@@ -360,6 +360,100 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
         }
     }
 
+    /// <inheritdoc />
+    /// <remarks>
+    /// Returns at most <paramref name="batchSize"/> rows whose UpdatedAtUtc is at
+    /// or after <paramref name="sinceUtc"/>, oldest first. A non-positive
+    /// <paramref name="batchSize"/> yields an empty list without opening a
+    /// connection: SQLite treats a negative LIMIT as "no upper bound", which would
+    /// turn a bounded reconciliation pull into a full-table read.
+    /// </remarks>
+    public async Task<IReadOnlyList<SiteCallOperational>> ReadChangedSinceAsync(
+        DateTime sinceUtc,
+        int batchSize,
+        CancellationToken ct = default)
+    {
+        ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposeState) != 0, this);
+
+        // LIMIT 0 already returns nothing and a negative LIMIT is unbounded in
+        // SQLite, so short-circuit both: the pull must always stay bounded.
+        if (batchSize <= 0)
+        {
+            return Array.Empty<SiteCallOperational>();
+        }
+
+        // The cursor is compared as text against the stored UpdatedAtUtc strings,
+        // so a Local-kind value (which "o" would render with local wall-clock
+        // digits and an offset) is converted to UTC first. Utc / Unspecified
+        // values are formatted exactly as before.
+        var cursor = sinceUtc.Kind == DateTimeKind.Local
+            ? sinceUtc.ToUniversalTime()
+            : sinceUtc;
+
+        // SiteRuntime-024: like GetStatusAsync, the reconciliation pull opens a
+        // fresh, ungated read connection so a long-running write never blocks
+        // central's PullSiteCalls. The query is a bounded, ordered scan over the
+        // (Status, UpdatedAtUtc) index range — UpdatedAtUtc is the cursor.
+        await using var readConnection = new SqliteConnection(_connectionString);
+        await readConnection.OpenAsync(ct).ConfigureAwait(false);
+
+        await using var cmd = readConnection.CreateCommand();
+        // Inclusive lower bound on UpdatedAtUtc (>=) so a caller resuming from
+        // the last returned timestamp does not skip a row sharing that instant;
+        // central ingest is insert-if-not-exists + upsert-on-newer, so the
+        // boundary row re-read is a no-op. ORDER BY ... ASC + LIMIT yields the
+        // OLDEST matching rows so the cursor advances monotonically.
+        // The TrackedOperationId tie-break makes the page boundary deterministic
+        // when several rows share one UpdatedAtUtc.
+        cmd.CommandText = """
+            SELECT TrackedOperationId, Kind, TargetSummary, Status,
+                   RetryCount, LastError, HttpStatus,
+                   CreatedAtUtc, UpdatedAtUtc, TerminalAtUtc, SourceNode
+            FROM OperationTracking
+            WHERE UpdatedAtUtc >= $since
+            ORDER BY UpdatedAtUtc ASC, TrackedOperationId ASC
+            LIMIT $batchSize;
+            """;
+        cmd.Parameters.AddWithValue(
+            "$since",
+            cursor.ToString("o", CultureInfo.InvariantCulture));
+        cmd.Parameters.AddWithValue("$batchSize", batchSize);
+
+        var rows = new List<SiteCallOperational>();
+        await using var reader = await cmd.ExecuteReaderAsync(ct).ConfigureAwait(false);
+        while (await reader.ReadAsync(ct).ConfigureAwait(false))
+        {
+            var kind = reader.GetString(1);
+            rows.Add(new SiteCallOperational(
+                TrackedOperationId: TrackedOperationId.Parse(reader.GetString(0)),
+                Channel: KindToChannel(kind),
+                Target: reader.IsDBNull(2) ? string.Empty : reader.GetString(2),
+                // The site id is not a tracking-store column; the central client
+                // re-stamps SourceSite from the siteId it dialed.
+                SourceSite: string.Empty,
+                SourceNode: reader.IsDBNull(10) ? null : reader.GetString(10),
+                Status: reader.GetString(3),
+                RetryCount: reader.GetInt32(4),
+                LastError: reader.IsDBNull(5) ? null : reader.GetString(5),
+                HttpStatus: reader.IsDBNull(6) ? null : reader.GetInt32(6),
+                CreatedAtUtc: ParseUtc(reader.GetString(7)),
+                UpdatedAtUtc: ParseUtc(reader.GetString(8)),
+                TerminalAtUtc: reader.IsDBNull(9) ? null : ParseUtc(reader.GetString(9))));
+        }
+
+        return rows;
+    }
+
+    // Cached-call Kind → SiteCalls Channel. Only ApiCallCached / DbWriteCached
+    // ever reach the tracking store (RecordEnqueueAsync is the cached-call
+    // entry point); DbWriteCached maps to DbOutbound, everything else to the
+    // ApiOutbound default. Mirrors CachedCallLifecycleBridge's channel handling.
+    private static string KindToChannel(string kind) => kind switch
+    {
+        nameof(Commons.Types.Enums.AuditKind.DbWriteCached) => nameof(Commons.Types.Enums.AuditChannel.DbOutbound),
+        _ => nameof(Commons.Types.Enums.AuditChannel.ApiOutbound),
+    };
+
     private static DateTime ParseUtc(string raw)
     {
         return DateTime.Parse(
diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs
new file mode 100644
index 00000000..982a9923
--- /dev/null
+++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs
@@ -0,0 +1,215 @@
+using Google.Protobuf.WellKnownTypes;
+using Grpc.Core;
+using Microsoft.Extensions.Logging.Abstractions;
+using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest;
+using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse;
+
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Tests.Central;
+
+///
+/// Tests for — the production
+/// that dials a site over gRPC and issues the
+/// PullSiteCalls unary RPC for the Site Call Audit (#22) reconciliation
+/// loop. The real GrpcChannel is replaced by an injected
+/// seam so the
+/// client's mapping / ordering / SourceSite-restamp / fault-swallowing behaviour
+/// can be asserted without standing up a Kestrel HTTP/2 endpoint. Mirrors
+/// .
+/// +public class GrpcPullSiteCallsClientTests +{ + private static readonly DateTime BaseTime = + new(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + private sealed class FakeInvoker : GrpcPullSiteCallsClient.IPullSiteCallsInvoker + { + public string? Endpoint { get; private set; } + public ProtoPullRequest? Request { get; private set; } + public int CallCount { get; private set; } + + private readonly ProtoPullResponse? _response; + private readonly Exception? _throw; + + private FakeInvoker(ProtoPullResponse? response, Exception? toThrow) + { + _response = response; + _throw = toThrow; + } + + public static FakeInvoker Returning(ProtoPullResponse response) => new(response, null); + public static FakeInvoker Throwing(Exception ex) => new(null, ex); + + public Task InvokeAsync( + string endpoint, ProtoPullRequest request, CancellationToken ct) + { + CallCount++; + Endpoint = endpoint; + Request = request; + if (_throw is not null) + { + throw _throw; + } + return Task.FromResult(_response!); + } + } + + // The site leaves SourceSite empty (it is not a tracking-store column); the + // client re-stamps it from the dialed siteId. Mint DTOs with empty SourceSite + // to prove that re-stamp. 
+ private static SiteCallOperationalDto Dto(Guid id, DateTime updatedAtUtc) => + new() + { + TrackedOperationId = id.ToString(), + Channel = "ApiOutbound", + Target = "ERP.GetOrder", + SourceSite = string.Empty, + SourceNode = "node-a", + Status = "Attempted", + RetryCount = 1, + LastError = string.Empty, + CreatedAtUtc = Timestamp.FromDateTime(BaseTime), + UpdatedAtUtc = Timestamp.FromDateTime(updatedAtUtc), + }; + + [Fact] + public async Task PullAsync_dials_resolved_endpoint_maps_oldest_first_and_restamps_source_site() + { + var older = Guid.NewGuid(); + var newer = Guid.NewGuid(); + + // Wire delivered newest-first on purpose to prove the client sorts. + var proto = new ProtoPullResponse { MoreAvailable = true }; + proto.Operationals.Add(Dto(newer, BaseTime.AddMinutes(5))); + proto.Operationals.Add(Dto(older, BaseTime)); + + var invoker = FakeInvoker.Returning(proto); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + // Endpoint resolution + request shaping. + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + Assert.Equal(256, invoker.Request!.BatchSize); + Assert.Equal(BaseTime, invoker.Request.SinceUtc.ToDateTime()); + + // Mapping + ordering + MoreAvailable surface. + Assert.True(result.MoreAvailable); + Assert.Equal(2, result.SiteCalls.Count); + Assert.Equal(older, result.SiteCalls[0].TrackedOperationId.Value); + Assert.Equal(newer, result.SiteCalls[1].TrackedOperationId.Value); + + // SourceSite re-stamped from the dialed siteId (DTO carried empty). + Assert.Equal("site-a", result.SiteCalls[0].SourceSite); + Assert.Equal("site-a", result.SiteCalls[1].SourceSite); + + // Round-tripped fields survive FromDto. 
+ Assert.Equal("ApiOutbound", result.SiteCalls[0].Channel); + Assert.Equal("node-a", result.SiteCalls[0].SourceNode); + Assert.Equal(1, result.SiteCalls[0].RetryCount); + } + + [Fact] + public async Task PullAsync_returns_empty_when_site_endpoint_is_unknown() + { + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(), // no sites registered + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + Assert.Equal(0, invoker.CallCount); // never dialled — nothing to dial + } + + [Theory] + [InlineData(StatusCode.Unavailable)] + [InlineData(StatusCode.DeadlineExceeded)] + [InlineData(StatusCode.Cancelled)] + public async Task PullAsync_swallows_tolerable_transport_faults_to_empty_response(StatusCode code) + { + var invoker = FakeInvoker.Throwing(new RpcException(new Status(code, "transport fault"))); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_swallows_connection_layer_faults_to_empty_response() + { + var invoker = FakeInvoker.Throwing(new HttpRequestException("connection refused")); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_swallows_unexpected_faults_to_empty_response() + { + var invoker = FakeInvoker.Throwing(new 
InvalidOperationException("boom")); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_with_minvalue_unspecified_cursor_does_not_throw_and_dials() + { + // The reconciliation cursor starts at DateTime.MinValue with + // Kind=Unspecified. EnsureUtc must treat it AS UTC (per the system-wide + // invariant) and NOT call ToUniversalTime() — on a host with a positive + // UTC offset that underflows and Timestamp.FromDateTime throws, crashing + // the FIRST pull for every site. + var minUnspecified = default(DateTime); + Assert.Equal(DateTimeKind.Unspecified, minUnspecified.Kind); + + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", minUnspecified, batchSize: 256, CancellationToken.None); + + Assert.Equal(1, invoker.CallCount); + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + Assert.Equal(DateTime.MinValue, invoker.Request!.SinceUtc.ToDateTime()); + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs new file mode 100644 index 00000000..e6b2bb99 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs @@ -0,0 +1,189 @@ +using Akka.TestKit.Xunit2; +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using NSubstitute; +using 
NSubstitute.ExceptionExtensions; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces; +using ZB.MOM.WW.ScadaBridge.Commons.Types; +using ZB.MOM.WW.ScadaBridge.Communication.Grpc; + +namespace ZB.MOM.WW.ScadaBridge.Communication.Tests; + +/// +/// Tests for (Site Call Audit +/// #22 reconciliation handler). Verifies the request → +/// → response +/// round-trip through the gRPC handler. The store is an NSubstitute stub so the +/// tests never touch SQLite. Mirrors +/// — but there is no MarkReconciled step (the tracking store is the operational +/// source of truth; the central SiteCalls mirror is upsert-on-newer). +/// +public class SiteStreamPullSiteCallsTests : TestKit +{ + private readonly ISiteStreamSubscriber _subscriber = Substitute.For(); + + private SiteStreamGrpcServer CreateServer() => + new(_subscriber, NullLogger.Instance); + + private static ServerCallContext NewContext(CancellationToken ct = default) + { + var context = Substitute.For(); + context.CancellationToken.Returns(ct); + return context; + } + + private static SiteCallOperational NewOperational() => + new( + TrackedOperationId: TrackedOperationId.New(), + Channel: "ApiOutbound", + Target: "ERP.GetOrder", + SourceSite: string.Empty, + SourceNode: "node-a", + Status: "Attempted", + RetryCount: 1, + LastError: null, + HttpStatus: 503, + CreatedAtUtc: DateTime.SpecifyKind(new DateTime(2026, 5, 20, 10, 0, 0), DateTimeKind.Utc), + UpdatedAtUtc: DateTime.SpecifyKind(new DateTime(2026, 5, 20, 10, 1, 0), DateTimeKind.Utc), + TerminalAtUtc: null); + + [Fact] + public async Task PullSiteCalls_NoStoreWired_ReturnsEmptyResponse() + { + var server = CreateServer(); + // Intentionally do NOT call SetOperationTrackingStore — simulates a + // central-only host or a wiring-incomplete startup window. 
+ + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddMinutes(-5)), + BatchSize = 100, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + } + + [Fact] + public async Task PullSiteCalls_With5Rows_ReturnsAllFiveDtos() + { + var store = Substitute.For(); + var rows = Enumerable.Range(0, 5).Select(_ => NewOperational()).ToList(); + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)rows); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, // larger than returned count so MoreAvailable should be false + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Equal(5, response.Operationals.Count); + Assert.False(response.MoreAvailable); // 5 < 100 + var expectedIds = rows.Select(r => r.TrackedOperationId.ToString()).ToHashSet(); + Assert.True(expectedIds.SetEquals(response.Operationals.Select(d => d.TrackedOperationId).ToHashSet())); + } + + [Fact] + public async Task PullSiteCalls_PassesSinceUtcThroughVerbatim() + { + var store = Substitute.For(); + var capturedSince = DateTime.MinValue; + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(call => + { + capturedSince = call.ArgAt(0); + return (IReadOnlyList)Array.Empty(); + }); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var since = DateTime.SpecifyKind(new DateTime(2026, 5, 20, 9, 30, 0), DateTimeKind.Utc); + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(since), + BatchSize = 50, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + Assert.Equal(since, 
capturedSince); + } + + [Fact] + public async Task PullSiteCalls_BatchSize3_Returns3Rows_MoreAvailableTrue() + { + var store = Substitute.For(); + var rows = Enumerable.Range(0, 3).Select(_ => NewOperational()).ToList(); + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)rows); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 3, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Equal(3, response.Operationals.Count); + // saturated batch → central needs to know to issue a follow-up pull + Assert.True(response.MoreAvailable); + } + + [Fact] + public async Task PullSiteCalls_NonPositiveBatchSize_ThrowsInvalidArgument() + { + var store = Substitute.For(); + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 0, + }; + + var ex = await Assert.ThrowsAsync( + () => server.PullSiteCalls(request, NewContext())); + Assert.Equal(StatusCode.InvalidArgument, ex.StatusCode); + } + + [Fact] + public async Task PullSiteCalls_ReadThrows_ReturnsEmptyResponse() + { + // Best-effort: a read fault must never abort the reconciliation tick. + var store = Substitute.For(); + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .ThrowsAsync(new InvalidOperationException("SQLite disposed mid-call")); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, + }; + + // Must NOT throw — the handler swallows the fault to an empty response. 
+ var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs index f9425ec8..a89fc398 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs @@ -439,6 +439,138 @@ public class OperationTrackingStoreTests Assert.NotNull(await store.GetStatusAsync(cId)); // kept (non-terminal) } + // ── Site Call Audit #22: ReadChangedSinceAsync (reconciliation pull) ─── + + [Fact] + public async Task ReadChangedSinceAsync_ReturnsRowsAtOrAfterCursor_OldestFirst() + { + var (store, dataSource) = CreateStore(nameof(ReadChangedSinceAsync_ReturnsRowsAtOrAfterCursor_OldestFirst)); + await using var _store = store; + + // Three rows with distinct UpdatedAtUtc, written out of chronological + // order to prove the read sorts by UpdatedAtUtc ascending. + var older = TrackedOperationId.New(); + var middle = TrackedOperationId.New(); + var newer = TrackedOperationId.New(); + await store.RecordEnqueueAsync(older, nameof(AuditKind.ApiCallCached), "ERP.A", null, null, "node-a"); + await store.RecordEnqueueAsync(middle, nameof(AuditKind.DbWriteCached), "DB.B", null, null, "node-b"); + await store.RecordEnqueueAsync(newer, nameof(AuditKind.ApiCallCached), "ERP.C", null, null, null); + + // Backdate UpdatedAtUtc so the ordering is deterministic and a cursor + // can be placed cleanly between rows. (Enqueue stamps DateTime.UtcNow; + // we cannot inject the clock, so set the timestamps directly.) 
+ var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + SetUpdatedAt(dataSource, older, t0); + SetUpdatedAt(dataSource, middle, t0.AddMinutes(10)); + SetUpdatedAt(dataSource, newer, t0.AddMinutes(20)); + + // Cursor at the middle row's UpdatedAtUtc: inclusive lower bound, so + // middle + newer come back, older is excluded. + var result = await store.ReadChangedSinceAsync(t0.AddMinutes(10), batchSize: 100, CancellationToken.None); + + Assert.Equal(2, result.Count); + Assert.Equal(middle, result[0].TrackedOperationId); + Assert.Equal(newer, result[1].TrackedOperationId); + Assert.True(result[0].UpdatedAtUtc <= result[1].UpdatedAtUtc); + } + + [Fact] + public async Task ReadChangedSinceAsync_FromMinValue_ReturnsAllRows() + { + var (store, _) = CreateStore(nameof(ReadChangedSinceAsync_FromMinValue_ReturnsAllRows)); + await using var _store = store; + + await store.RecordEnqueueAsync(TrackedOperationId.New(), nameof(AuditKind.ApiCallCached), "A", null, null, null); + await store.RecordEnqueueAsync(TrackedOperationId.New(), nameof(AuditKind.ApiCallCached), "B", null, null, null); + + var result = await store.ReadChangedSinceAsync(DateTime.MinValue, batchSize: 100, CancellationToken.None); + + Assert.Equal(2, result.Count); + } + + [Fact] + public async Task ReadChangedSinceAsync_IsBatchCapped() + { + var (store, dataSource) = CreateStore(nameof(ReadChangedSinceAsync_IsBatchCapped)); + await using var _store = store; + + var ids = new List(); + var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + for (var i = 0; i < 5; i++) + { + var id = TrackedOperationId.New(); + ids.Add(id); + await store.RecordEnqueueAsync(id, nameof(AuditKind.ApiCallCached), $"T{i}", null, null, null); + SetUpdatedAt(dataSource, id, t0.AddMinutes(i)); + } + + var result = await store.ReadChangedSinceAsync(DateTime.MinValue, batchSize: 3, CancellationToken.None); + + // Capped to 3 — and the cap takes the OLDEST 3 (asc order) so the + // caller can advance the cursor 
monotonically across follow-up pulls. + Assert.Equal(3, result.Count); + Assert.Equal(ids[0], result[0].TrackedOperationId); + Assert.Equal(ids[1], result[1].TrackedOperationId); + Assert.Equal(ids[2], result[2].TrackedOperationId); + } + + [Fact] + public async Task ReadChangedSinceAsync_MapsTrackingRowOntoSiteCallOperational() + { + var (store, _) = CreateStore(nameof(ReadChangedSinceAsync_MapsTrackingRowOntoSiteCallOperational)); + await using var _store = store; + + var apiId = TrackedOperationId.New(); + var dbId = TrackedOperationId.New(); + await store.RecordEnqueueAsync(apiId, nameof(AuditKind.ApiCallCached), "ERP.GetOrder", "inst-1", "ScriptActor:OnTick", "node-a"); + await store.RecordEnqueueAsync(dbId, nameof(AuditKind.DbWriteCached), "Historian.Write", null, null, "node-b"); + await store.RecordAttemptAsync(apiId, nameof(AuditStatus.Attempted), 2, "HTTP 503", 503); + await store.RecordTerminalAsync(dbId, nameof(AuditStatus.Parked), "max retries", null); + + var result = await store.ReadChangedSinceAsync(DateTime.MinValue, batchSize: 100, CancellationToken.None); + var api = result.Single(r => r.TrackedOperationId == apiId); + var db = result.Single(r => r.TrackedOperationId == dbId); + + // Kind → Channel projection. + Assert.Equal("ApiOutbound", api.Channel); + Assert.Equal("DbOutbound", db.Channel); + + // TargetSummary → Target; SourceNode carried verbatim. + Assert.Equal("ERP.GetOrder", api.Target); + Assert.Equal("node-a", api.SourceNode); + Assert.Equal("node-b", db.SourceNode); + + // Status / RetryCount / LastError / HttpStatus carried through. + Assert.Equal(nameof(AuditStatus.Attempted), api.Status); + Assert.Equal(2, api.RetryCount); + Assert.Equal("HTTP 503", api.LastError); + Assert.Equal(503, api.HttpStatus); + + // SourceSite is left empty by the store (the site id is not a tracking + // column); the central client re-stamps it from the dialed siteId. 
+ Assert.Equal(string.Empty, api.SourceSite); + + // Terminal row carries TerminalAtUtc (UTC kind); active row leaves it null. + Assert.Null(api.TerminalAtUtc); + Assert.NotNull(db.TerminalAtUtc); + Assert.Equal(DateTimeKind.Utc, db.TerminalAtUtc!.Value.Kind); + + // Timestamps round-trip as UTC. + Assert.Equal(DateTimeKind.Utc, api.CreatedAtUtc.Kind); + Assert.Equal(DateTimeKind.Utc, api.UpdatedAtUtc.Kind); + } + + /// Directly sets a row's UpdatedAtUtc so cursor/ordering tests are deterministic. + private static void SetUpdatedAt(string dataSource, TrackedOperationId id, DateTime updatedAtUtc) + { + using var connection = OpenVerifierConnection(dataSource); + using var cmd = connection.CreateCommand(); + cmd.CommandText = "UPDATE OperationTracking SET UpdatedAtUtc = $u WHERE TrackedOperationId = $id;"; + cmd.Parameters.AddWithValue("$u", updatedAtUtc.ToString("o", System.Globalization.CultureInfo.InvariantCulture)); + cmd.Parameters.AddWithValue("$id", id.ToString()); + cmd.ExecuteNonQuery(); + } + // ── SiteRuntime-024: read/write split + sync-safe Dispose ────────────── [Fact] From 6b0140dd62870dba4b9166922658485dd6bc0cba Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 10:47:25 -0400 Subject: [PATCH 06/14] fix(sitecallaudit): UpdatedAtUtc index + per-row pull resilience + UTC-convention + first-cycle test (review) --- .../Central/GrpcPullSiteCallsClient.cs | 39 +++++++++++++------ .../Grpc/SiteStreamGrpcServer.cs | 12 +++--- .../Tracking/OperationTrackingStore.cs | 20 +++++++--- .../Central/GrpcPullSiteCallsClientTests.cs | 36 +++++++++++++++++ .../SiteStreamPullSiteCallsTests.cs | 32 +++++++++++++++ 5 files changed, 118 insertions(+), 21 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs index b483b1a5..350ee1ac 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs +++ 
b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs @@ -41,8 +41,10 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// , , /// bare / SocketException) are caught /// and collapsed to an empty response so one offline site never sinks the rest -/// of the reconciliation tick. Any other fault (e.g. a malformed reply that -/// fails DTO mapping) is also swallowed to empty: reconciliation is best-effort. +/// of the reconciliation tick. Any other transport/protocol fault is also +/// swallowed to empty: reconciliation is best-effort. Per-row DTO mapping faults +/// (e.g. a single unparseable TrackedOperationId) are narrower still — +/// the offending row is skipped+logged and the rest of the batch is returned. /// /// /// Testability. The unary call is reached through the @@ -138,15 +140,30 @@ public sealed class GrpcPullSiteCallsClient : IPullSiteCallsClient return Empty; } - // Map proto DTOs to central SiteCall entities, re-stamp SourceSite from - // the dialed siteId (the site leaves it empty), and order oldest-first by - // UpdatedAtUtc (the wire is already ordered by the site read, but the - // contract is explicit, so sort defensively). - var siteCalls = reply.Operationals - .Select(SiteCallDtoMapper.FromDto) - .Select(sc => sc with { SourceSite = siteId }) - .OrderBy(sc => sc.UpdatedAtUtc) - .ToList(); + // Map proto DTOs to central SiteCall entities PER-ROW so one malformed + // operational (e.g. an unparseable TrackedOperationId) is skipped+logged + // rather than sinking the whole batch through the outer catch-all. Each + // survivor is re-stamped with SourceSite from the dialed siteId (the site + // leaves it empty). 
+ var siteCalls = new List(reply.Operationals.Count); + foreach (var dto in reply.Operationals) + { + try + { + var sc = SiteCallDtoMapper.FromDto(dto) with { SourceSite = siteId }; + siteCalls.Add(sc); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "PullSiteCalls dropped a malformed operational row from site {SiteId} (id='{Id}'); continuing with the rest of the batch.", + siteId, dto.TrackedOperationId); + } + } + + // Order oldest-first by UpdatedAtUtc (the wire is already ordered by the + // site read, but the contract is explicit, so sort defensively). + siteCalls.Sort((a, b) => a.UpdatedAtUtc.CompareTo(b.UpdatedAtUtc)); return new PullSiteCallsResponse(siteCalls, reply.MoreAvailable); } diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs index f9d9aec0..7aedd140 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs @@ -457,7 +457,9 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase // sinceUtc defaults to DateTime.MinValue when the wrapper is absent — // i.e. "pull from the beginning of recorded history", which is the // intended behaviour for the very first reconciliation cycle. - var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue; + var since = request.SinceUtc is not null + ? DateTime.SpecifyKind(request.SinceUtc.ToDateTime(), DateTimeKind.Utc) + : DateTime.MinValue; IReadOnlyList events; try @@ -537,10 +539,10 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase // since_utc defaults to DateTime.MinValue when the wrapper is absent — // i.e. "pull from the beginning of recorded history", the intended - // behaviour for the very first reconciliation cycle. 
ToUniversalTime - // is safe here (the wire value is always a real UTC Timestamp, never the - // unspecified-MinValue the central client guards against on its side). - var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue; + // behaviour for the very first reconciliation cycle. + var since = request.SinceUtc is not null + ? DateTime.SpecifyKind(request.SinceUtc.ToDateTime(), DateTimeKind.Utc) + : DateTime.MinValue; IReadOnlyList operationals; try diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs index b3e48c54..c48964b9 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs @@ -91,6 +91,8 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable, ); CREATE INDEX IF NOT EXISTS IX_OperationTracking_Status_Updated ON OperationTracking (Status, UpdatedAtUtc); + CREATE INDEX IF NOT EXISTS IX_OperationTracking_UpdatedAt + ON OperationTracking (UpdatedAtUtc); """; cmd.ExecuteNonQuery(); @@ -370,8 +372,10 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable, // SiteRuntime-024: like GetStatusAsync, the reconciliation pull opens a // fresh, ungated read connection so a long-running write never blocks - // central's PullSiteCalls. The query is a bounded, ordered scan over the - // (Status, UpdatedAtUtc) index range — UpdatedAtUtc is the cursor. + // central's PullSiteCalls. The query is a bounded, ordered scan served by + // the standalone IX_OperationTracking_UpdatedAt index — UpdatedAtUtc is + // the cursor. (The composite (Status, UpdatedAtUtc) index cannot satisfy a + // status-less UpdatedAtUtc range scan; this dedicated index does.) 
await using var readConnection = new SqliteConnection(_connectionString); await readConnection.OpenAsync(ct).ConfigureAwait(false); @@ -390,9 +394,15 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable, ORDER BY UpdatedAtUtc ASC LIMIT $batchSize; """; - cmd.Parameters.AddWithValue( - "$since", - sinceUtc.ToString("o", CultureInfo.InvariantCulture)); + // Force UTC kind before formatting so the cursor's "o" text matches the + // 'Z'-suffixed round-trip form the write path persists (DateTime.UtcNow + // .ToString("o")). A first-cycle DateTime.MinValue arrives Unspecified — + // without this its "o" rendering would lack the 'Z', and the SQLite text + // compare against 'Z'-suffixed stored values would be subtly inconsistent. + var sinceText = DateTime + .SpecifyKind(sinceUtc, DateTimeKind.Utc) + .ToString("o", CultureInfo.InvariantCulture); + cmd.Parameters.AddWithValue("$since", sinceText); cmd.Parameters.AddWithValue("$batchSize", batchSize); var rows = new List(); diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs index 982a9923..650b4a15 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs @@ -186,6 +186,42 @@ public class GrpcPullSiteCallsClientTests Assert.False(result.MoreAvailable); } + [Fact] + public async Task PullAsync_skips_poison_row_and_returns_the_good_rows() + { + // Poison-row resilience: one malformed operational (an unparseable + // TrackedOperationId fails SiteCallDtoMapper.FromDto → Guid.Parse) must be + // skipped+logged PER ROW rather than sinking the whole batch through the + // outer catch-all. The two good rows survive, re-stamped + oldest-first. 
+ var older = Guid.NewGuid(); + var newer = Guid.NewGuid(); + + var proto = new ProtoPullResponse { MoreAvailable = false }; + proto.Operationals.Add(Dto(newer, BaseTime.AddMinutes(5))); + // Malformed row in the middle of the batch. + var bad = Dto(Guid.NewGuid(), BaseTime.AddMinutes(2)); + bad.TrackedOperationId = "not-a-guid"; + proto.Operationals.Add(bad); + proto.Operationals.Add(Dto(older, BaseTime)); + + var invoker = FakeInvoker.Returning(proto); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + // Must NOT throw — the bad row is dropped, the good rows are returned. + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Equal(2, result.SiteCalls.Count); + // Survivors are oldest-first and SourceSite re-stamped from the dialed siteId. + Assert.Equal(older, result.SiteCalls[0].TrackedOperationId.Value); + Assert.Equal(newer, result.SiteCalls[1].TrackedOperationId.Value); + Assert.Equal("site-a", result.SiteCalls[0].SourceSite); + Assert.Equal("site-a", result.SiteCalls[1].SourceSite); + Assert.False(result.MoreAvailable); + } + [Fact] public async Task PullAsync_with_minvalue_unspecified_cursor_does_not_throw_and_dials() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs index e6b2bb99..45104a84 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs @@ -121,6 +121,38 @@ public class SiteStreamPullSiteCallsTests : TestKit Assert.Equal(since, capturedSince); } + [Fact] + public async Task PullSiteCalls_SinceUtcUnset_PassesDateTimeMinValue() + { + // First reconciliation cycle: central has no cursor yet, so the request's + // SinceUtc wrapper is absent (null). 
The handler must default to + // DateTime.MinValue ("pull from the beginning of recorded history") + // without a null-deref — this proves the very first cycle doesn't crash. + var store = Substitute.For(); + var captured = new DateTime(2099, 1, 1, 0, 0, 0, DateTimeKind.Utc); // sentinel + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(call => + { + captured = call.ArgAt(0); + return (IReadOnlyList)Array.Empty(); + }); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + // SinceUtc intentionally left unset (null) — the proto wrapper is absent. + var request = new PullSiteCallsRequest + { + BatchSize = 100, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + Assert.Equal(DateTime.MinValue, captured); + } + [Fact] public async Task PullSiteCalls_BatchSize3_Returns3Rows_MoreAvailableTrue() { From e427b38fb3794aa4f7704c13d74a95d23961b7fb Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:01:22 -0400 Subject: [PATCH 07/14] feat(sitecallaudit): periodic reconciliation pull back-fills lost telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a periodic reconciliation tick to SiteCallAuditActor that, per site, pulls changed SiteCall rows since a per-site UpdatedAtUtc cursor and upserts them idempotently (monotonic UpsertAsync) — the documented self-heal for lost best-effort gRPC telemetry. Mirrors SiteAuditReconciliationActor's structure (per-site cursor, per-site try/catch failure isolation, advance cursor by max observed UpdatedAtUtc) minus the stalled-detection EventStream machinery. 
Dependency wiring: add an acyclic SiteCallAudit -> AuditLog project reference and resolve IPullSiteCallsClient + ISiteEnumerator (central-only singletons registered by AddAuditLogCentralReconciliationClient) from the IServiceProvider the production ctor already holds — no Host Props.Create change needed. The repo-only test ctor injects neither collaborator, so the tick is gated off there. A new public test ctor injects fake client + enumerator + repo so the tick is unit-testable in-memory (public, not internal: Akka's ActivatorProducer uses public-only reflection binding). Options: ReconciliationInterval (default 5 min, clamped >= 1s so a zero config value can't spin the scheduler) + ReconciliationBatchSize (default 500), plus a test-only override that bypasses the clamp for millisecond cadences. Tests (all in-memory, no live MSSQL): absent row is upserted on a tick; second tick advances the cursor past already-pulled rows; one failing site does not sink other sites; repo-only ctor does not start the tick. 
--- .../SiteCallAuditActor.cs | 268 +++++++++++++++- .../SiteCallAuditOptions.cs | 58 +++- ...ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj | 9 + .../SiteCallAuditReconciliationTests.cs | 300 ++++++++++++++++++ 4 files changed, 623 insertions(+), 12 deletions(-) create mode 100644 tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs index 320a7227..a4a31cf2 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs @@ -1,6 +1,7 @@ using Akka.Actor; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit; @@ -24,13 +25,17 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit; /// /// /// Implemented: direct telemetry ingest, -/// query, detail and KPI handlers (Task 4), and the central→site Retry/Discard -/// relay (Task 5 — the relay handlers live in this actor). Deferred (per -/// CLAUDE.md scope discipline — both land in a later follow-up): the periodic -/// per-site reconciliation puller that backfills lost telemetry, and the daily -/// terminal-row purge scheduler (the repository exposes -/// PurgeTerminalAsync but nothing in this module currently invokes it -/// on a schedule). +/// query, detail and KPI handlers (Task 4), the central→site Retry/Discard +/// relay (Task 5 — the relay handlers live in this actor), and the periodic +/// per-site reconciliation puller that backfills lost telemetry (Piece A — +/// , the documented self-heal pull). 
The +/// reconciliation timer is started in and gates on the +/// reconciliation collaborators ( + +/// ) being available — the repo-only test ctor +/// injects neither, so the timer does not run there. Deferred (next commit): +/// the daily terminal-row purge scheduler (the repository exposes +/// PurgeTerminalAsync but nothing in this module invokes it on a timer +/// yet). /// /// /// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" — @@ -68,6 +73,34 @@ public class SiteCallAuditActor : ReceiveActor private readonly SiteCallAuditOptions _options; private readonly ILogger _logger; + /// + /// Reconciliation collaborators (Piece A). The per-site self-heal pull + /// () and the site list + /// (). On the production path these are + /// resolved once from the root (central + /// singletons registered by AddAuditLogCentralReconciliationClient); + /// in the test path they are injected directly. They are null when + /// the actor was built via the repo-only test ctor — in that case the + /// reconciliation tick is NOT started (see ). + /// + private readonly IPullSiteCallsClient? _pullClient; + private readonly ISiteEnumerator? _siteEnumerator; + + /// + /// Per-site reconciliation watermark — the highest + /// seen for that site on a previous + /// tick. The next tick asks for rows at or after this cursor; idempotent + /// monotonic swallows any + /// duplicate-with-same-timestamp rows. In-memory for the singleton's + /// lifetime — a failover / restart resets every cursor to + /// , which is conservative but correct + /// (the next tick re-pulls and idempotent upsert dedupes). Mirrors + /// SiteAuditReconciliationActor. + /// + private readonly Dictionary _reconciliationCursors = new(); + + private ICancelable? _reconciliationTimer; + /// /// Task 5 (#22): the central→site command transport — the /// CentralCommunicationActor, which owns the per-site @@ -87,6 +120,11 @@ public class SiteCallAuditActor : ReceiveActor /// across every message. 
Used by Bundle C's MSSQL-backed TestKit fixture. /// An optional lets a test pin the stuck/KPI /// windows; when omitted the production defaults apply. + /// + /// This ctor injects NO reconciliation client/enumerator, so the + /// reconciliation tick is gated off (see ) + /// — the MSSQL-backed read/upsert tests must not fire phantom pulls. + /// /// /// Concrete repository instance to use for all messages. /// Logger for diagnostics and error reporting. @@ -106,6 +144,48 @@ public class SiteCallAuditActor : ReceiveActor RegisterHandlers(); } + /// + /// Test-mode constructor for the reconciliation tick (Piece A) — injects a + /// concrete repository PLUS the two reconciliation collaborators directly, + /// so the per-site self-heal pull is unit-testable in-memory without a DI + /// container or a live gRPC channel. Because the client + enumerator are + /// present, the reconciliation tick IS started (it gates on the + /// collaborators being available — see ). + /// + /// Concrete repository instance used for upserts and purges. + /// Enumerates the sites to reconcile each tick. + /// Pull client used to fetch changed rows from each site. + /// Logger for diagnostics and error reporting. + /// Optional configuration overrides; production defaults apply when null. + /// + /// Public (not internal) because Akka's default ActivatorProducer + /// instantiates the actor via reflection with public-only binding flags — + /// an internal ctor yields a MissingMethodException at actor + /// creation. Distinguished from the production + /// ctor by its concrete-collaborator parameter list; only the test project + /// (or a host that hand-resolves the collaborators) constructs it this way. + /// + public SiteCallAuditActor( + ISiteCallAuditRepository repository, + ISiteEnumerator siteEnumerator, + IPullSiteCallsClient pullClient, + ILogger logger, + SiteCallAuditOptions? 
options = null) + { + ArgumentNullException.ThrowIfNull(repository); + ArgumentNullException.ThrowIfNull(siteEnumerator); + ArgumentNullException.ThrowIfNull(pullClient); + ArgumentNullException.ThrowIfNull(logger); + + _injectedRepository = repository; + _siteEnumerator = siteEnumerator; + _pullClient = pullClient; + _logger = logger; + _options = options ?? new SiteCallAuditOptions(); + + RegisterHandlers(); + } + /// /// Production constructor — resolves /// from a fresh DI scope per message because the repository is a scoped EF @@ -129,6 +209,17 @@ public class SiteCallAuditActor : ReceiveActor _options = options; _logger = logger; + // Reconciliation collaborators (Piece A) are central-only singletons + // registered by AddAuditLogCentralReconciliationClient — always on the + // central composition root (Program.cs). Resolve them once here (the + // actor itself is a long-lived singleton; the repository is the only + // scoped service and is still resolved per-tick/per-message). GetService + // (not GetRequiredService) so a host that somehow omits the helper + // degrades to "no reconciliation tick" rather than a startup crash — + // the tick startup gates on both being non-null. + _pullClient = serviceProvider.GetService(); + _siteEnumerator = serviceProvider.GetService(); + RegisterHandlers(); } @@ -154,6 +245,49 @@ public class SiteCallAuditActor : ReceiveActor }); Receive(HandleRetrySiteCall); Receive(HandleDiscardSiteCall); + + // Piece A (#22): self-tick for the periodic reconciliation pull. The + // handler stays alive across faults via its own per-site try/catch + // (mirroring the ingest path); the timer is only started when the + // reconciliation collaborators are available. 
+ ReceiveAsync(_ => OnReconciliationTickAsync()); + } + + /// + protected override void PreStart() + { + base.PreStart(); + StartReconciliationTimer(); + } + + /// + protected override void PostStop() + { + _reconciliationTimer?.Cancel(); + base.PostStop(); + } + + /// + /// Starts the periodic reconciliation tick — but ONLY when both the pull + /// client and the site enumerator are available. The repo-only test ctor + /// injects neither, so the tick is gated off there (the MSSQL read/upsert + /// tests must not fire phantom pulls); the reconciliation test ctor and the + /// production ctor (which resolves both from the SP) start it. + /// + private void StartReconciliationTimer() + { + if (_pullClient is null || _siteEnumerator is null) + { + return; + } + + var interval = _options.ResolvedReconciliationInterval; + _reconciliationTimer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: ReconciliationTick.Instance, + sender: Self); } /// @@ -212,6 +346,119 @@ public class SiteCallAuditActor : ReceiveActor } } + // ── Piece A: periodic per-site reconciliation pull (self-heal) ── + + /// + /// One reconciliation pass: enumerate every known site and, per site, pull + /// changed rows since that site's cursor and upsert + /// them idempotently — the documented self-heal when best-effort gRPC push + /// telemetry is lost. This is a mirror, NOT a dispatcher: cached-call + /// delivery stays site-local; upserting reconciled rows only refreshes the + /// eventually-consistent central SiteCalls mirror. + /// + /// + /// Mirrors SiteAuditReconciliationActor's structure (per-site cursor, + /// per-site try/catch failure isolation, advance the cursor by the max + /// observed ) but is deliberately simpler: + /// no stalled-detection EventStream machinery — just cursor + pull + upsert + /// + advance. 
One DI scope per tick is opened and the same repository reused + /// across every site in that tick. + /// + private async Task OnReconciliationTickAsync() + { + // The collaborators are guaranteed non-null: the tick is only scheduled + // when both are present (StartReconciliationTimer). Assert via the + // local copies so a future refactor that drops the gate fails loudly. + var enumerator = _siteEnumerator!; + var client = _pullClient!; + + IReadOnlyList sites; + try + { + // No ambient CancellationToken in a ReceiveActor handler — None is + // intentional; the work is bounded by the reconciliation interval + // plus the singleton's graceful-stop drain on PhaseClusterLeave. + sites = await enumerator.EnumerateAsync().ConfigureAwait(false); + } + catch (Exception ex) + { + _logger.LogError(ex, "SiteCallAudit site enumeration failed; skipping reconciliation tick."); + return; + } + + if (sites.Count == 0) + { + return; + } + + var (scope, repository) = ResolveRepository(); + try + { + foreach (var site in sites) + { + try + { + await ReconcileSiteAsync(site, client, repository).ConfigureAwait(false); + } + catch (Exception ex) + { + // Failure-isolation invariant: one site's fault (transport, + // repository write) must NOT sink the rest of the tick. The + // failing site's cursor is left at its previous value so the + // next tick retries the same window. + _logger.LogWarning( + ex, + "SiteCallAudit reconciliation pull failed for site {SiteId}; other sites continue.", + site.SiteId); + } + } + } + finally + { + scope?.Dispose(); + } + } + + /// + /// Issues one PullSiteCalls RPC against the site, upserts the + /// returned rows idempotently, and advances the site's cursor to the maximum + /// observed. The pull client returns rows + /// oldest-first with SourceSite already re-stamped from the dialed + /// site id, so the actor upserts them verbatim (re-stamping + /// IngestedAtUtc at central persist time, as the telemetry path does). 
+ /// + private async Task ReconcileSiteAsync( + SiteEntry site, IPullSiteCallsClient client, ISiteCallAuditRepository repository) + { + var since = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue; + var response = await client + .PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None) + .ConfigureAwait(false); + + var maxUpdated = since; + var nowUtc = DateTime.UtcNow; + foreach (var row in response.SiteCalls) + { + // IngestedAtUtc is the "central ingested (or last refreshed) this + // row" stamp — owned by the central actor, exactly as OnUpsertAsync + // does for the telemetry path. Monotonic UpsertAsync makes a row + // already present (from a prior push) a silent no-op. + var siteCall = row with { IngestedAtUtc = nowUtc }; + await repository.UpsertAsync(siteCall).ConfigureAwait(false); + + if (row.UpdatedAtUtc > maxUpdated) + { + maxUpdated = row.UpdatedAtUtc; + } + } + + // Advance the cursor to the newest row seen. A MoreAvailable response + // means the site saturated the batch; the next tick continues draining + // from the advanced cursor (no immediate re-pull loop — the natural + // tick cadence drains the backlog, matching SiteAuditReconciliationActor). + _reconciliationCursors[site.SiteId] = maxUpdated; + } + // ── Task 4: read-side (query / detail / KPI) ── /// @@ -693,6 +940,13 @@ public class SiteCallAuditActor : ReceiveActor { return string.IsNullOrWhiteSpace(value) ? null : value; } + + /// Self-tick triggering a reconciliation pass across all sites (Piece A). 
+ internal sealed class ReconciliationTick + { + public static readonly ReconciliationTick Instance = new(); + private ReconciliationTick() { } + } } /// diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs index a5db3102..134e606e 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs @@ -1,11 +1,11 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit; /// -/// Configuration options for the Site Call Audit (#22) read-side: stuck-call -/// detection and KPI windowing. Mirrors the KPI-relevant subset of -/// NotificationOutboxOptions — the reconciliation, purge and dispatch -/// cadence options the Notification Outbox carries are not part of the Site -/// Call Audit read-side backend and are deliberately omitted here. +/// Configuration options for the Site Call Audit (#22): stuck-call detection + +/// KPI windowing for the read-side, plus the cadence knobs for the periodic +/// per-site reconciliation pull (self-heal for lost telemetry). Mirrors the +/// KPI-relevant subset of NotificationOutboxOptions and the +/// scheduler-cadence shape of SiteAuditReconciliationOptions. /// public class SiteCallAuditOptions { @@ -44,4 +44,52 @@ public class SiteCallAuditOptions /// /// public TimeSpan RelayTimeout { get; set; } = TimeSpan.FromSeconds(10); + + // ── Reconciliation tick (#22): periodic per-site self-heal pull ── + + /// + /// Period of the reconciliation tick. Each tick visits every known site + /// once, pulls changed SiteCall rows since a per-site cursor, and + /// upserts them idempotently — the documented self-heal when best-effort + /// push telemetry is lost. Default 5 minutes, matching the sibling + /// SiteAuditReconciliationOptions (#23) cadence. Clamped to at least + /// via . 
+ /// + public TimeSpan ReconciliationInterval { get; set; } = TimeSpan.FromMinutes(5); + + /// + /// Test-only override for the reconciliation tick cadence — bypasses the + /// clamp so unit tests can drop the + /// cadence to milliseconds. Production config never sets this; leave null. + /// + public TimeSpan? ReconciliationIntervalOverride { get; set; } + + /// + /// Maximum number of SiteCall rows requested per PullSiteCalls + /// RPC. Default 500. A MoreAvailable=true response signals the cursor + /// advanced and the next tick should keep draining the backlog. + /// + public int ReconciliationBatchSize { get; set; } = 500; + + /// + /// Minimum interval the config-bound can + /// resolve to. Clamps a misconfigured 0 (or negative) value away from + /// , which would make Akka's + /// ScheduleTellRepeatedlyCancelable spin — the exact footgun flagged in + /// a prior review of the sibling reconciliation options. + /// + private static readonly TimeSpan MinReconciliationInterval = TimeSpan.FromSeconds(1); + + /// + /// Resolves the effective reconciliation tick interval: the test override + /// when set (bypassing the clamp), otherwise + /// clamped to at least so a + /// zero/negative config value can never yield . + /// + public TimeSpan ResolvedReconciliationInterval => + ReconciliationIntervalOverride is { } o + ? o + : ReconciliationInterval < MinReconciliationInterval + ? MinReconciliationInterval + : ReconciliationInterval; } diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj index cca4f3eb..c2de6728 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj @@ -29,6 +29,15 @@ the same transport every other central→site command uses. 
SiteEnvelope is defined in ZB.MOM.WW.ScadaBridge.Communication (no cycle: Communication does not reference SiteCallAudit). --> + + diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs new file mode 100644 index 00000000..ac2f86b0 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs @@ -0,0 +1,300 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; +using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration; +using ZB.MOM.WW.ScadaBridge.Commons.Types; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit; + +namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests; + +/// +/// Reconciliation-tick tests for (#22, Piece A). +/// These exercise the periodic per-site self-heal pull entirely in-memory — +/// fake + + a +/// recording — so they run in +/// milliseconds and do NOT depend on a live MSSQL fixture (unlike the +/// MSSQL-backed ). The actor is built via +/// the public test ctor that injects all three collaborators; the +/// repo-only test ctor used by the MSSQL tests passes no client/enumerator, so +/// the reconciliation tick is gated off there (see +/// ). +/// +public class SiteCallAuditReconciliationTests : TestKit +{ + private static SiteCall NewRow( + TrackedOperationId id, + string sourceSite, + string status = "Submitted", + DateTime? updatedAtUtc = null) + { + var now = updatedAtUtc ??
DateTime.UtcNow; + return new SiteCall + { + TrackedOperationId = id, + Channel = "ApiOutbound", + Target = "ERP.GetOrder", + SourceSite = sourceSite, + SourceNode = null, + Status = status, + RetryCount = 0, + LastError = null, + HttpStatus = null, + CreatedAtUtc = now, + UpdatedAtUtc = now, + TerminalAtUtc = null, + IngestedAtUtc = now, + }; + } + + private static SiteCallAuditOptions FastTickOptions(int batchSize = 500) => new() + { + // 100 ms tick keeps each test under a second; AwaitAssert covers + // scheduler jitter so the tick has up to a few seconds to fire. + ReconciliationInterval = TimeSpan.FromMinutes(5), + ReconciliationIntervalOverride = TimeSpan.FromMilliseconds(100), + ReconciliationBatchSize = batchSize, + }; + + /// In-memory enumerator returning a static list of sites. + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + /// + /// Scripted pull client — returns the next queued response for the site on + /// each call (an empty response once the script is exhausted) and records every + /// invocation so tests can assert call counts + the since cursor.
+ /// + private sealed class ScriptedPullClient : IPullSiteCallsClient + { + public List<(string SiteId, DateTime SinceUtc, int BatchSize)> Calls { get; } = new(); + private readonly Dictionary> _scripted = new(); + private readonly Dictionary _throwOnSite = new(); + + public ScriptedPullClient Script(string siteId, params PullSiteCallsResponse[] responses) + { + _scripted[siteId] = new Queue(responses); + return this; + } + + public ScriptedPullClient ThrowFor(string siteId, Exception ex) + { + _throwOnSite[siteId] = ex; + return this; + } + + public Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) + { + Calls.Add((siteId, sinceUtc, batchSize)); + if (_throwOnSite.TryGetValue(siteId, out var ex)) + { + throw ex; + } + if (_scripted.TryGetValue(siteId, out var queue) && queue.Count > 0) + { + return Task.FromResult(queue.Dequeue()); + } + return Task.FromResult( + new PullSiteCallsResponse(Array.Empty(), MoreAvailable: false)); + } + } + + /// + /// Recording repository that captures every call + /// (keyed by id, last-write-wins on the captured row). The reconciliation + /// tick only ever calls ; the read/KPI members are + /// inert stubs. + /// + private sealed class RecordingRepo : ISiteCallAuditRepository + { + public Dictionary Upserted { get; } = new(); + public int UpsertCallCount { get; private set; } + + public Task UpsertAsync(SiteCall siteCall, CancellationToken ct = default) + { + UpsertCallCount++; + Upserted[siteCall.TrackedOperationId] = siteCall; + return Task.CompletedTask; + } + + public Task GetAsync(TrackedOperationId id, CancellationToken ct = default) => + Task.FromResult(Upserted.TryGetValue(id, out var row) ? 
row : null); + + public Task> QueryAsync( + SiteCallQueryFilter filter, SiteCallPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + + public Task PurgeTerminalAsync(DateTime olderThanUtc, CancellationToken ct = default) => + Task.FromResult(0); + + public Task ComputeKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult(new SiteCallKpiSnapshot(0, 0, 0, 0, null, 0)); + + public Task> ComputePerSiteKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + private IActorRef CreateActor( + ISiteEnumerator sites, + IPullSiteCallsClient client, + ISiteCallAuditRepository repo, + SiteCallAuditOptions options) => + Sys.ActorOf(Props.Create(() => new SiteCallAuditActor( + repo, + sites, + client, + NullLogger.Instance, + options))); + + // --------------------------------------------------------------------- + // 1. AbsentRow_PulledFromSite_IsUpserted + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_AbsentRow_IsUpsertedFromSitePull() + { + var siteId = "siteA"; + var id = TrackedOperationId.New(); + var row = NewRow(id, sourceSite: siteId, status: "Parked"); + + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://siteA:8083")); + var client = new ScriptedPullClient().Script(siteId, + new PullSiteCallsResponse(new[] { row }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert( + () => + { + Assert.True(repo.Upserted.ContainsKey(id), + "reconciliation tick should upsert the row present at the site but absent centrally"); + Assert.Equal("Parked", repo.Upserted[id].Status); + Assert.Equal(siteId, repo.Upserted[id].SourceSite); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // 
--------------------------------------------------------------------- + // 2. Cursor_Advances_ToMaxUpdatedAtUtc_NoRePullOfOldRows + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_SecondTick_AdvancesCursorPastAlreadyPulledRows() + { + var siteId = "siteA"; + var t1 = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var t2 = new DateTime(2026, 5, 20, 10, 1, 0, DateTimeKind.Utc); + var t3 = new DateTime(2026, 5, 20, 10, 2, 0, DateTimeKind.Utc); + var r1 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t1); + var r2 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t2); + var r3 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t3); + + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://siteA:8083")); + // First pull returns three rows (max UpdatedAtUtc = t3); subsequent + // pulls return empty. The second pull's `since` must be t3, proving the + // cursor advanced and old rows are not re-pulled from the start. + var client = new ScriptedPullClient().Script(siteId, + new PullSiteCallsResponse(new[] { r1, r2, r3 }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert( + () => Assert.True(client.Calls.Count >= 2, + $"need at least 2 pulls to assert cursor advancement, got {client.Calls.Count}"), + duration: TimeSpan.FromSeconds(5), + interval: TimeSpan.FromMilliseconds(50)); + + Assert.Equal(DateTime.MinValue, client.Calls[0].SinceUtc); + Assert.Equal(t3, client.Calls[1].SinceUtc); + // The batch size flows through from options. + Assert.Equal(500, client.Calls[0].BatchSize); + } + + // --------------------------------------------------------------------- + // 3. 
OneSiteThrows_OtherSitesStillProcessed (failure isolation) + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_OneSiteThrows_OtherSitesStillReconciled() + { + var siteB = "siteB"; + var bId = TrackedOperationId.New(); + var bRow = NewRow(bId, sourceSite: siteB, status: "Delivered"); + + var sites = new StaticEnumerator( + new SiteEntry("siteA", "http://siteA:8083"), + new SiteEntry(siteB, "http://siteB:8083")); + var client = new ScriptedPullClient() + .ThrowFor("siteA", new InvalidOperationException("simulated transport failure")) + .Script(siteB, new PullSiteCallsResponse(new[] { bRow }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert( + () => + { + // siteA was attempted (and threw) yet siteB's row still landed — + // one offline site must not sink the rest of the tick. + Assert.Contains(client.Calls, c => c.SiteId == "siteA"); + Assert.True(repo.Upserted.ContainsKey(bId), + "siteB must be reconciled even though siteA threw"); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 4. RepoOnly test ctor does NOT start the reconciliation tick + // --------------------------------------------------------------------- + + [Fact] + public void TestCtor_RepositoryOnly_DoesNotStartReconciliationTick() + { + // The repo-only test ctor (used by the MSSQL-backed actor tests) injects + // no client/enumerator, so the tick must be gated OFF — otherwise those + // tests would fire phantom pulls. Build the actor via that ctor and + // confirm no upsert ever happens. We can't observe a non-event directly, + // so we run a second, fully wired actor that DOES run the tick to bound + // the wait, then assert the repo-only actor's repository (a separate + // instance) recorded no upserts.
+ var repo = new RecordingRepo(); + Sys.ActorOf(Props.Create(() => new SiteCallAuditActor( + repo, + NullLogger.Instance, + FastTickOptions()))); + + // Run a parallel actor with the full reconciliation ctor and a fast + // tick; once IT has pulled we know enough wall-clock elapsed that the + // repo-only actor would have ticked too, had it been wired. + var liveClient = new ScriptedPullClient(); + var liveRepo = new RecordingRepo(); + CreateActor( + new StaticEnumerator(new SiteEntry("siteX", "http://siteX:8083")), + liveClient, + liveRepo, + FastTickOptions()); + + AwaitAssert( + () => Assert.True(liveClient.Calls.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + // The repo-only actor never reconciles: it has no client to pull with, + // so it upserts nothing on its own. + Assert.Equal(0, repo.UpsertCallCount); + } +} From e675b3450092b642f119b6b4faa02f56f4b30344 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:03:49 -0400 Subject: [PATCH 08/14] feat(sitecallaudit): daily terminal-row purge scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a daily purge tick to SiteCallAuditActor that drops terminal SiteCalls rows older than the retention window via ISiteCallAuditRepository.PurgeTerminalAsync. The threshold is computed each tick as UtcNow - RetentionDays so an operator who lowers RetentionDays sees it on the next purge without a restart. Mirrors AuditLogPurgeActor's daily cadence + continue-on-error posture: a purge fault is logged and swallowed so the central singleton stays alive and retries next tick. The purge timer is started in PreStart alongside the reconciliation timer and gates on the same collaborators (pull client + enumerator) being available — the repo-only test ctor injects neither, so neither background timer runs there. 
Options: PurgeInterval (default 24h, clamped >= 1 min so a zero config value can't spin the scheduler) + RetentionDays (default 365), plus a test-only override that bypasses the clamp for millisecond cadences. Tests (all in-memory, no live MSSQL): purge tick calls PurgeTerminalAsync with a UtcNow - RetentionDays threshold (non-default 30 days); default retention yields a 365-day threshold; a throwing repo does not kill the singleton (a second tick still arrives). --- .../SiteCallAuditActor.cs | 109 +++++++++-- .../SiteCallAuditOptions.cs | 57 +++++- .../SiteCallAuditPurgeTests.cs | 175 ++++++++++++++++++ 3 files changed, 323 insertions(+), 18 deletions(-) create mode 100644 tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs index a4a31cf2..20f11316 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs @@ -26,16 +26,16 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit; /// /// Implemented: direct telemetry ingest, /// query, detail and KPI handlers (Task 4), the central→site Retry/Discard -/// relay (Task 5 — the relay handlers live in this actor), and the periodic +/// relay (Task 5 — the relay handlers live in this actor), the periodic /// per-site reconciliation puller that backfills lost telemetry (Piece A — -/// , the documented self-heal pull). The -/// reconciliation timer is started in and gates on the +/// , the documented self-heal pull), and +/// the daily terminal-row purge scheduler (Piece B — +/// , which invokes +/// on a timer). Both +/// background timers are started in and gate on the /// reconciliation collaborators ( + /// ) being available — the repo-only test ctor -/// injects neither, so the timer does not run there. 
Deferred (next commit): -/// the daily terminal-row purge scheduler (the repository exposes -/// PurgeTerminalAsync but nothing in this module invokes it on a timer -/// yet). +/// injects neither, so neither timer runs there. /// /// /// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" — @@ -81,7 +81,8 @@ public class SiteCallAuditActor : ReceiveActor /// singletons registered by AddAuditLogCentralReconciliationClient); /// in the test path they are injected directly. They are null when /// the actor was built via the repo-only test ctor — in that case the - /// reconciliation tick is NOT started (see ). + /// reconciliation tick is NOT started (see ); + /// the purge tick gates on the same collaborators (see ). /// private readonly IPullSiteCallsClient? _pullClient; private readonly ISiteEnumerator? _siteEnumerator; @@ -100,6 +101,7 @@ public class SiteCallAuditActor : ReceiveActor private readonly Dictionary _reconciliationCursors = new(); private ICancelable? _reconciliationTimer; + private ICancelable? _purgeTimer; /// /// Task 5 (#22): the central→site command transport — the @@ -149,8 +151,9 @@ public class SiteCallAuditActor : ReceiveActor /// concrete repository PLUS the two reconciliation collaborators directly, /// so the per-site self-heal pull is unit-testable in-memory without a DI /// container or a live gRPC channel. Because the client + enumerator are - /// present, the reconciliation tick IS started (it gates on the - /// collaborators being available — see ). + /// present, the reconciliation tick IS started; the purge tick is also + /// started (both gate on the collaborators being available — see + /// / ). /// /// Concrete repository instance used for upserts and purges. /// Enumerates the sites to reconcile each tick. @@ -246,11 +249,12 @@ public class SiteCallAuditActor : ReceiveActor Receive(HandleRetrySiteCall); Receive(HandleDiscardSiteCall); - // Piece A (#22): self-tick for the periodic reconciliation pull. 
The - // handler stays alive across faults via its own per-site try/catch - // (mirroring the ingest path); the timer is only started when the - // reconciliation collaborators are available. + // Piece A/B (#22): self-ticks for the periodic reconciliation pull and + // the daily terminal-row purge. Handlers stay alive across faults via + // their own per-site / per-tick try/catch (mirroring the ingest path); + // the timers are only started when their collaborators are available. ReceiveAsync(_ => OnReconciliationTickAsync()); + ReceiveAsync(_ => OnPurgeTickAsync()); } /// @@ -258,12 +262,14 @@ public class SiteCallAuditActor : ReceiveActor { base.PreStart(); StartReconciliationTimer(); + StartPurgeTimer(); } /// protected override void PostStop() { _reconciliationTimer?.Cancel(); + _purgeTimer?.Cancel(); base.PostStop(); } @@ -290,6 +296,29 @@ public class SiteCallAuditActor : ReceiveActor sender: Self); } + /// + /// Starts the daily purge tick — gated on the same collaborator presence as + /// the reconciliation tick. The purge itself only needs the repository, but + /// gating both schedulers together keeps the repo-only test ctor (no + /// client/enumerator) free of BOTH background timers, so the MSSQL read/ + /// upsert tests see no scheduled side effects. 
+ /// + private void StartPurgeTimer() + { + if (_pullClient is null || _siteEnumerator is null) + { + return; + } + + var interval = _options.ResolvedPurgeInterval; + _purgeTimer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: PurgeTick.Instance, + sender: Self); + } + /// protected override SupervisorStrategy SupervisorStrategy() { @@ -459,6 +488,51 @@ public class SiteCallAuditActor : ReceiveActor _reconciliationCursors[site.SiteId] = maxUpdated; } + // ── Piece B: daily terminal-row purge scheduler ── + + /// + /// One purge pass: drops terminal SiteCalls rows whose + /// is older than + /// UtcNow - RetentionDays via + /// . Non-terminal + /// rows are never purged (enforced in the repository). The threshold is + /// computed each tick so an operator who lowers RetentionDays sees it + /// applied on the next purge without an actor restart. Mirrors + /// AuditLogPurgeActor's daily cadence + continue-on-error posture: a + /// purge fault is logged and swallowed so the singleton stays alive. + /// + private async Task OnPurgeTickAsync() + { + var threshold = DateTime.UtcNow - TimeSpan.FromDays(_options.RetentionDays); + + var (scope, repository) = ResolveRepository(); + try + { + var rowsDeleted = await repository.PurgeTerminalAsync(threshold).ConfigureAwait(false); + if (rowsDeleted > 0) + { + _logger.LogInformation( + "SiteCallAudit purged {RowsDeleted} terminal SiteCalls rows older than {ThresholdUtc:o}.", + rowsDeleted, + threshold); + } + } + catch (Exception ex) + { + // Continue-on-error: a purge fault (transient SQL failure, + // contention) must NOT crash the central singleton. The next tick + // retries the same window. 
+ _logger.LogError( + ex, + "SiteCallAudit terminal-row purge failed (threshold {ThresholdUtc:o}); will retry next tick.", + threshold); + } + finally + { + scope?.Dispose(); + } + } + // ── Task 4: read-side (query / detail / KPI) ── /// @@ -947,6 +1021,13 @@ public class SiteCallAuditActor : ReceiveActor public static readonly ReconciliationTick Instance = new(); private ReconciliationTick() { } } + + /// Self-tick triggering a terminal-row purge pass (Piece B). + internal sealed class PurgeTick + { + public static readonly PurgeTick Instance = new(); + private PurgeTick() { } + } } /// diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs index 134e606e..317b29f9 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs @@ -2,10 +2,12 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit; /// /// Configuration options for the Site Call Audit (#22): stuck-call detection + -/// KPI windowing for the read-side, plus the cadence knobs for the periodic -/// per-site reconciliation pull (self-heal for lost telemetry). Mirrors the -/// KPI-relevant subset of NotificationOutboxOptions and the -/// scheduler-cadence shape of SiteAuditReconciliationOptions. +/// KPI windowing for the read-side, plus the cadence/retention knobs for the +/// two central-singleton schedulers — the periodic per-site reconciliation +/// pull (self-heal for lost telemetry) and the daily terminal-row purge. +/// Mirrors the KPI-relevant subset of NotificationOutboxOptions and the +/// scheduler-cadence shape of SiteAuditReconciliationOptions / +/// AuditLogPurgeOptions. /// public class SiteCallAuditOptions { @@ -92,4 +94,51 @@ public class SiteCallAuditOptions : ReconciliationInterval < MinReconciliationInterval ? 
MinReconciliationInterval : ReconciliationInterval; + + // ── Purge scheduler (#22): daily terminal-row purge ── + + /// + /// Period of the purge tick. Each tick drops terminal SiteCalls rows + /// older than the retention window via + /// . + /// Default 24 hours, matching AuditLogPurgeOptions. Clamped to at + /// least via . + /// + public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24); + + /// + /// Test-only override for the purge tick cadence — bypasses the + /// clamp so unit tests can drop the cadence + /// to milliseconds. Production config never sets this; leave null. + /// + public TimeSpan? PurgeIntervalOverride { get; set; } + + /// + /// Retention window for terminal rows. On each purge tick a row whose + /// TerminalAtUtc is older than UtcNow - RetentionDays is + /// deleted; non-terminal rows are never purged. Default 365 days, matching + /// the central audit-store retention policy. + /// + public int RetentionDays { get; set; } = 365; + + /// + /// Minimum interval the config-bound can resolve + /// to. Clamps a misconfigured 0 (or negative) value away from + /// for the same scheduler-spin reason as + /// ; the purge is daily so the floor + /// is a more generous 1 minute. + /// + private static readonly TimeSpan MinPurgeInterval = TimeSpan.FromMinutes(1); + + /// + /// Resolves the effective purge tick interval: the test override when set + /// (bypassing the clamp), otherwise clamped to at + /// least . + /// + public TimeSpan ResolvedPurgeInterval => + PurgeIntervalOverride is { } o + ? o + : PurgeInterval < MinPurgeInterval + ? 
MinPurgeInterval + : PurgeInterval; } diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs new file mode 100644 index 00000000..6352ddec --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs @@ -0,0 +1,175 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; +using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration; +using ZB.MOM.WW.ScadaBridge.Commons.Types; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit; + +namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests; + +/// <summary> +/// Purge-scheduler tests for <see cref="SiteCallAuditActor"/> (#22, Piece B). +/// Exercises the daily terminal-row purge tick in-memory — a recording +/// <see cref="ISiteCallAuditRepository"/> captures the +/// <see cref="ISiteCallAuditRepository.PurgeTerminalAsync"/> threshold the actor +/// computes, with no live MSSQL fixture. The reconciliation collaborators are +/// inert stubs (the purge tick doesn't use them, but they must be present to +/// arm the scheduler — both timers gate on the collaborators together). +/// </summary> +public class SiteCallAuditPurgeTests : TestKit +{ + private static SiteCallAuditOptions FastPurgeOptions(int retentionDays = 365) => new() + { + // Keep the reconciliation tick slow so it doesn't fight the purge tick + // for the test window; drop the purge tick to 100 ms via its override. + ReconciliationIntervalOverride = TimeSpan.FromMinutes(5), + PurgeIntervalOverride = TimeSpan.FromMilliseconds(100), + RetentionDays = retentionDays, + }; + + /// <summary>Empty enumerator — the purge path never touches it, but it must be present to arm the scheduler.</summary>
+ private sealed class EmptyEnumerator : ISiteEnumerator + { + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + /// No-op pull client — present only to arm the scheduler. + private sealed class NoOpPullClient : IPullSiteCallsClient + { + public Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) => + Task.FromResult(new PullSiteCallsResponse(Array.Empty(), MoreAvailable: false)); + } + + /// + /// Recording repository capturing every + /// threshold (and the configured deleted-row count it returns). + /// + private sealed class RecordingRepo : ISiteCallAuditRepository + { + public List PurgeThresholds { get; } = new(); + public int RowsDeletedPerCall { get; set; } + + public Task PurgeTerminalAsync(DateTime olderThanUtc, CancellationToken ct = default) + { + PurgeThresholds.Add(olderThanUtc); + return Task.FromResult(RowsDeletedPerCall); + } + + public Task UpsertAsync(SiteCall siteCall, CancellationToken ct = default) => Task.CompletedTask; + + public Task GetAsync(TrackedOperationId id, CancellationToken ct = default) => + Task.FromResult(null); + + public Task> QueryAsync( + SiteCallQueryFilter filter, SiteCallPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + + public Task ComputeKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult(new SiteCallKpiSnapshot(0, 0, 0, 0, null, 0)); + + public Task> ComputePerSiteKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + /// Repository whose purge always throws — to prove continue-on-error keeps the singleton alive. 
+ private sealed class PurgeThrowingRepo : ISiteCallAuditRepository + { + public int PurgeCallCount; + + public Task PurgeTerminalAsync(DateTime olderThanUtc, CancellationToken ct = default) + { + Interlocked.Increment(ref PurgeCallCount); + throw new InvalidOperationException("simulated purge failure"); + } + + public Task UpsertAsync(SiteCall siteCall, CancellationToken ct = default) => Task.CompletedTask; + public Task GetAsync(TrackedOperationId id, CancellationToken ct = default) => Task.FromResult(null); + public Task> QueryAsync(SiteCallQueryFilter f, SiteCallPaging p, CancellationToken ct = default) => Task.FromResult>(Array.Empty()); + public Task ComputeKpisAsync(DateTime a, DateTime b, CancellationToken ct = default) => Task.FromResult(new SiteCallKpiSnapshot(0, 0, 0, 0, null, 0)); + public Task> ComputePerSiteKpisAsync(DateTime a, DateTime b, CancellationToken ct = default) => Task.FromResult>(Array.Empty()); + } + + private IActorRef CreateActor(ISiteCallAuditRepository repo, SiteCallAuditOptions options) => + Sys.ActorOf(Props.Create(() => new SiteCallAuditActor( + repo, + new EmptyEnumerator(), + new NoOpPullClient(), + NullLogger.Instance, + options))); + + // --------------------------------------------------------------------- + // 1. PurgeTick_CallsPurgeTerminal_WithRetentionThreshold + // --------------------------------------------------------------------- + + [Fact] + public void PurgeTick_CallsPurgeTerminalAsync_WithRetentionThreshold() + { + var repo = new RecordingRepo { RowsDeletedPerCall = 7 }; + // Non-default retention (30 days) so the assertion isn't accidentally + // satisfied by the 365-day default. 
+ CreateActor(repo, FastPurgeOptions(retentionDays: 30)); + + AwaitAssert( + () => Assert.True(repo.PurgeThresholds.Count >= 1, + $"expected >= 1 PurgeTerminalAsync call, got {repo.PurgeThresholds.Count}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + // The threshold the actor passed must be ~UtcNow - 30 days. 1-minute + // slack covers scheduling jitter between the tick firing and the assert. + var threshold = repo.PurgeThresholds[0]; + var expected = DateTime.UtcNow - TimeSpan.FromDays(30); + Assert.True( + Math.Abs((threshold - expected).TotalMinutes) < 1.0, + $"purge threshold {threshold:o} should be within 1 minute of {expected:o}"); + } + + // --------------------------------------------------------------------- + // 2. PurgeTick_UsesDefaultRetention_365Days + // --------------------------------------------------------------------- + + [Fact] + public void PurgeTick_DefaultRetention_Uses365DayThreshold() + { + var repo = new RecordingRepo(); + CreateActor(repo, FastPurgeOptions()); // default 365 days + + AwaitAssert( + () => Assert.True(repo.PurgeThresholds.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + var threshold = repo.PurgeThresholds[0]; + var expected = DateTime.UtcNow - TimeSpan.FromDays(365); + Assert.True( + Math.Abs((threshold - expected).TotalMinutes) < 1.0, + $"purge threshold {threshold:o} should be within 1 minute of {expected:o}"); + } + + // --------------------------------------------------------------------- + // 3. PurgeTick_RepoThrows_ActorStaysAlive_RetriesNextTick (continue-on-error) + // --------------------------------------------------------------------- + + [Fact] + public void PurgeTick_PurgeThrows_ActorStaysAlive_RetriesNextTick() + { + var repo = new PurgeThrowingRepo(); + CreateActor(repo, FastPurgeOptions()); + + // The singleton must NOT die on a purge fault — a second tick must still + // arrive (continue-on-error). 
Two purge calls prove the actor survived + // the first throw and the timer kept ticking. + AwaitAssert( + () => Assert.True(repo.PurgeCallCount >= 2, + $"expected >= 2 purge attempts (actor survived the throw), got {repo.PurgeCallCount}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } +} From f49ac51771ed9a33554b4859efaf5b16ab59325f Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:10:54 -0400 Subject: [PATCH 09/14] fix(sitecallaudit): async DI scope in tick paths + options clamp tests + cursor/retry docs (review) --- .../SiteCallAuditActor.cs | 118 ++++++++++++++---- .../SiteCallAuditOptionsTests.cs | 103 +++++++++++++++ 2 files changed, 194 insertions(+), 27 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs index 20f11316..b89ae01a 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs @@ -420,31 +420,51 @@ public class SiteCallAuditActor : ReceiveActor return; } - var (scope, repository) = ResolveRepository(); - try + // AuditLog-003: open the scope INLINE with CreateAsyncScope + await using + // so the scoped EF Core repository (an IAsyncDisposable DbContext) disposes + // asynchronously at end of tick rather than blocking the Akka dispatcher + // thread on a synchronous Dispose() of pending connection cleanup — the tick + // holds the scope across many awaited UpsertAsync calls. Mirrors the sibling + // SiteAuditReconciliationActor.OnTickAsync. ResolveRepository() (sync Dispose) + // is retained for the synchronous message-handler paths. In the injected- + // repository test path there is no scope to open and the test repo is reused. 
+ if (_injectedRepository is not null) { - foreach (var site in sites) - { - try - { - await ReconcileSiteAsync(site, client, repository).ConfigureAwait(false); - } - catch (Exception ex) - { - // Failure-isolation invariant: one site's fault (transport, - // repository write) must NOT sink the rest of the tick. The - // failing site's cursor is left at its previous value so the - // next tick retries the same window. - _logger.LogWarning( - ex, - "SiteCallAudit reconciliation pull failed for site {SiteId}; other sites continue.", - site.SiteId); - } - } + await ReconcileSitesAsync(sites, client, _injectedRepository).ConfigureAwait(false); + return; } - finally + + await using var scope = _serviceProvider!.CreateAsyncScope(); + var repository = scope.ServiceProvider.GetRequiredService(); + await ReconcileSitesAsync(sites, client, repository).ConfigureAwait(false); + } + + /// + /// Reconciles every site in the tick against a single resolved repository, + /// isolating per-site faults so one bad site never sinks the rest of the + /// pass (the failing site's cursor is left at its previous value so the next + /// tick retries the same window). + /// + private async Task ReconcileSitesAsync( + IReadOnlyList sites, IPullSiteCallsClient client, ISiteCallAuditRepository repository) + { + foreach (var site in sites) { - scope?.Dispose(); + try + { + await ReconcileSiteAsync(site, client, repository).ConfigureAwait(false); + } + catch (Exception ex) + { + // Failure-isolation invariant: one site's fault (transport, + // repository write) must NOT sink the rest of the tick. The + // failing site's cursor is left at its previous value so the + // next tick retries the same window. 
+ _logger.LogWarning( + ex, + "SiteCallAudit reconciliation pull failed for site {SiteId}; other sites continue.", + site.SiteId); + } } } @@ -456,6 +476,31 @@ public class SiteCallAuditActor : ReceiveActor /// site id, so the actor upserts them verbatim (re-stamping /// IngestedAtUtc at central persist time, as the telemetry path does). /// + /// + /// + /// Coarse per-site retry — a deliberate divergence from + /// SiteAuditReconciliationActor. That sibling (AuditLog-004) tracks + /// a per-EventId attempt counter and permanently abandons a row after a + /// threshold so a single un-insertable row cannot block a site's cursor + /// forever. This actor deliberately does NOT: any throw inside the loop + /// propagates to 's per-site catch, + /// which leaves the site's cursor at its previous value, so the next tick + /// re-pulls the whole batch from since. A persistently-bad row therefore + /// holds the site's cursor and re-pulls the batch every tick. This is + /// acceptable here because is + /// monotonic and idempotent — re-pulling already-ingested rows is a cheap + /// no-op — and the SiteCalls table is an eventually-consistent mirror, + /// not the source of truth, so a slow site simply lags rather than corrupts. + /// + /// + /// Inclusive cursor boundary. The cursor is advanced to the maximum + /// seen, and the pull asks for rows at or + /// after it (since is >=, not >). The row whose + /// timestamp equals the cursor is therefore re-pulled on the next tick and + /// deduplicated by the idempotent monotonic upsert — the same inclusive-boundary + /// contract as SiteAuditReconciliationActor's cursor. 
+ /// + /// private async Task ReconcileSiteAsync( SiteEntry site, IPullSiteCallsClient client, ISiteCallAuditRepository repository) { @@ -505,7 +550,30 @@ public class SiteCallAuditActor : ReceiveActor { var threshold = DateTime.UtcNow - TimeSpan.FromDays(_options.RetentionDays); - var (scope, repository) = ResolveRepository(); + // AuditLog-003: open the scope INLINE with CreateAsyncScope + await using + // so the scoped EF Core repository (an IAsyncDisposable DbContext) disposes + // asynchronously rather than blocking the Akka dispatcher thread on a + // synchronous Dispose(). Mirrors SiteAuditReconciliationActor; the + // injected-repository test path reuses the test repo with no scope. + if (_injectedRepository is not null) + { + await PurgeWithRepositoryAsync(_injectedRepository, threshold).ConfigureAwait(false); + return; + } + + await using var scope = _serviceProvider!.CreateAsyncScope(); + var repository = scope.ServiceProvider.GetRequiredService(); + await PurgeWithRepositoryAsync(repository, threshold).ConfigureAwait(false); + } + + /// + /// Runs one terminal-row purge against the resolved repository, logging and + /// swallowing any fault (continue-on-error) so a transient SQL failure or + /// contention never crashes the central singleton — the next tick retries + /// the same window. 
+ /// + private async Task PurgeWithRepositoryAsync(ISiteCallAuditRepository repository, DateTime threshold) + { try { var rowsDeleted = await repository.PurgeTerminalAsync(threshold).ConfigureAwait(false); @@ -527,10 +595,6 @@ public class SiteCallAuditActor : ReceiveActor "SiteCallAudit terminal-row purge failed (threshold {ThresholdUtc:o}); will retry next tick.", threshold); } - finally - { - scope?.Dispose(); - } } // ── Task 4: read-side (query / detail / KPI) ── diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs index e9e4d950..703b366c 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs @@ -11,5 +11,108 @@ public class SiteCallAuditOptionsTests Assert.Equal(TimeSpan.FromMinutes(10), options.StuckAgeThreshold); // KPI interval mirrors NotificationOutboxOptions.DeliveredKpiWindow. Assert.Equal(TimeSpan.FromMinutes(1), options.KpiInterval); + + // Reconciliation tick cadence mirrors SiteAuditReconciliationOptions (#23). + Assert.Equal(TimeSpan.FromMinutes(5), options.ReconciliationInterval); + // Purge tick cadence mirrors AuditLogPurgeOptions. + Assert.Equal(TimeSpan.FromHours(24), options.PurgeInterval); + // Retention window mirrors the central audit-store retention policy. 
+ Assert.Equal(365, options.RetentionDays); + } + + [Fact] + public void ResolvedReconciliationInterval_DefaultsToConfiguredValue() + { + var options = new SiteCallAuditOptions(); + + Assert.Equal(options.ReconciliationInterval, options.ResolvedReconciliationInterval); + } + + [Theory] + [InlineData(0)] + [InlineData(-5)] + public void ResolvedReconciliationInterval_ClampsZeroOrNegativeToMinimum(int configuredSeconds) + { + // A misconfigured 0 / negative interval must never resolve to TimeSpan.Zero + // (which would make Akka's ScheduleTellRepeatedlyCancelable spin). The + // documented floor is >= 1 second. + var options = new SiteCallAuditOptions + { + ReconciliationInterval = TimeSpan.FromSeconds(configuredSeconds), + }; + + Assert.True( + options.ResolvedReconciliationInterval >= TimeSpan.FromSeconds(1), + $"expected the resolved interval to clamp to >= 1s, got {options.ResolvedReconciliationInterval}"); + Assert.Equal(TimeSpan.FromSeconds(1), options.ResolvedReconciliationInterval); + } + + [Fact] + public void ResolvedReconciliationInterval_OverrideBypassesClamp() + { + // The test-only override drops the cadence below the clamp floor so unit + // tests can run the tick at millisecond cadence. 
+ var sub1Second = TimeSpan.FromMilliseconds(50); + var options = new SiteCallAuditOptions + { + ReconciliationInterval = TimeSpan.FromMinutes(5), + ReconciliationIntervalOverride = sub1Second, + }; + + Assert.Equal(sub1Second, options.ResolvedReconciliationInterval); + } + + [Fact] + public void ResolvedPurgeInterval_DefaultsToConfiguredValue() + { + var options = new SiteCallAuditOptions(); + + Assert.Equal(options.PurgeInterval, options.ResolvedPurgeInterval); + } + + [Theory] + [InlineData(0)] + [InlineData(-30)] + public void ResolvedPurgeInterval_ClampsZeroOrNegativeToMinimum(int configuredSeconds) + { + // A misconfigured 0 / negative purge interval clamps to the documented + // >= 1 minute floor (the purge is daily, so a more generous floor than + // the reconciliation tick). + var options = new SiteCallAuditOptions + { + PurgeInterval = TimeSpan.FromSeconds(configuredSeconds), + }; + + Assert.True( + options.ResolvedPurgeInterval >= TimeSpan.FromMinutes(1), + $"expected the resolved interval to clamp to >= 1min, got {options.ResolvedPurgeInterval}"); + Assert.Equal(TimeSpan.FromMinutes(1), options.ResolvedPurgeInterval); + } + + [Fact] + public void ResolvedPurgeInterval_BelowMinuteFloorClampsToMinimum() + { + // A positive-but-sub-minute config value still clamps to the 1-minute floor. + var options = new SiteCallAuditOptions + { + PurgeInterval = TimeSpan.FromSeconds(5), + }; + + Assert.Equal(TimeSpan.FromMinutes(1), options.ResolvedPurgeInterval); + } + + [Fact] + public void ResolvedPurgeInterval_OverrideBypassesClamp() + { + // The test-only override drops the cadence below the clamp floor so unit + // tests can run the purge tick at millisecond cadence. 
+ var subMinute = TimeSpan.FromMilliseconds(50); + var options = new SiteCallAuditOptions + { + PurgeInterval = TimeSpan.FromHours(24), + PurgeIntervalOverride = subMinute, + }; + + Assert.Equal(subMinute, options.ResolvedPurgeInterval); } } From a00e43c4f9f74eee8a8bc4b3d3d863d26041ed06 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:23:04 -0400 Subject: [PATCH 10/14] feat(siteeventlog): emit alarm-category events on alarm transitions (M1.5) AlarmActor (computed) and NativeAlarmActor (native mirror) now fire-and-forget an 'alarm' site operational event on every state transition: - raise/activate: Error (priority/severity >= 700) or Warning - clear/return-to-normal, ack, inter-band transition: Info Both actors take a new optional IServiceProvider? ctor param (default null so existing direct-construction tests still compile); InstanceActor passes its _serviceProvider at the two Props.Create sites. Resolution is optional and the LogEventAsync call is fire-and-forget, so a logging failure never affects alarm evaluation. Rehydration replays are not re-logged. Adds a capturing FakeSiteEventLogger test helper + SingleServiceProvider. 
--- .../Actors/AlarmActor.cs | 62 +++++++++- .../Actors/InstanceActor.cs | 6 +- .../Actors/NativeAlarmActor.cs | 77 ++++++++++++- .../Actors/AlarmActorTests.cs | 107 ++++++++++++++++++ .../Actors/NativeAlarmActorTests.cs | 63 ++++++++++- .../TestSupport/FakeSiteEventLogger.cs | 62 ++++++++++ 6 files changed, 368 insertions(+), 9 deletions(-) create mode 100644 tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs index bafd20d9..e3020b23 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs @@ -1,10 +1,12 @@ using Akka.Actor; using Microsoft.CodeAnalysis.Scripting; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.HealthMonitoring; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; using System.Globalization; using System.Text.Json; @@ -37,6 +39,14 @@ public class AlarmActor : ReceiveActor private readonly SiteRuntimeOptions _options; private readonly ILogger _logger; private readonly ISiteHealthCollector? _healthCollector; + private readonly IServiceProvider? _serviceProvider; + + /// + /// M1.5: priority at or above which a computed-alarm raise is logged as + /// Error to the site event log; below it, raises log as Warning. + /// Mirrors the 0–1000 alarm-severity scale. + /// + private const int ErrorPriorityThreshold = 700; private AlarmState _currentState = AlarmState.Normal; /// @@ -83,6 +93,9 @@ public class AlarmActor : ReceiveActor /// Pre-compiled trigger expression, or null for non-expression triggers. 
/// Seed attribute snapshot so static attributes evaluate correctly at startup. /// Optional health collector for surfacing alarm execution metrics. + /// Optional DI service provider used to resolve the optional + /// for M1.5 alarm operational events. Fire-and-forget; + /// a logging failure never affects alarm evaluation. public AlarmActor( string alarmName, string instanceName, @@ -94,7 +107,8 @@ public class AlarmActor : ReceiveActor ILogger logger, Script? compiledTriggerExpression = null, IReadOnlyDictionary? initialAttributes = null, - ISiteHealthCollector? healthCollector = null) + ISiteHealthCollector? healthCollector = null, + IServiceProvider? serviceProvider = null) { _alarmName = alarmName; _instanceName = instanceName; @@ -103,6 +117,7 @@ public class AlarmActor : ReceiveActor _options = options; _logger = logger; _healthCollector = healthCollector; + _serviceProvider = serviceProvider; _priority = alarmConfig.PriorityLevel; _onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName; _onTriggerCompiledScript = onTriggerCompiledScript; @@ -208,6 +223,9 @@ public class AlarmActor : ReceiveActor _instanceName, _alarmName, AlarmState.Active, _priority, DateTimeOffset.UtcNow); _instanceActor.Tell(alarmChanged); + // M1.5: operational `alarm` event — raise. Severity by priority. + LogAlarmEvent(RaiseSeverity(_priority), $"Alarm {_alarmName} activated (priority {_priority})"); + // Spawn AlarmExecutionActor if on-trigger script defined if (_onTriggerCompiledScript != null) { @@ -225,6 +243,9 @@ public class AlarmActor : ReceiveActor var alarmChanged = new AlarmStateChanged( _instanceName, _alarmName, AlarmState.Normal, _priority, DateTimeOffset.UtcNow); _instanceActor.Tell(alarmChanged); + + // M1.5: operational `alarm` event — return to normal. 
+ LogAlarmEvent("Info", $"Alarm {_alarmName} cleared"); } } catch (Exception ex) @@ -265,6 +286,24 @@ public class AlarmActor : ReceiveActor }; _instanceActor.Tell(alarmChanged); + // M1.5: operational `alarm` event. Entering a band from Normal is a raise + // (severity by the band's priority); returning to None is a clear; a + // level-to-level escalation/de-escalation is an informational transition. + if (newLevel == AlarmLevel.None) + { + LogAlarmEvent("Info", $"Alarm {_alarmName} cleared ({previousLevel} → Normal)"); + } + else if (previousLevel == AlarmLevel.None) + { + LogAlarmEvent(RaiseSeverity(priority), + $"Alarm {_alarmName} activated at {newLevel} (priority {priority})"); + } + else + { + LogAlarmEvent("Info", + $"Alarm {_alarmName} transitioned {previousLevel} → {newLevel} (priority {priority})"); + } + if (previousLevel == AlarmLevel.None && newLevel != AlarmLevel.None && _onTriggerCompiledScript != null) @@ -273,6 +312,27 @@ public class AlarmActor : ReceiveActor } } + /// + /// M1.5: maps an alarm priority (0–1000) to a site-event severity for a + /// raise transition — Error at or above + /// , otherwise Warning. Clears and + /// inter-band transitions always log as Info. + /// + private static string RaiseSeverity(int priority) => + priority >= ErrorPriorityThreshold ? "Error" : "Warning"; + + /// + /// M1.5: fire-and-forget an alarm operational event to the optional + /// . Resolved optionally and never awaited so a + /// logging failure cannot affect alarm evaluation (matching the established + /// ScriptActor/ScriptExecutionActor pattern). + /// + private void LogAlarmEvent(string severity, string message) + { + _ = _serviceProvider?.GetService()?.LogEventAsync( + "alarm", severity, _instanceName, $"AlarmActor:{_alarmName}", message); + } + /// /// Returns the per-setpoint priority for the given level. 
Falls back to /// the alarm-level when the HiLo config did not diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs index 75909a8e..310eb9d8 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs @@ -763,7 +763,8 @@ public class InstanceActor : ReceiveActor _logger, triggerExpression, attributeSnapshot, - _healthCollector)); + _healthCollector, + _serviceProvider)); var actorRef = Context.ActorOf(props, $"alarm-{alarm.CanonicalName}"); _alarmActors[alarm.CanonicalName] = actorRef; @@ -793,7 +794,8 @@ public class InstanceActor : ReceiveActor _storage, _options, _logger, - nativeKind)); + nativeKind, + _serviceProvider)); var actorRef = Context.ActorOf(props, $"native-alarm-{nativeSource.CanonicalName}"); _nativeAlarmActors[nativeSource.CanonicalName] = actorRef; diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs index 28747cea..bca89d4d 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs @@ -1,11 +1,13 @@ using System.Text.Json; using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DataConnection; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming; using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; @@ -35,6 +37,14 @@ public class NativeAlarmActor : ReceiveActor private readonly SiteRuntimeOptions _options; private readonly ILogger _logger; private readonly 
AlarmKind _nativeKind; + private readonly IServiceProvider? _serviceProvider; + + /// + /// M1.5: severity at or above which a native-alarm raise is logged as + /// Error to the site event log; below it, raises log as Warning. + /// Mirrors the 0–1000 condition-severity scale. + /// + private const int ErrorSeverityThreshold = 700; /// Current mirrored conditions, keyed by source reference. private readonly Dictionary _alarms = new(); @@ -54,6 +64,9 @@ public class NativeAlarmActor : ReceiveActor /// Logger for diagnostics. /// Alarm kind to stamp on emitted events (OPC UA vs MxAccess); set by the /// Instance Actor from the connection protocol. Defaults to . + /// Optional DI service provider used to resolve the optional + /// for M1.5 alarm operational events. Fire-and-forget; + /// a logging failure never affects the mirror. public NativeAlarmActor( ResolvedNativeAlarmSource source, string instanceName, @@ -62,7 +75,8 @@ public class NativeAlarmActor : ReceiveActor SiteStorageService storage, SiteRuntimeOptions options, ILogger logger, - AlarmKind nativeKind = AlarmKind.NativeOpcUa) + AlarmKind nativeKind = AlarmKind.NativeOpcUa, + IServiceProvider? serviceProvider = null) { _source = source; _instanceName = instanceName; @@ -72,6 +86,7 @@ public class NativeAlarmActor : ReceiveActor _options = options; _logger = logger; _nativeKind = nativeKind; + _serviceProvider = serviceProvider; Receive(HandleRehydration); Receive(HandleTransition); @@ -150,7 +165,10 @@ public class NativeAlarmActor : ReceiveActor condition, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, null, row.LastTransitionAt, string.Empty, string.Empty); _alarms[row.SourceReference] = t; - Emit(t, t.Condition); + // M1.5: rehydration replays last-known state on (re)start — surface it + // upward for the DebugView but do NOT re-log it as a fresh operational + // event (it is not a live transition). 
+ Emit(t, t.Condition, logSiteEvent: false); } } @@ -277,8 +295,16 @@ public class NativeAlarmActor : ReceiveActor } } - /// Builds and tells the parent an enriched for a condition. - private void Emit(NativeAlarmTransition t, AlarmConditionState condition) + /// + /// Builds and tells the parent an enriched for a condition. + /// + /// The mirrored transition. + /// The condition state to surface (may differ from 's + /// own condition, e.g. a synthesised return-to-normal on snapshot swap). + /// M1.5: when true (live + snapshot transitions), emit an + /// alarm operational event. Suppressed for SQLite rehydration so a node restart does not + /// re-log every last-known condition. + private void Emit(NativeAlarmTransition t, AlarmConditionState condition, bool logSiteEvent = true) { var change = new AlarmStateChanged( _instanceName, @@ -301,6 +327,49 @@ public class NativeAlarmActor : ReceiveActor }; _instanceActor.Tell(change); + + if (logSiteEvent) + { + LogAlarmEvent(t, condition); + } + } + + /// + /// M1.5: fire-and-forget an alarm operational event mirroring a native + /// condition transition. An active condition is a raise (severity by the + /// condition's severity); an inactive condition is a return-to-normal; an + /// acknowledge transition is informational. Resolved optionally and never + /// awaited so a logging failure cannot affect the mirror (matching the + /// established ScriptActor/ScriptExecutionActor pattern). + /// + private void LogAlarmEvent(NativeAlarmTransition t, AlarmConditionState condition) + { + var logger = _serviceProvider?.GetService(); + if (logger == null) + { + return; + } + + string severity; + string message; + if (t.Kind == AlarmTransitionKind.Acknowledge) + { + severity = "Info"; + message = $"Native alarm {t.SourceReference} acknowledged"; + } + else if (condition.Active) + { + severity = condition.Severity >= ErrorSeverityThreshold ? 
"Error" : "Warning"; + message = $"Native alarm {t.SourceReference} active (severity {condition.Severity})"; + } + else + { + severity = "Info"; + message = $"Native alarm {t.SourceReference} returned to normal"; + } + + _ = logger.LogEventAsync( + "alarm", severity, _instanceName, $"NativeAlarmActor:{_source.CanonicalName}", message); } private void PersistUpsert(NativeAlarmTransition t) diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs index 150122a3..d7ee865d 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs @@ -7,6 +7,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -877,6 +878,112 @@ public class AlarmActorTests : TestKit, IDisposable Assert.Equal(AlarmLevel.HighHigh, escalated.Level); } + // ── M1.5: site event log `alarm` category ────────────────────────────── + + [Fact] + public void AlarmActor_Raise_EmitsAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var sp = new SingleServiceProvider(siteLog); + + var alarmConfig = new ResolvedAlarm + { + CanonicalName = "HighTemp", + TriggerType = "ValueMatch", + TriggerConfiguration = "{\"attributeName\":\"Status\",\"matchValue\":\"Critical\"}", + PriorityLevel = 800 + }; + + var instanceProbe = CreateTestProbe(); + var alarm = ActorOf(Props.Create(() => new AlarmActor( + "HighTemp", "Pump1", instanceProbe.Ref, alarmConfig, + null, _sharedLibrary, _options, + NullLogger.Instance, null, null, null, sp))); + + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Critical", "Good", 
DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + + // Background fire-and-forget; allow it to land. + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Single(rows); + var row = rows[0]; + Assert.Equal("Error", row.Severity); // priority 800 → Error + Assert.Equal("Pump1", row.InstanceId); + Assert.Equal("AlarmActor:HighTemp", row.Source); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void AlarmActor_RaiseLowPriority_EmitsWarningAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var sp = new SingleServiceProvider(siteLog); + + var alarmConfig = new ResolvedAlarm + { + CanonicalName = "MinorTemp", + TriggerType = "ValueMatch", + TriggerConfiguration = "{\"attributeName\":\"Status\",\"matchValue\":\"Warn\"}", + PriorityLevel = 100 + }; + + var instanceProbe = CreateTestProbe(); + var alarm = ActorOf(Props.Create(() => new AlarmActor( + "MinorTemp", "Pump1", instanceProbe.Ref, alarmConfig, + null, _sharedLibrary, _options, + NullLogger.Instance, null, null, null, sp))); + + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Warn", "Good", DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Single(rows); + Assert.Equal("Warning", rows[0].Severity); // priority 100 → Warning + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void AlarmActor_Clear_EmitsInfoAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var sp = new SingleServiceProvider(siteLog); + + var alarmConfig = new ResolvedAlarm + { + CanonicalName = "HighTemp", + TriggerType = "ValueMatch", + TriggerConfiguration = "{\"attributeName\":\"Status\",\"matchValue\":\"Critical\"}", + PriorityLevel = 800 + }; + + var instanceProbe = CreateTestProbe(); + var alarm = ActorOf(Props.Create(() => new AlarmActor( + "HighTemp", "Pump1", instanceProbe.Ref, alarmConfig, + null, _sharedLibrary, _options, + 
NullLogger.Instance, null, null, null, sp))); + + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Critical", "Good", DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Normal", "Critical", DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Equal(2, rows.Count); // raise + clear + Assert.Equal("Error", rows[0].Severity); + Assert.Equal("Info", rows[1].Severity); // clear → Info + }, TimeSpan.FromSeconds(2)); + } + [Fact] public void AlarmActor_MalformedTriggerConfig_DoesNotCrash() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs index ac9614b9..4cc5f594 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs @@ -9,6 +9,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -41,9 +42,10 @@ public class NativeAlarmActorTests : TestKit, IDisposable new(sourceRef, "T01", "AnalogLimit.Hi", kind, condition, "Process", "hi", "hi", "", "", null, time ?? DateTimeOffset.UtcNow, "92", "90"); - private IActorRef Spawn(IActorRef instanceActor, IActorRef dclManager) => + private IActorRef Spawn(IActorRef instanceActor, IActorRef dclManager, IServiceProvider? 
serviceProvider = null) => ActorOf(Props.Create(() => new NativeAlarmActor( - Source(), "inst", instanceActor, dclManager, _storage, _options, NullLogger.Instance))); + Source(), "inst", instanceActor, dclManager, _storage, _options, + NullLogger.Instance, AlarmKind.NativeOpcUa, serviceProvider))); [Fact] public void SubscribeOnStart_SendsRequestForSourceBinding() @@ -121,6 +123,63 @@ public class NativeAlarmActorTests : TestKit, IDisposable instance.ExpectNoMsg(TimeSpan.FromMilliseconds(300)); } + // ── M1.5: site event log `alarm` category ────────────────────────────── + + [Fact] + public void Raise_EmitsAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Single(rows); + var row = rows[0]; + Assert.Equal("Error", row.Severity); // severity 800 → Error + Assert.Equal("inst", row.InstanceId); + Assert.Equal("NativeAlarmActor:Pressure", row.Source); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void Clear_EmitsInfoAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + var t0 = DateTimeOffset.UtcNow; + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + + // Clear (inactive but not yet acked 
→ stays mirrored, return-to-normal emit). + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Clear, + new AlarmConditionState(false, false, null, AlarmShelveState.Unshelved, false, 0), t0.AddSeconds(5)))); + instance.ExpectMsg(m => m.State == AlarmState.Normal); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Equal(2, rows.Count); // raise + clear + Assert.Equal("Error", rows[0].Severity); + Assert.Equal("Info", rows[1].Severity); // return-to-normal → Info + }, TimeSpan.FromSeconds(2)); + } + void IDisposable.Dispose() { Shutdown(); diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs new file mode 100644 index 00000000..49239928 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs @@ -0,0 +1,62 @@ +using System.Collections.Concurrent; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; + +namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; + +/// +/// M1 Site Event Logging categories: a capturing fake +/// used by the actor tests to assert that the right operational events are emitted. +/// Thread-safe — the actors fire-and-forget LogEventAsync from background +/// tasks, so multiple captures can land concurrently. +/// +public sealed class FakeSiteEventLogger : ISiteEventLogger +{ + /// One captured invocation. + public sealed record Entry( + string EventType, + string Severity, + string? InstanceId, + string Source, + string Message, + string? Details); + + private readonly ConcurrentQueue _entries = new(); + + /// All captured events, in arrival order. + public IReadOnlyList Entries => _entries.ToArray(); + + /// Captured events filtered to a single category. 
+ public IReadOnlyList OfType(string eventType) => + _entries.Where(e => e.EventType == eventType).ToArray(); + + /// + public Task LogEventAsync( + string eventType, + string severity, + string? instanceId, + string source, + string message, + string? details = null) + { + _entries.Enqueue(new Entry(eventType, severity, instanceId, source, message, details)); + return Task.CompletedTask; + } + + /// + public long FailedWriteCount => 0; +} + +/// +/// Minimal that resolves a single +/// — enough for the actors' optional +/// _serviceProvider?.GetService<ISiteEventLogger>() resolution +/// without pulling a full DI container into the actor tests. +/// +public sealed class SingleServiceProvider(ISiteEventLogger logger) : IServiceProvider +{ + private readonly ISiteEventLogger _logger = logger; + + /// + public object? GetService(Type serviceType) => + serviceType == typeof(ISiteEventLogger) ? _logger : null; +} From 09b9e8f25977d0bc17f5afe132cde7c2c03a084c Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:26:54 -0400 Subject: [PATCH 11/14] feat(siteeventlog): emit deployment + instance_lifecycle events (M1.6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeploymentManagerActor now fire-and-forgets a 'deployment' site operational event on deploy/enable/disable/delete outcomes (Info on success, Error on failure), source 'DeploymentManagerActor'. The disable/delete events are emitted from the existing PipeTo continuations (safe: reads only the immutable _serviceProvider and fire-and-forgets). InstanceActor now emits an 'instance_lifecycle' Info event in PreStart (started) and a new PostStop (stopped) — covering start/stop/enable/disable/redeploy/ failover transitions from the instance's own vantage point. Both actors already hold _serviceProvider; no ctor change. Resolution is optional and LogEventAsync is fire-and-forget so a logging failure never affects the deployment pipeline or instance lifecycle. 
--- .../Actors/DeploymentManagerActor.cs | 52 +++++++++++++ .../Actors/InstanceActor.cs | 30 ++++++++ .../Actors/DeploymentManagerActorTests.cs | 72 ++++++++++++++++- .../Actors/InstanceActorTests.cs | 77 +++++++++++++++++++ 4 files changed, 229 insertions(+), 2 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs index 0e9dab79..1866e878 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs @@ -1,4 +1,5 @@ using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Artifacts; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView; @@ -10,6 +11,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management; using ZB.MOM.WW.ScadaBridge.Commons.Messages.ScriptExecution; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.HealthMonitoring; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; @@ -456,6 +458,10 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers { if (result.Success) { + // M1.6: operational `deployment` event — deploy succeeded. + LogDeploymentEvent("Info", result.InstanceName, + $"Instance {result.InstanceName} deployed (deploymentId={result.DeploymentId})"); + result.OriginalSender.Tell(new DeploymentStatusResponse( result.DeploymentId, result.InstanceName, @@ -469,6 +475,11 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers "Failed to persist deployment {DeploymentId} for {Instance}: {Error}", result.DeploymentId, result.InstanceName, result.Error); + // M1.6: operational `deployment` event — deploy failed. 
+ LogDeploymentEvent("Error", result.InstanceName, + $"Instance {result.InstanceName} deploy failed (deploymentId={result.DeploymentId})", + result.Error); + // Persistence failed — undo the optimistic actor creation and counter bump so // the site does not advertise an instance it cannot durably recover. if (_instanceActors.Remove(result.InstanceName, out var orphan)) @@ -504,7 +515,17 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _storage.SetInstanceEnabledAsync(instanceName, false).ContinueWith(t => { if (t.IsCompletedSuccessfully) + { _replicationActor?.Tell(new ReplicateConfigSetEnabled(instanceName, false)); + // M1.6: operational `deployment` event — disable succeeded. + LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} disabled"); + } + else + { + LogDeploymentEvent("Error", instanceName, + $"Instance {instanceName} disable failed", + t.Exception?.GetBaseException().Message); + } return new InstanceLifecycleResponse( command.CommandId, @@ -551,6 +572,9 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers if (result.Error != null || result.Config == null) { var error = result.Error ?? $"No deployed config found for {instanceName}"; + // M1.6: operational `deployment` event — enable failed. + LogDeploymentEvent("Error", instanceName, + $"Instance {instanceName} enable failed", error); result.OriginalSender.Tell(new InstanceLifecycleResponse( result.Command.CommandId, instanceName, false, error, DateTimeOffset.UtcNow)); return; @@ -562,6 +586,9 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers } UpdateInstanceCounts(); + // M1.6: operational `deployment` event — enable succeeded. 
+ LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} enabled"); + result.OriginalSender.Tell(new InstanceLifecycleResponse( result.Command.CommandId, instanceName, true, null, DateTimeOffset.UtcNow)); @@ -588,7 +615,17 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _storage.RemoveDeployedConfigAsync(instanceName).ContinueWith(t => { if (t.IsCompletedSuccessfully) + { _replicationActor?.Tell(new ReplicateConfigRemove(instanceName)); + // M1.6: operational `deployment` event — delete succeeded. + LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} deleted"); + } + else + { + LogDeploymentEvent("Error", instanceName, + $"Instance {instanceName} delete failed", + t.Exception?.GetBaseException().Message); + } return new InstanceLifecycleResponse( command.CommandId, @@ -601,6 +638,21 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _logger.LogInformation("Instance {Instance} deleted", instanceName); } + /// + /// M1.6: fire-and-forget a deployment operational event to the optional + /// on a deploy/enable/disable/delete outcome. + /// Resolved optionally and never awaited so a logging failure cannot affect the + /// deployment pipeline (matching the established ScriptActor/ScriptExecutionActor + /// pattern). Only reads the immutable _serviceProvider field, so it is + /// safe to call from the PipeTo continuations that report disable/delete + /// outcomes off the actor thread. + /// + private void LogDeploymentEvent(string severity, string instanceName, string message, string? details = null) + { + _ = _serviceProvider?.GetService()?.LogEventAsync( + "deployment", severity, instanceName, "DeploymentManagerActor", message, details); + } + /// /// DeploymentManager-006: answers a central query for the instance's /// currently-applied deployment identity. 
The site's deployed-config store diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs index 310eb9d8..e6f9a3fc 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs @@ -1,5 +1,6 @@ using Akka.Actor; using Microsoft.CodeAnalysis.Scripting; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DataConnection; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView; @@ -9,6 +10,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.HealthMonitoring; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming; @@ -164,6 +166,11 @@ public class InstanceActor : ReceiveActor base.PreStart(); _logger.LogInformation("InstanceActor started for {Instance}", _instanceUniqueName); + // M1.6: operational `instance_lifecycle` event — instance started. + // An instance starts on deploy, on enable (DeploymentManager re-creates + // the actor), and on failover/restart; this single point covers them all. + LogLifecycleEvent($"Instance {_instanceUniqueName} started"); + // Asynchronously load static overrides from SQLite and pipe to self var self = Self; _storage.GetStaticOverridesAsync(_instanceUniqueName).ContinueWith(t => @@ -180,6 +187,29 @@ public class InstanceActor : ReceiveActor SubscribeToDcl(); } + /// + protected override void PostStop() + { + // M1.6: operational `instance_lifecycle` event — instance stopped. An + // instance stops on disable, delete, redeployment, and graceful shutdown; + // this single point covers them all. 
+ LogLifecycleEvent($"Instance {_instanceUniqueName} stopped"); + base.PostStop(); + } + + /// + /// M1.6: fire-and-forget an instance_lifecycle operational event to the + /// optional . Resolved optionally and never + /// awaited so a logging failure cannot affect the instance lifecycle + /// (matching the established ScriptActor/ScriptExecutionActor pattern). + /// + private void LogLifecycleEvent(string message) + { + _ = _serviceProvider?.GetService()?.LogEventAsync( + "instance_lifecycle", "Info", _instanceUniqueName, + $"InstanceActor:{_instanceUniqueName}", message); + } + /// protected override SupervisorStrategy SupervisorStrategy() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs index 2b2191ca..d2a4613e 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs @@ -10,6 +10,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; using System.Text.Json; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -44,7 +45,8 @@ public class DeploymentManagerActorTests : TestKit, IDisposable try { File.Delete(_dbFile); } catch { /* cleanup */ } } - private IActorRef CreateDeploymentManager(SiteRuntimeOptions? options = null) + private IActorRef CreateDeploymentManager( + SiteRuntimeOptions? options = null, IServiceProvider? 
serviceProvider = null) { options ??= new SiteRuntimeOptions(); return ActorOf(Props.Create(() => new DeploymentManagerActor( @@ -53,7 +55,12 @@ public class DeploymentManagerActorTests : TestKit, IDisposable _sharedScriptLibrary, null, // no stream manager in tests options, - NullLogger.Instance))); + NullLogger.Instance, + null, + null, + null, + serviceProvider, + null))); } private static string MakeConfigJson(string instanceName) @@ -171,6 +178,67 @@ public class DeploymentManagerActorTests : TestKit, IDisposable Assert.Equal("NewPump", response.InstanceUniqueName); } + // ── M1.6: site event log `deployment` category ───────────────────────── + + [Fact] + public async Task DeploymentManager_DeploySuccess_EmitsDeploymentSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var actor = CreateDeploymentManager(serviceProvider: new SingleServiceProvider(siteLog)); + + await Task.Delay(500); // wait for empty startup + + actor.Tell(new DeployInstanceCommand( + "dep-evt-1", "AuditedPump", "sha256:xyz", + MakeConfigJson("AuditedPump"), "admin", DateTimeOffset.UtcNow)); + var response = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal(DeploymentStatus.Success, response.Status); + + AwaitAssert(() => + { + var rows = siteLog.OfType("deployment"); + Assert.Contains(rows, r => + r.Severity == "Info" && + r.InstanceId == "AuditedPump" && + r.Source == "DeploymentManagerActor" && + r.Message.Contains("deploy", StringComparison.OrdinalIgnoreCase)); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public async Task DeploymentManager_DisableEnableDelete_EmitDeploymentSiteEvents() + { + var siteLog = new FakeSiteEventLogger(); + var actor = CreateDeploymentManager(serviceProvider: new SingleServiceProvider(siteLog)); + + await Task.Delay(500); + + actor.Tell(new DeployInstanceCommand( + "dep-evt-2", "EvtPump", "sha256:abc", + MakeConfigJson("EvtPump"), "admin", DateTimeOffset.UtcNow)); + ExpectMsg(TimeSpan.FromSeconds(5)); + await Task.Delay(1000); + + actor.Tell(new 
DisableInstanceCommand("cmd-de1", "EvtPump", DateTimeOffset.UtcNow)); + Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); + await Task.Delay(300); + + actor.Tell(new EnableInstanceCommand("cmd-en1", "EvtPump", DateTimeOffset.UtcNow)); + Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); + await Task.Delay(300); + + actor.Tell(new DeleteInstanceCommand("cmd-del-evt", "EvtPump", DateTimeOffset.UtcNow)); + Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); + + AwaitAssert(() => + { + var rows = siteLog.OfType("deployment"); + Assert.Contains(rows, r => r.Message.Contains("disabled", StringComparison.OrdinalIgnoreCase)); + Assert.Contains(rows, r => r.Message.Contains("enabled", StringComparison.OrdinalIgnoreCase)); + Assert.Contains(rows, r => r.Message.Contains("deleted", StringComparison.OrdinalIgnoreCase)); + }, TimeSpan.FromSeconds(2)); + } + [Fact] public async Task DeploymentManager_Lifecycle_DisableEnableDelete() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs index b83ca94c..7ea11242 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs @@ -10,6 +10,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; using System.Text.Json; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -58,6 +59,82 @@ public class InstanceActorTests : TestKit, IDisposable try { File.Delete(_dbFile); } catch { /* cleanup */ } } + // ── M1.6: site event log `instance_lifecycle` category ────────────────── + + [Fact] + public void InstanceActor_Start_EmitsInstanceLifecycleSiteEvent() + { + var siteLog = new 
FakeSiteEventLogger(); + var config = new FlattenedConfiguration + { + InstanceUniqueName = "LifecyclePump", + Attributes = [new ResolvedAttribute { CanonicalName = "T", Value = "1", DataType = "Int32" }] + }; + + ActorOf(Props.Create(() => new InstanceActor( + "LifecyclePump", + JsonSerializer.Serialize(config), + _storage, + _compilationService, + _sharedScriptLibrary, + null, + _options, + NullLogger.Instance, + null, + null, + new SingleServiceProvider(siteLog)))); + + AwaitAssert(() => + { + var rows = siteLog.OfType("instance_lifecycle"); + Assert.Contains(rows, r => + r.Severity == "Info" && + r.InstanceId == "LifecyclePump" && + r.Source == "InstanceActor:LifecyclePump" && + r.Message.Contains("started", StringComparison.OrdinalIgnoreCase)); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void InstanceActor_Stop_EmitsInstanceLifecycleSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var config = new FlattenedConfiguration + { + InstanceUniqueName = "StoppedPump", + Attributes = [new ResolvedAttribute { CanonicalName = "T", Value = "1", DataType = "Int32" }] + }; + + var actor = ActorOf(Props.Create(() => new InstanceActor( + "StoppedPump", + JsonSerializer.Serialize(config), + _storage, + _compilationService, + _sharedScriptLibrary, + null, + _options, + NullLogger.Instance, + null, + null, + new SingleServiceProvider(siteLog)))); + + // Let PreStart land its started event, then stop the actor. 
+ AwaitAssert(() => Assert.NotEmpty(siteLog.OfType("instance_lifecycle")), + TimeSpan.FromSeconds(2)); + Watch(actor); + actor.Tell(PoisonPill.Instance); + ExpectTerminated(actor, TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("instance_lifecycle"); + Assert.Contains(rows, r => + r.Severity == "Info" && + r.InstanceId == "StoppedPump" && + r.Message.Contains("stopped", StringComparison.OrdinalIgnoreCase)); + }, TimeSpan.FromSeconds(2)); + } + [Fact] public void InstanceActor_LoadsAttributesFromConfig() { From d8b5dbb386d4aece1af8b84cf7e99295df6fdac8 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:31:04 -0400 Subject: [PATCH 12/14] feat(siteeventlog): emit store_and_forward + notification events (M1.7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StoreAndForwardService gains an optional ISiteEventLogger? ctor param (default null so the many direct-construction tests still compile) and, when wired, mirrors its own buffer/retry/park activity onto site operational events via the existing OnActivity hook (which already isolates a throwing subscriber, so a failing event log can never be misclassified as a transient delivery failure): - store_and_forward (ExternalSystem / CachedDbWrite): queued/retried/delivered/ parked. Warning on buffer/retry, Error on park, Info on retry-recovery; an immediate-success delivery is the hot path and is not logged. - notification (the site forward-to-central path): logged ONLY on forward FAILURE (buffered after the immediate forward threw) and on park, per the Component-SiteEventLogging spec — routine enqueue and forward-success are deliberately not logged (central's Notifications table is the audit record). Wired through AddStoreAndForward (resolves ISiteEventLogger optionally from DI); StoreAndForward project now references SiteEventLogging (acyclic: SiteEventLogging references only Commons). 
Also documents the 'notification' category on the ISiteEventLogger.LogEventAsync eventType param (folds in M1.8 doc fix). --- .../ISiteEventLogger.cs | 2 +- .../ServiceCollectionExtensions.cs | 9 +- .../StoreAndForwardService.cs | 104 ++++++++++- ....MOM.WW.ScadaBridge.StoreAndForward.csproj | 1 + .../StoreAndForwardSiteEventTests.cs | 168 ++++++++++++++++++ 5 files changed, 281 insertions(+), 3 deletions(-) create mode 100644 tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/StoreAndForwardSiteEventTests.cs diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs index 07fc9ea4..630822ef 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs @@ -11,7 +11,7 @@ public interface ISiteEventLogger /// completes once the event is durably persisted and faults if /// the write fails, so callers that await it observe success or failure. 
/// - /// Category: script, alarm, deployment, connection, store_and_forward, instance_lifecycle + /// Category: script, alarm, deployment, connection, store_and_forward, instance_lifecycle, notification /// Info, Warning, or Error /// Optional instance ID associated with the event /// Source identifier, e.g., "ScriptActor:MonitorSpeed" diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs index e0cf2a46..163a0cb8 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs @@ -2,6 +2,7 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; namespace ZB.MOM.WW.ScadaBridge.StoreAndForward; @@ -49,13 +50,19 @@ public static class ServiceCollectionExtensions // observable in the central audit log instead of producing a // silent empty-string SourceSite. var siteId = siteContext?.SiteId ?? string.Empty; + // M1.7: optional site operational-event log. Resolved through + // GetService so a host (or test) that has not called + // AddSiteEventLogging simply gets null and the S&F activity stays + // a no-op for site-event purposes. 
+ var siteEventLogger = sp.GetService(); return new StoreAndForwardService( storage, options, logger, replication, cachedCallObserver, - siteId); + siteId, + siteEventLogger); }); services.AddSingleton(sp => diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs index 6b082579..e335ea01 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs @@ -3,6 +3,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services; using ZB.MOM.WW.ScadaBridge.Commons.Observability; using ZB.MOM.WW.ScadaBridge.Commons.Types; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; namespace ZB.MOM.WW.ScadaBridge.StoreAndForward; @@ -44,6 +45,15 @@ public class StoreAndForwardService /// private readonly ICachedCallLifecycleObserver? _cachedCallObserver; /// + /// M1.7: optional site operational-event log. When non-null the service maps + /// its own buffer/retry/park activity (the same activity that drives + /// ) onto site events — store_and_forward for the + /// cached-call categories and notification for the site's + /// forward-to-central notification path. Best-effort and fire-and-forget so a + /// failing logger never affects delivery bookkeeping. + /// + private readonly ISiteEventLogger? _siteEventLogger; + /// /// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the /// cached-call attempt context so the audit bridge can build the /// half of the telemetry packet. @@ -173,13 +183,20 @@ public class StoreAndForwardService /// Optional replication service for standby synchronization. /// Optional observer for cached call lifecycle events. /// The site identifier this service belongs to. + /// + /// M1.7: optional site operational-event log. 
When non-null, buffer/retry/park + /// activity is mirrored to site events (store_and_forward / + /// notification by category). Optional with a null default so the + /// many direct-construction tests still compile unchanged. + /// public StoreAndForwardService( StoreAndForwardStorage storage, StoreAndForwardOptions options, ILogger logger, ReplicationService? replication = null, ICachedCallLifecycleObserver? cachedCallObserver = null, - string siteId = "") + string siteId = "", + ISiteEventLogger? siteEventLogger = null) { _storage = storage; _options = options; @@ -191,6 +208,91 @@ public class StoreAndForwardService // audit pipeline keying off SourceSite) never see an empty string and // a misconfigured host is recognisable in the central log. _siteId = string.IsNullOrWhiteSpace(siteId) ? UnknownSiteSentinel : siteId; + _siteEventLogger = siteEventLogger; + + // M1.7: ride the existing activity hook to emit site operational events. + // RaiseActivity already isolates a throwing subscriber, so a failing + // event log can never be misclassified as a transient delivery failure + // (StoreAndForward-009). Only subscribe when a logger is wired so the + // legacy (test/central) construction path stays a no-op. + if (_siteEventLogger != null) + { + OnActivity += EmitSiteEvent; + } + } + + /// + /// M1.7: maps one store-and-forward activity to a site operational event, + /// following the Site Event Logging spec's per-category scope + /// (Component-SiteEventLogging.md §"Events Logged"): + /// + /// Cached-call categories + /// ( / + /// ) log under + /// store_and_forward for queued / retried / parked / retry-delivered + /// activity. + /// The site's notification forward-to-central path + /// () logs under + /// notification ONLY on a forward FAILURE (buffered after the + /// immediate forward threw) or a park (long-buffered / retries exhausted). 
+ /// Routine enqueue and forward-success are deliberately NOT logged — central's + /// Notifications table is the audit record; the site only fills the + /// in-transit blind spot when central is unreachable. + /// + /// A successful immediate cached-call Delivered is the normal hot path and + /// is not logged. + /// + private void EmitSiteEvent(string action, StoreAndForwardCategory category, string detail) + { + var logger = _siteEventLogger; + if (logger == null) + { + return; + } + + // An immediate-delivery success is the normal hot path, not an + // operational event. A retry-loop success (detail "Delivered to … after + // N retries") IS logged for cached calls — it records a recovery. + if (action == "Delivered" && detail.StartsWith("Immediate", StringComparison.Ordinal)) + { + return; + } + + if (category == StoreAndForwardCategory.Notification) + { + // Spec: log only forward-failure (the immediate forward threw and the + // notification was buffered for retry — detail "Buffered for retry:") + // and park. A routine "No handler registered, buffered" enqueue and a + // forward-success "Delivered" are deliberately NOT logged. + var isForwardFailure = action == "Queued" + && detail.StartsWith("Buffered for retry", StringComparison.Ordinal); + if (!isForwardFailure && action != "Parked") + { + return; + } + + var notifSeverity = action == "Parked" ? "Error" : "Warning"; + _ = logger.LogEventAsync( + "notification", notifSeverity, instanceId: null, + source: "StoreAndForwardService", + message: $"Notification {action.ToLowerInvariant()}: {detail}"); + return; + } + + // Cached-call categories: queued / retried / parked / retry-delivered. + // Severity: parking is an Error (delivery abandoned for retry purposes); + // queue/retry/requeue are Warning; a retry-loop Delivered is Info. 
+ var severity = action switch + { + "Parked" => "Error", + "Delivered" => "Info", + _ => "Warning", + }; + + _ = logger.LogEventAsync( + "store_and_forward", severity, instanceId: null, + source: "StoreAndForwardService", + message: $"Operation {action.ToLowerInvariant()}: {detail}"); } /// diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj index c997e571..4f568eb4 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj @@ -17,6 +17,7 @@ + diff --git a/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/StoreAndForwardSiteEventTests.cs b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/StoreAndForwardSiteEventTests.cs new file mode 100644 index 00000000..2c3dcea7 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/StoreAndForwardSiteEventTests.cs @@ -0,0 +1,168 @@ +using System.Collections.Concurrent; +using Microsoft.Data.Sqlite; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; + +namespace ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests; + +/// +/// M1.7: the StoreAndForwardService emits site operational events for its own +/// buffer/park activity — store_and_forward for cached-call categories +/// (ExternalSystem / CachedDbWrite) and notification for the site's +/// notification forward-to-central path. Emission rides the existing +/// OnActivity hook and is best-effort (a failing logger never affects +/// delivery bookkeeping). 
+/// +public class StoreAndForwardSiteEventTests : IAsyncLifetime, IDisposable +{ + private sealed record Entry(string EventType, string Severity, string Source, string Message); + + private sealed class FakeSiteEventLogger : ISiteEventLogger + { + private readonly ConcurrentQueue _entries = new(); + public IReadOnlyList Entries => _entries.ToArray(); + public IReadOnlyList OfType(string t) => _entries.Where(e => e.EventType == t).ToArray(); + + public Task LogEventAsync(string eventType, string severity, string? instanceId, + string source, string message, string? details = null) + { + _entries.Enqueue(new Entry(eventType, severity, source, message)); + return Task.CompletedTask; + } + + public long FailedWriteCount => 0; + } + + private readonly SqliteConnection _keepAlive; + private readonly StoreAndForwardStorage _storage; + private readonly StoreAndForwardOptions _options; + private readonly FakeSiteEventLogger _siteLog = new(); + private readonly StoreAndForwardService _service; + + public StoreAndForwardSiteEventTests() + { + var dbName = $"SiteEvt_{Guid.NewGuid():N}"; + var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared"; + _keepAlive = new SqliteConnection(connStr); + _keepAlive.Open(); + + _storage = new StoreAndForwardStorage(connStr, NullLogger.Instance); + _options = new StoreAndForwardOptions + { + DefaultRetryInterval = TimeSpan.Zero, + DefaultMaxRetries = 1, + RetryTimerInterval = TimeSpan.FromMinutes(10) + }; + + _service = new StoreAndForwardService( + _storage, _options, NullLogger.Instance, + replication: null, cachedCallObserver: null, siteId: "site-a", + siteEventLogger: _siteLog); + } + + public async Task InitializeAsync() => await _storage.InitializeAsync(); + public Task DisposeAsync() => Task.CompletedTask; + public void Dispose() => _keepAlive.Dispose(); + + [Fact] + public async Task BufferForRetry_ExternalSystem_EmitsStoreAndForwardSiteEvent() + { + _service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, + _ 
+ => throw new HttpRequestException("transient")); + + await _service.EnqueueAsync(StoreAndForwardCategory.ExternalSystem, "api.example.com", """{}""", "Pump1"); + + var rows = _siteLog.OfType("store_and_forward"); + Assert.Contains(rows, r => r.Severity == "Warning" && + r.Source == "StoreAndForwardService" && + r.Message.Contains("queued", StringComparison.OrdinalIgnoreCase)); + // The cached-call categories must NOT surface as notification events. + Assert.Empty(_siteLog.OfType("notification")); + } + + [Fact] + public async Task ForwardFailure_Notification_EmitsNotificationSiteEvent() + { + // The site's notification role is forward-to-central. When the immediate + // forward to central throws (central unreachable), the notification is + // buffered for retry — a forward FAILURE, which the spec says to log as a + // `notification` site event (filling the in-transit blind spot). + _service.RegisterDeliveryHandler(StoreAndForwardCategory.Notification, + _ => throw new HttpRequestException("central unreachable")); + + await _service.EnqueueAsync(StoreAndForwardCategory.Notification, "list-a", """{}""", "Pump1"); + + var rows = _siteLog.OfType("notification"); + Assert.Contains(rows, r => r.Severity == "Warning" && + r.Source == "StoreAndForwardService" && + r.Message.Contains("queued", StringComparison.OrdinalIgnoreCase)); + // A notification forward-failure is not a store_and_forward (cached-call) event. + Assert.Empty(_siteLog.OfType("store_and_forward")); + } + + [Fact] + public async Task RoutineEnqueue_Notification_DoesNotEmitSiteEvent() + { + // Spec: routine enqueue / forward-success on the notification path are + // deliberately NOT logged — central's Notifications table is the audit + // record. A successful immediate forward emits no site event. 
+ _service.RegisterDeliveryHandler(StoreAndForwardCategory.Notification, + _ => Task.FromResult(true)); + + await _service.EnqueueAsync(StoreAndForwardCategory.Notification, "list-a", """{}""", "Pump1"); + + Assert.Empty(_siteLog.OfType("notification")); + } + + [Fact] + public async Task Park_Notification_EmitsErrorNotificationSiteEvent() + { + // A long-buffered notification that exhausts retries is parked — the spec + // logs this as a `notification` event (Error severity). + _service.RegisterDeliveryHandler(StoreAndForwardCategory.Notification, + _ => throw new HttpRequestException("central unreachable")); + + await _service.EnqueueAsync( + StoreAndForwardCategory.Notification, "list-a", """{}""", "Pump1", + attemptImmediateDelivery: false, maxRetries: 1); + + await _service.RetryPendingMessagesAsync(); + + var rows = _siteLog.OfType("notification"); + Assert.Contains(rows, r => r.Severity == "Error" && + r.Message.Contains("parked", StringComparison.OrdinalIgnoreCase)); + } + + [Fact] + public async Task Park_ExternalSystem_EmitsErrorStoreAndForwardSiteEvent() + { + // MaxRetries = 1 → the first sweep retry parks the message. + _service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, + _ => throw new HttpRequestException("transient")); + + await _service.EnqueueAsync( + StoreAndForwardCategory.ExternalSystem, "api.example.com", """{}""", "Pump1", + attemptImmediateDelivery: false, maxRetries: 1); + + await _service.RetryPendingMessagesAsync(); + + var rows = _siteLog.OfType("store_and_forward"); + Assert.Contains(rows, r => r.Severity == "Error" && + r.Message.Contains("parked", StringComparison.OrdinalIgnoreCase)); + } + + [Fact] + public async Task DeliveredImmediately_DoesNotEmitSiteEvent() + { + // A successful immediate delivery is the normal hot path — it is not a + // store-and-forward buffering event, so no operational event is logged. 
+ _service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, + _ => Task.FromResult(true)); + + await _service.EnqueueAsync(StoreAndForwardCategory.ExternalSystem, "api", """{}""", "Pump1"); + + Assert.Empty(_siteLog.OfType("store_and_forward")); + Assert.Empty(_siteLog.OfType("notification")); + } +} From e74c3aef23a18d1324b50b7a56232b74dd549a0d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:33:31 -0400 Subject: [PATCH 13/14] feat(siteeventlog): emit script started/completed Info events (M1.8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ScriptExecutionActor previously emitted only an Error 'script' event on failure. It now also fire-and-forgets an Info 'script' event when execution starts (right before RunAsync) and when it completes successfully — giving the operational log the full started/completed/failed lifecycle. Uses the already-resolved siteEventLogger; fire-and-forget so the event log can never block or fault the script's own run. Extends the SingleServiceProvider test helper to also serve IServiceScopeFactory (returning a self-scope) so ScriptExecutionActor's serviceProvider.CreateScope() reaches the logging hot path in tests instead of throwing into the catch. --- .../Actors/ScriptExecutionActor.cs | 12 ++++ .../Actors/ExecutionActorTests.cs | 66 +++++++++++++++++++ .../TestSupport/FakeSiteEventLogger.cs | 27 +++++++- 3 files changed, 102 insertions(+), 3 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs index 77d2e58f..22cc6034 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs @@ -217,6 +217,13 @@ public class ScriptExecutionActor : ReceiveActor Scope = scope }; + // M1.8: operational `script` event — execution started. 
Fire-and-forget + // (the `_ =` discards the task) so the event log can never block or + // fault the script's own run; mirrors the existing Error-path emit. + _ = siteEventLogger?.LogEventAsync( + "script", "Info", instanceName, $"ScriptActor:{scriptName}", + $"Script '{scriptName}' on instance '{instanceName}' started"); + var state = await compiledScript.RunAsync(globals, cts.Token); // Send result to requester if this was an Ask-based call @@ -225,6 +232,11 @@ public class ScriptExecutionActor : ReceiveActor replyTo.Tell(new ScriptCallResult(correlationId, true, state.ReturnValue, null)); } + // M1.8: operational `script` event — execution completed successfully. + _ = siteEventLogger?.LogEventAsync( + "script", "Info", instanceName, $"ScriptActor:{scriptName}", + $"Script '{scriptName}' on instance '{instanceName}' completed"); + // Notify parent of completion parent.Tell(new ScriptActor.ScriptExecutionCompleted(scriptName, true, null)); } diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs index 8d28de6d..22fe87c6 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs @@ -8,6 +8,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Scripts; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -71,6 +72,71 @@ public class ExecutionActorTests : TestKit, IDisposable ExpectTerminated(exec, TimeSpan.FromSeconds(5)); } + // ── M1.8: site event log `script` started/completed ──────────────────── + + [Fact] + public void ScriptExecutionActor_Success_EmitsScriptStartedAndCompletedInfoEvents() + { + var compiled = CompileScript("return 7 * 
6;"); + var replyTo = CreateTestProbe(); + var instanceActor = CreateTestProbe(); + var siteLog = new FakeSiteEventLogger(); + + var exec = ActorOf(Props.Create(() => new ScriptExecutionActor( + "Answer", "Inst1", compiled, null, 0, + instanceActor.Ref, _sharedLibrary, Options(), + replyTo.Ref, "corr-evt-1", NullLogger.Instance, + ScriptScope.Root, null, new SingleServiceProvider(siteLog)))); + + Watch(exec); + replyTo.ExpectMsg(TimeSpan.FromSeconds(10)); + ExpectTerminated(exec, TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("script"); + // started + completed, both Info, in order. + Assert.Equal(2, rows.Count); + Assert.All(rows, r => + { + Assert.Equal("Info", r.Severity); + Assert.Equal("Inst1", r.InstanceId); + Assert.Equal("ScriptActor:Answer", r.Source); + }); + Assert.Contains("started", rows[0].Message, StringComparison.OrdinalIgnoreCase); + Assert.Contains("completed", rows[1].Message, StringComparison.OrdinalIgnoreCase); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void ScriptExecutionActor_Failure_EmitsStartedInfoThenErrorEvent() + { + var compiled = CompileScript("throw new InvalidOperationException(\"boom\");"); + var replyTo = CreateTestProbe(); + var instanceActor = CreateTestProbe(); + var siteLog = new FakeSiteEventLogger(); + + var exec = ActorOf(Props.Create(() => new ScriptExecutionActor( + "Bad", "Inst1", compiled, null, 0, + instanceActor.Ref, _sharedLibrary, Options(), + replyTo.Ref, "corr-evt-2", NullLogger.Instance, + ScriptScope.Root, null, new SingleServiceProvider(siteLog)))); + + Watch(exec); + replyTo.ExpectMsg(TimeSpan.FromSeconds(10)); + ExpectTerminated(exec, TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("script"); + // started (Info) + failed (Error) — no completed. 
+ Assert.Equal(2, rows.Count); + Assert.Equal("Info", rows[0].Severity); + Assert.Contains("started", rows[0].Message, StringComparison.OrdinalIgnoreCase); + Assert.Equal("Error", rows[1].Severity); + }, TimeSpan.FromSeconds(2)); + } + [Fact] public void ScriptExecutionActor_ScriptThrows_RepliesFailureAndStops() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs index 49239928..7617fe14 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs @@ -1,4 +1,5 @@ using System.Collections.Concurrent; +using Microsoft.Extensions.DependencyInjection; using ZB.MOM.WW.ScadaBridge.SiteEventLogging; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; @@ -51,12 +52,32 @@ public sealed class FakeSiteEventLogger : ISiteEventLogger /// — enough for the actors' optional /// _serviceProvider?.GetService<ISiteEventLogger>() resolution /// without pulling a full DI container into the actor tests. +/// +/// Also serves (returning a scope that just +/// re-exposes this provider) so callers that do +/// serviceProvider.CreateScope() — e.g. ScriptExecutionActor — +/// don't throw before they reach the logging hot path. +/// /// -public sealed class SingleServiceProvider(ISiteEventLogger logger) : IServiceProvider +public sealed class SingleServiceProvider(ISiteEventLogger logger) + : IServiceProvider, IServiceScopeFactory, IServiceScope { private readonly ISiteEventLogger _logger = logger; /// - public object? GetService(Type serviceType) => - serviceType == typeof(ISiteEventLogger) ? _logger : null; + public object? 
GetService(Type serviceType) + { + if (serviceType == typeof(ISiteEventLogger)) return _logger; + if (serviceType == typeof(IServiceScopeFactory)) return this; + return null; + } + + /// + public IServiceScope CreateScope() => this; + + /// + public IServiceProvider ServiceProvider => this; + + /// + public void Dispose() { } } From e5534fddcadbe09bd0d36cbfa644497e189176fa Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 12:45:00 -0400 Subject: [PATCH 14/14] fix(siteeventlog): suppress snapshot-resync alarm re-emit + coverage + hardening (review) --- .../Actors/AlarmActor.cs | 21 +++- .../Actors/DeploymentManagerActor.cs | 15 ++- .../Actors/NativeAlarmActor.cs | 9 +- .../StoreAndForwardService.cs | 25 +++-- .../Actors/DeploymentManagerActorTests.cs | 23 +++-- .../Actors/NativeAlarmActorTests.cs | 96 +++++++++++++++++++ 6 files changed, 166 insertions(+), 23 deletions(-) diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs index e3020b23..ca9e2db6 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs @@ -41,6 +41,17 @@ public class AlarmActor : ReceiveActor private readonly ISiteHealthCollector? _healthCollector; private readonly IServiceProvider? _serviceProvider; + /// + /// M1.5: the optional site operational-event log, resolved once from + /// at construction and cached. The + /// registration is process-lifetime (a singleton), so resolving once on + /// the actor's own thread is both correct and cheaper than a per-event + /// GetService on the hot path. null when no provider was + /// supplied (the test/no-logging path) — then + /// no-ops. + /// + private readonly ISiteEventLogger? _siteEventLogger; + /// /// M1.5: priority at or above which a computed-alarm raise is logged as /// Error to the site event log; below it, raises log as Warning. 
@@ -118,6 +129,9 @@ public class AlarmActor : ReceiveActor _logger = logger; _healthCollector = healthCollector; _serviceProvider = serviceProvider; + // M1.5: resolve the optional site event logger once and cache it, + // rather than calling GetService on every alarm transition. + _siteEventLogger = serviceProvider?.GetService(); _priority = alarmConfig.PriorityLevel; _onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName; _onTriggerCompiledScript = onTriggerCompiledScript; @@ -323,13 +337,14 @@ public class AlarmActor : ReceiveActor /// /// M1.5: fire-and-forget an alarm operational event to the optional - /// . Resolved optionally and never awaited so a - /// logging failure cannot affect alarm evaluation (matching the established + /// (resolved once at construction and cached + /// in ). Never awaited so a logging failure + /// cannot affect alarm evaluation (matching the established /// ScriptActor/ScriptExecutionActor pattern). /// private void LogAlarmEvent(string severity, string message) { - _ = _serviceProvider?.GetService()?.LogEventAsync( + _ = _siteEventLogger?.LogEventAsync( "alarm", severity, _instanceName, $"AlarmActor:{_alarmName}", message); } diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs index 1866e878..10a6aae6 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs @@ -643,9 +643,18 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers /// on a deploy/enable/disable/delete outcome. /// Resolved optionally and never awaited so a logging failure cannot affect the /// deployment pipeline (matching the established ScriptActor/ScriptExecutionActor - /// pattern). 
Only reads the immutable _serviceProvider field, so it is - /// safe to call from the PipeTo continuations that report disable/delete - /// outcomes off the actor thread. + /// pattern). + /// + /// Thread-safety: the disable () and delete + /// () paths call this from a + /// + /// continuation that runs on a thread-pool thread, NOT on the actor thread — + /// so it must touch only immutable, thread-safe state. It does: the only + /// field it reads is the readonly _serviceProvider captured at + /// construction (the resolved is a process + /// singleton). No actor-private mutable state is referenced, which is what + /// makes calling it off the actor thread safe. + /// /// private void LogDeploymentEvent(string severity, string instanceName, string message, string? details = null) { diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs index bca89d4d..63a354ed 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs @@ -212,7 +212,14 @@ public class NativeAlarmActor : ReceiveActor { _alarms[sourceRef] = t; PersistUpsert(t); - Emit(t, t.Condition); + // M1.5: a snapshot replay is a re-sync of the source's current + // active set on (re)subscribe, NOT a live transition — surface it + // upward for the DebugView but do NOT re-log an `alarm` operational + // event. Otherwise every DCL reconnect would re-emit an `alarm` + // event for every already-active native condition (the + // synthesised return-to-normal above IS a real state change and + // keeps logSiteEvent: true). 
+ Emit(t, t.Condition, logSiteEvent: false); } _snapshotBuffer.Clear(); diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs index e335ea01..b1c3b9e2 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs @@ -82,6 +82,18 @@ public class StoreAndForwardService /// recognisable instead of an unattributable empty string. /// public const string UnknownSiteSentinel = "$unknown-site"; + + /// + /// M1.7: the detail-string prefix written by + /// when an immediate forward attempt throws and the message is buffered for + /// the retry sweep. matches on this same prefix + /// to distinguish a forward failure (logged) from a routine + /// no-handler enqueue (not logged), so both the construction site and the + /// check reference this single constant rather than duplicating the + /// literal — keeping the two ends from drifting apart. + /// + private const string BufferedForRetryDetailPrefix = "Buffered for retry"; + private Timer? _retryTimer; private int _retryInProgress; @@ -261,11 +273,12 @@ public class StoreAndForwardService if (category == StoreAndForwardCategory.Notification) { // Spec: log only forward-failure (the immediate forward threw and the - // notification was buffered for retry — detail "Buffered for retry:") - // and park. A routine "No handler registered, buffered" enqueue and a - // forward-success "Delivered" are deliberately NOT logged. + // notification was buffered for retry — detail prefixed + // BufferedForRetryDetailPrefix) and park. A routine "No handler + // registered, buffered" enqueue and a forward-success "Delivered" + // are deliberately NOT logged. 
var isForwardFailure = action == "Queued" - && detail.StartsWith("Buffered for retry", StringComparison.Ordinal); + && detail.StartsWith(BufferedForRetryDetailPrefix, StringComparison.Ordinal); if (!isForwardFailure && action != "Parked") { return; @@ -536,7 +549,7 @@ public class StoreAndForwardService message.LastError = ex.Message; await BufferAsync(message); - RaiseActivity("Queued", category, $"Buffered for retry: {target} ({ex.Message})"); + RaiseActivity("Queued", category, $"{BufferedForRetryDetailPrefix}: {target} ({ex.Message})"); return new StoreAndForwardResult(true, message.Id, true); } } @@ -553,7 +566,7 @@ public class StoreAndForwardService await BufferAsync(message); RaiseActivity("Queued", category, attemptImmediateDelivery ? $"No handler registered, buffered: {target}" - : $"Buffered for retry: {target}"); + : $"{BufferedForRetryDetailPrefix}: {target}"); return new StoreAndForwardResult(true, message.Id, true); } diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs index d2a4613e..6b57e936 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs @@ -219,24 +219,27 @@ public class DeploymentManagerActorTests : TestKit, IDisposable ExpectMsg(TimeSpan.FromSeconds(5)); await Task.Delay(1000); + // The deployment site events are emitted fire-and-forget off the actor + // thread (LogDeploymentEvent runs in a ContinueWith continuation), so + // poll for each event with AwaitAssert rather than a bare Task.Delay — + // a fixed sleep is racy under CI load. 
actor.Tell(new DisableInstanceCommand("cmd-de1", "EvtPump", DateTimeOffset.UtcNow)); Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); - await Task.Delay(300); + AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"), + r => r.Message.Contains("disabled", StringComparison.OrdinalIgnoreCase)), + TimeSpan.FromSeconds(2)); actor.Tell(new EnableInstanceCommand("cmd-en1", "EvtPump", DateTimeOffset.UtcNow)); Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); - await Task.Delay(300); + AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"), + r => r.Message.Contains("enabled", StringComparison.OrdinalIgnoreCase)), + TimeSpan.FromSeconds(2)); actor.Tell(new DeleteInstanceCommand("cmd-del-evt", "EvtPump", DateTimeOffset.UtcNow)); Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); - - AwaitAssert(() => - { - var rows = siteLog.OfType("deployment"); - Assert.Contains(rows, r => r.Message.Contains("disabled", StringComparison.OrdinalIgnoreCase)); - Assert.Contains(rows, r => r.Message.Contains("enabled", StringComparison.OrdinalIgnoreCase)); - Assert.Contains(rows, r => r.Message.Contains("deleted", StringComparison.OrdinalIgnoreCase)); - }, TimeSpan.FromSeconds(2)); + AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"), + r => r.Message.Contains("deleted", StringComparison.OrdinalIgnoreCase)), + TimeSpan.FromSeconds(2)); } [Fact] diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs index 4cc5f594..abcd0766 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs @@ -1,3 +1,4 @@ +using System.Text.Json; using Akka.Actor; using Akka.TestKit.Xunit2; using Microsoft.Extensions.Logging.Abstractions; @@ -180,6 +181,101 @@ public class NativeAlarmActorTests : TestKit, IDisposable }, 
TimeSpan.FromSeconds(2)); } + [Fact] + public async Task Rehydration_DoesNotEmitSiteEvent() + { + // Pre-populate SQLite with an active condition so the actor rehydrates + // it on PreStart. Rehydration replays last-known state — it is NOT a + // live transition, so it must surface upward (for the DebugView) but + // must NOT re-log an `alarm` operational event. + var condition = new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800); + await _storage.UpsertNativeAlarmAsync( + "inst", "Pressure", "T01.Hi", + JsonSerializer.Serialize(condition), DateTimeOffset.UtcNow); + + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + + // The rehydrated condition is surfaced upward... + var emitted = instance.ExpectMsg(TimeSpan.FromSeconds(2)); + Assert.Equal("T01.Hi", emitted.SourceReference); + Assert.Equal(AlarmState.Active, emitted.State); + dcl.ExpectMsg(); + + // ...but no `alarm` operational event is logged for it. + AwaitAssert( + () => Assert.Empty(siteLog.OfType("alarm")), + TimeSpan.FromSeconds(1)); + } + + [Fact] + public void SnapshotSwap_ExistingActiveCondition_DoesNotReEmit() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + // Live raise — the one and only `alarm` event we expect. + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + AwaitAssert(() => Assert.Single(siteLog.OfType("alarm")), TimeSpan.FromSeconds(2)); + + // A reconnect snapshot that RE-INCLUDES the same still-active condition is + // a re-sync, not a live transition. 
It must NOT re-log a second `alarm` + // event (regression for the spurious-reconnect-event bug). + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Snapshot, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.SnapshotComplete, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + + // The snapshot still surfaces the condition upward (DebugView re-sync)... + instance.ExpectMsg(m => m.SourceReference == "T01.Hi" && m.State == AlarmState.Active); + + // ...but the `alarm` event count stays at exactly 1 — no re-emit. + Thread.Sleep(200); // give any spurious fire-and-forget log time to land + Assert.Single(siteLog.OfType("alarm")); + } + + [Fact] + public void Acknowledge_EmitsInfoAlarmSiteEventMentioningAcknowledged() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + var t0 = DateTimeOffset.UtcNow; + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + + // Operator acknowledges the still-active condition. The Acknowledge + // branch of LogAlarmEvent logs Info and mentions "acknowledged". 
+ actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Acknowledge, + new AlarmConditionState(true, true, null, AlarmShelveState.Unshelved, false, 800), t0.AddSeconds(5)))); + instance.ExpectMsg(); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Equal(2, rows.Count); // raise + acknowledge + var ack = rows[1]; + Assert.Equal("Info", ack.Severity); + Assert.Contains("acknowledged", ack.Message, StringComparison.OrdinalIgnoreCase); + }, TimeSpan.FromSeconds(2)); + } + void IDisposable.Dispose() { Shutdown();