diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs index 43d5bc74..0ba1bc00 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/AuditLogPurgeOptions.cs @@ -17,8 +17,10 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// /// /// exists for tests to drop the cadence to -/// milliseconds without polluting the production config surface; production -/// binds only. +/// milliseconds; production config is expected to set +/// only. Because this options class is Bind-ed wholesale, a config value +/// at AuditLog:Purge:IntervalOverride would bind if present (and would +/// bypass the minimum clamp) — operators must not set it. /// /// public sealed class AuditLogPurgeOptions @@ -29,15 +31,44 @@ public sealed class AuditLogPurgeOptions /// /// Test-only override for finer control over the tick cadence than /// whole-hour resolution allows. When non-null, takes precedence over - /// . Not bound from config — production - /// config exposes only. + /// AND bypasses the + /// minimum clamp (so tests can use millisecond cadences). Production + /// config exposes only and never sets this + /// knob — but because the options class is Bind-ed wholesale, a + /// config value at AuditLog:Purge:IntervalOverride WOULD bind if + /// present; operators must not set it. /// public TimeSpan? IntervalOverride { get; set; } /// - /// Resolves the effective tick interval, honouring the test override - /// when set. Falls back to . + /// Minimum interval the config-bound can + /// resolve to. Clamps a misconfigured IntervalHours: 0 (or a + /// negative value) away from — a zero + /// interval would make Akka's ScheduleTellRepeatedlyCancelable + /// spin, looping the partition drop/rebuild dance into a sustained SQL + /// outage. 
The test-only bypasses this + /// clamp so unit tests can still drop the cadence to milliseconds. /// - public TimeSpan Interval => - IntervalOverride ?? TimeSpan.FromHours(IntervalHours); + private static readonly TimeSpan MinConfiguredInterval = TimeSpan.FromMinutes(1); + + /// + /// Resolves the effective tick interval, honouring the test override + /// when set. Falls back to , clamped to at + /// least so a zero/negative config + /// value can never yield (which would spin + /// the scheduler). + /// + public TimeSpan Interval + { + get + { + if (IntervalOverride is { } overrideValue) + { + return overrideValue; + } + + var resolved = TimeSpan.FromHours(IntervalHours); + return resolved < MinConfiguredInterval ? MinConfiguredInterval : resolved; + } + } } diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs new file mode 100644 index 00000000..bad3e88f --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullAuditEventsClient.cs @@ -0,0 +1,289 @@ +using System.Collections.Concurrent; +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Grpc.Net.Client; +using Microsoft.Extensions.Logging; +using ZB.MOM.WW.ScadaBridge.Communication; +using ZB.MOM.WW.ScadaBridge.Communication.Grpc; +using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest; +using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse; +using PullAuditEventsResponse = ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration.PullAuditEventsResponse; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; + +/// +/// Production (Audit Log #23, M6) that the +/// central uses to pull the next +/// reconciliation batch from a site over the PullAuditEvents unary gRPC +/// RPC served by SiteStreamGrpcServer. +/// +/// +/// +/// Endpoint resolution. 
The actor passes only a siteId; this +/// client resolves it to a gRPC authority via +/// () on every call so a NodeA→NodeB +/// failover flip or an edited site address takes effect on the next tick — the +/// same liveness guarantee SiteStreamGrpcClientFactory gives the +/// real-time stream. A site with no registered endpoint yields an empty +/// response (no dial); reconciliation simply has nothing to pull from it. +/// +/// +/// Fault tolerance. Per the +/// contract, tolerable transport faults (connection refused / site offline = +/// , slow site = , +/// shutdown = , plus bare +/// / SocketException before a gRPC +/// status is established) are caught and collapsed to an empty response — one +/// offline site must never sink the rest of the reconciliation tick. Any other +/// fault thrown by the RPC call itself (e.g. a non-RpcException transport +/// wrapper) is also swallowed to empty: audit reconciliation is best-effort. +/// DTO mapping runs after the guarded call, so a malformed reply that fails +/// mapping is not swallowed here — it propagates to the caller (the actor's +/// own per-site guard). +/// +/// +/// Testability. The unary call is reached through the +/// seam. Production binds +/// (one cached +/// per endpoint, keepalive from ); unit tests +/// inject a fake invoker so no real HTTP/2 endpoint is required. +/// +/// +public sealed class GrpcPullAuditEventsClient : IPullAuditEventsClient +{ + private readonly ISiteEnumerator _sites; + private readonly IPullAuditEventsInvoker _invoker; + private readonly ILogger _logger; + + /// + /// Creates the client over the given site enumerator and unary-call invoker. + /// + /// Resolves a siteId to its gRPC endpoint. + /// Seam that issues the PullAuditEvents unary RPC against a resolved endpoint. + /// Logger for transport-fault diagnostics. + public GrpcPullAuditEventsClient( + ISiteEnumerator sites, + IPullAuditEventsInvoker invoker, + ILogger logger) + { + _sites = sites ?? throw new ArgumentNullException(nameof(sites)); + _invoker = invoker ?? throw new ArgumentNullException(nameof(invoker)); + _logger = logger ?? 
throw new ArgumentNullException(nameof(logger)); + } + + /// + public async Task PullAsync( + string siteId, + DateTime sinceUtc, + int batchSize, + CancellationToken ct) + { + var endpoint = await ResolveEndpointAsync(siteId, ct).ConfigureAwait(false); + if (endpoint is null) + { + // No gRPC address registered for the site — absence of an address is + // a configuration decision (mirrors ISiteEnumerator's own contract), + // not a runtime error, so there is simply nothing to pull. + _logger.LogDebug( + "PullAuditEvents skipped: no gRPC endpoint registered for site {SiteId}.", siteId); + return Empty; + } + + var request = new ProtoPullRequest + { + // ReadPendingSinceAsync treats DateTime.MinValue as "from the start"; + // EnsureUtc keeps Timestamp.FromDateTime happy (it requires UTC kind). + SinceUtc = Timestamp.FromDateTime(EnsureUtc(sinceUtc)), + BatchSize = batchSize, + }; + + ProtoPullResponse reply; + try + { + reply = await _invoker.InvokeAsync(endpoint, request, ct).ConfigureAwait(false); + } + catch (RpcException ex) when (IsTolerable(ex.StatusCode)) + { + _logger.LogDebug(ex, + "PullAuditEvents tolerable transport fault for site {SiteId} ({Endpoint}): {Status}. Returning empty batch.", + siteId, endpoint, ex.StatusCode); + return Empty; + } + catch (Exception ex) when (ex is HttpRequestException or System.Net.Sockets.SocketException) + { + _logger.LogDebug(ex, + "PullAuditEvents connection-layer fault for site {SiteId} ({Endpoint}). Returning empty batch.", + siteId, endpoint); + return Empty; + } + catch (OperationCanceledException) + { + // Reconciliation tick was cancelled — either the caller's token + // (host shutdown / scope dispose) or an internal gRPC deadline / + // linked-CTS cancellation. Both are tolerable for a best-effort + // pull; collapse to empty rather than letting an internal + // cancellation land noisily in the catch-all below. + return Empty; + } + catch (Exception ex) + { + // Any other fault (e.g. 
a malformed reply that fails DTO mapping + // below would actually surface here only if mapping moved inline, + // but a non-RpcException transport fault wrapper lands here too). + // Audit reconciliation is best-effort; swallow to empty rather than + // throw — the actor's per-site guard would only re-catch it. + _logger.LogWarning(ex, + "PullAuditEvents unexpected fault for site {SiteId} ({Endpoint}). Returning empty batch.", + siteId, endpoint); + return Empty; + } + + // Map proto DTOs to canonical AuditEvent records and order oldest-first + // (the wire is already ordered by the site queue, but the + // IPullAuditEventsClient contract is explicit, so sort defensively). + var events = reply.Events + .Select(AuditEventDtoMapper.FromDto) + .OrderBy(e => e.OccurredAtUtc) + .ToList(); + + return new PullAuditEventsResponse(events, reply.MoreAvailable); + } + + private async Task ResolveEndpointAsync(string siteId, CancellationToken ct) + { + var sites = await _sites.EnumerateAsync(ct).ConfigureAwait(false); + foreach (var site in sites) + { + if (string.Equals(site.SiteId, siteId, StringComparison.Ordinal) && + !string.IsNullOrWhiteSpace(site.GrpcEndpoint)) + { + return site.GrpcEndpoint; + } + } + return null; + } + + private static readonly PullAuditEventsResponse Empty = + new(Array.Empty(), MoreAvailable: false); + + private static bool IsTolerable(StatusCode code) => code is + StatusCode.Unavailable or + StatusCode.DeadlineExceeded or + StatusCode.Cancelled; + + // All ScadaBridge timestamps are UTC by invariant. A non-UTC cursor (the + // reconciliation cursor starts at DateTime.MinValue, Kind=Unspecified) is + // therefore treated AS UTC — never ToUniversalTime()-converted: on a host + // with a positive UTC offset MinValue.ToUniversalTime() underflows and + // Timestamp.FromDateTime throws, crashing the first pull for every site. + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc ? 
value : DateTime.SpecifyKind(value, DateTimeKind.Utc); + + /// + /// Seam over the PullAuditEvents unary gRPC call against a resolved + /// site endpoint. Extracted so can + /// be unit-tested without a real . Production binds + /// . + /// + public interface IPullAuditEventsInvoker + { + /// + /// Issues the PullAuditEvents unary RPC against . + /// May throw / + /// on transport faults — the caller classifies and swallows tolerable ones. + /// + /// The site gRPC authority (e.g. http://site-a:8083). + /// The wire-format pull request. + /// Cancellation token. + /// The wire-format pull response. + Task InvokeAsync(string endpoint, ProtoPullRequest request, CancellationToken ct); + } +} + +/// +/// Production : +/// caches one per endpoint (keepalive from +/// , mirroring SiteStreamGrpcClient) +/// and issues the unary PullAuditEventsAsync call. The cache is keyed by +/// endpoint string, so a changed site address (NodeA→NodeB failover flip / an +/// edited gRPC address) is reached as soon as the resolver hands the new +/// endpoint to — it creates a fresh channel for the +/// new address. Unlike SiteStreamGrpcClientFactory (keyed by siteId, +/// which actively evicts a re-keyed client), the channel for the previous +/// address is NOT actively evicted here; it lingers idle until +/// . Idle channels hold no streams, so this is a minor +/// cache footprint cost, not a correctness or liveness gap. +/// +public sealed class GrpcPullAuditEventsInvoker + : GrpcPullAuditEventsClient.IPullAuditEventsInvoker, IDisposable +{ + private readonly ConcurrentDictionary _channels = new(StringComparer.Ordinal); + private readonly CommunicationOptions _options; + + /// + /// Creates the invoker using default . + /// + public GrpcPullAuditEventsInvoker() + : this(new CommunicationOptions()) + { + } + + /// + /// Creates the invoker, applying the configured gRPC keepalive settings to + /// every channel it opens. + /// + /// Communication options supplying gRPC keepalive timings. 
+    public GrpcPullAuditEventsInvoker(CommunicationOptions options)
+    {
+        _options = options ?? throw new ArgumentNullException(nameof(options));
+    }
+
+    /// <inheritdoc />
+    public async Task InvokeAsync(
+        string endpoint, ProtoPullRequest request, CancellationToken ct)
+    {
+        // Reuse the cached channel for this endpoint; the generated client is a
+        // thin wrapper over it, so it is created per call.
+        var channel = GetOrCreateChannel(endpoint);
+        var client = new SiteStreamService.SiteStreamServiceClient(channel);
+
+        // The call object is disposed once the response has been awaited.
+        using var call = client.PullAuditEventsAsync(request, cancellationToken: ct);
+        return await call.ResponseAsync.ConfigureAwait(false);
+    }
+
+    // Race-safe channel cache. ConcurrentDictionary.GetOrAdd(key, valueFactory)
+    // does NOT serialize the factory, so two concurrent first dials of the same
+    // endpoint can both build a GrpcChannel (each holds an HTTP/2 connection
+    // pool) and the loser would leak. Create-then-GetOrAdd-then-dispose-if-lost
+    // mirrors SiteStreamGrpcClientFactory: only the channel actually installed
+    // survives; a channel that lost the race is disposed immediately.
+    private GrpcChannel GetOrCreateChannel(string endpoint)
+    {
+        if (!_channels.TryGetValue(endpoint, out var channel))
+        {
+            var created = CreateChannel(endpoint);
+            channel = _channels.GetOrAdd(endpoint, created);
+            if (!ReferenceEquals(channel, created))
+            {
+                created.Dispose();
+            }
+        }
+        return channel;
+    }
+
+    // Builds a channel whose SocketsHttpHandler carries the configured keepalive
+    // timings. DisposeHttpClient = true hands ownership of that handler to the
+    // channel: GrpcChannelOptions.DisposeHttpClient defaults to false, in which
+    // case GrpcChannel.Dispose() leaves a caller-supplied HttpHandler (and its
+    // connection pool) undisposed, so neither the race-loser dispose in
+    // GetOrCreateChannel nor Dispose() would release the handler.
+    private GrpcChannel CreateChannel(string endpoint) =>
+        GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
+        {
+            HttpHandler = new SocketsHttpHandler
+            {
+                KeepAlivePingDelay = _options.GrpcKeepAlivePingDelay,
+                KeepAlivePingTimeout = _options.GrpcKeepAlivePingTimeout,
+                KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always,
+            },
+            DisposeHttpClient = true,
+        });
+
+    /// Disposes all cached channels.
+ public void Dispose() + { + foreach (var channel in _channels.Values) + { + channel.Dispose(); + } + _channels.Clear(); + } +} diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs new file mode 100644 index 00000000..350ee1ac --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/GrpcPullSiteCallsClient.cs @@ -0,0 +1,304 @@ +using System.Collections.Concurrent; +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Grpc.Net.Client; +using Microsoft.Extensions.Logging; +using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; +using ZB.MOM.WW.ScadaBridge.Communication; +using ZB.MOM.WW.ScadaBridge.Communication.Grpc; +using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest; +using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse; +using PullSiteCallsResponse = ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration.PullSiteCallsResponse; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; + +/// +/// Production (Site Call Audit #22) that the +/// central reconciliation tick (a separate follow-up component) uses to pull the +/// next batch of cached-call operational rows from a site over the +/// PullSiteCalls unary gRPC RPC served by SiteStreamGrpcServer. +/// A near-exact sibling of . +/// +/// +/// +/// Endpoint resolution. The caller passes only a siteId; this +/// client resolves it to a gRPC authority via +/// () on every call so a NodeA→NodeB +/// failover flip or an edited site address takes effect on the next tick. A site +/// with no registered endpoint yields an empty response (no dial). +/// +/// +/// SourceSite re-stamp. The site leaves +/// SiteCallOperationalDto.SourceSite empty (the tracking store has no +/// site-id column). 
This client is the authority that knows which site it +/// dialed, so it re-stamps the mapped from +/// siteId — the same "re-stamp from the forwarder's own id" pattern the +/// site push path uses. +/// +/// +/// Fault tolerance. Per the contract, +/// tolerable transport faults (, +/// , , +/// bare / SocketException) are caught +/// and collapsed to an empty response so one offline site never sinks the rest +/// of the reconciliation tick. Any other transport/protocol fault is also +/// swallowed to empty: reconciliation is best-effort. Per-row DTO mapping faults +/// (e.g. a single unparseable TrackedOperationId) are narrower still — +/// the offending row is skipped+logged and the rest of the batch is returned. +/// +/// +/// Testability. The unary call is reached through the +/// seam. Production binds +/// (one cached +/// per endpoint, keepalive from ); unit tests +/// inject a fake invoker so no real HTTP/2 endpoint is required. +/// +/// +public sealed class GrpcPullSiteCallsClient : IPullSiteCallsClient +{ + private readonly ISiteEnumerator _sites; + private readonly IPullSiteCallsInvoker _invoker; + private readonly ILogger _logger; + + /// + /// Creates the client over the given site enumerator and unary-call invoker. + /// + /// Resolves a siteId to its gRPC endpoint. + /// Seam that issues the PullSiteCalls unary RPC against a resolved endpoint. + /// Logger for transport-fault diagnostics. + public GrpcPullSiteCallsClient( + ISiteEnumerator sites, + IPullSiteCallsInvoker invoker, + ILogger logger) + { + _sites = sites ?? throw new ArgumentNullException(nameof(sites)); + _invoker = invoker ?? throw new ArgumentNullException(nameof(invoker)); + _logger = logger ?? 
throw new ArgumentNullException(nameof(logger)); + } + + /// + public async Task PullAsync( + string siteId, + DateTime sinceUtc, + int batchSize, + CancellationToken ct) + { + var endpoint = await ResolveEndpointAsync(siteId, ct).ConfigureAwait(false); + if (endpoint is null) + { + // No gRPC address registered for the site — a configuration decision + // (mirrors ISiteEnumerator's own contract), not a runtime error, so + // there is simply nothing to pull. + _logger.LogDebug( + "PullSiteCalls skipped: no gRPC endpoint registered for site {SiteId}.", siteId); + return Empty; + } + + var request = new ProtoPullRequest + { + // ReadChangedSinceAsync treats DateTime.MinValue as "from the start"; + // EnsureUtc keeps Timestamp.FromDateTime happy (it requires UTC kind). + SinceUtc = Timestamp.FromDateTime(EnsureUtc(sinceUtc)), + BatchSize = batchSize, + }; + + ProtoPullResponse reply; + try + { + reply = await _invoker.InvokeAsync(endpoint, request, ct).ConfigureAwait(false); + } + catch (RpcException ex) when (IsTolerable(ex.StatusCode)) + { + _logger.LogDebug(ex, + "PullSiteCalls tolerable transport fault for site {SiteId} ({Endpoint}): {Status}. Returning empty batch.", + siteId, endpoint, ex.StatusCode); + return Empty; + } + catch (Exception ex) when (ex is HttpRequestException or System.Net.Sockets.SocketException) + { + _logger.LogDebug(ex, + "PullSiteCalls connection-layer fault for site {SiteId} ({Endpoint}). Returning empty batch.", + siteId, endpoint); + return Empty; + } + catch (OperationCanceledException) + { + // Reconciliation tick cancelled — caller token (host shutdown) or an + // internal gRPC deadline / linked-CTS cancellation. Both tolerable for + // a best-effort pull; collapse to empty rather than landing noisily in + // the catch-all below. + return Empty; + } + catch (Exception ex) + { + // Any other fault. Reconciliation is best-effort; swallow to empty + // rather than throw — the (future) actor's per-site guard would only + // re-catch it. 
+ _logger.LogWarning(ex, + "PullSiteCalls unexpected fault for site {SiteId} ({Endpoint}). Returning empty batch.", + siteId, endpoint); + return Empty; + } + + // Map proto DTOs to central SiteCall entities PER-ROW so one malformed + // operational (e.g. an unparseable TrackedOperationId) is skipped+logged + // rather than sinking the whole batch through the outer catch-all. Each + // survivor is re-stamped with SourceSite from the dialed siteId (the site + // leaves it empty). + var siteCalls = new List(reply.Operationals.Count); + foreach (var dto in reply.Operationals) + { + try + { + var sc = SiteCallDtoMapper.FromDto(dto) with { SourceSite = siteId }; + siteCalls.Add(sc); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "PullSiteCalls dropped a malformed operational row from site {SiteId} (id='{Id}'); continuing with the rest of the batch.", + siteId, dto.TrackedOperationId); + } + } + + // Order oldest-first by UpdatedAtUtc (the wire is already ordered by the + // site read, but the contract is explicit, so sort defensively). + siteCalls.Sort((a, b) => a.UpdatedAtUtc.CompareTo(b.UpdatedAtUtc)); + + return new PullSiteCallsResponse(siteCalls, reply.MoreAvailable); + } + + private async Task ResolveEndpointAsync(string siteId, CancellationToken ct) + { + var sites = await _sites.EnumerateAsync(ct).ConfigureAwait(false); + foreach (var site in sites) + { + if (string.Equals(site.SiteId, siteId, StringComparison.Ordinal) && + !string.IsNullOrWhiteSpace(site.GrpcEndpoint)) + { + return site.GrpcEndpoint; + } + } + return null; + } + + private static readonly PullSiteCallsResponse Empty = + new(Array.Empty(), MoreAvailable: false); + + private static bool IsTolerable(StatusCode code) => code is + StatusCode.Unavailable or + StatusCode.DeadlineExceeded or + StatusCode.Cancelled; + + // All ScadaBridge timestamps are UTC by invariant. 
A non-UTC cursor (the + // reconciliation cursor starts at DateTime.MinValue, Kind=Unspecified) is + // treated AS UTC — never ToUniversalTime()-converted: on a host with a + // positive UTC offset MinValue.ToUniversalTime() underflows and + // Timestamp.FromDateTime throws, crashing the first pull for every site. + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc ? value : DateTime.SpecifyKind(value, DateTimeKind.Utc); + + /// + /// Seam over the PullSiteCalls unary gRPC call against a resolved site + /// endpoint. Extracted so can be + /// unit-tested without a real . Production binds + /// . + /// + public interface IPullSiteCallsInvoker + { + /// + /// Issues the PullSiteCalls unary RPC against . + /// May throw / + /// on transport faults — the caller classifies and swallows tolerable ones. + /// + /// The site gRPC authority (e.g. http://site-a:8083). + /// The wire-format pull request. + /// Cancellation token. + /// The wire-format pull response. + Task InvokeAsync(string endpoint, ProtoPullRequest request, CancellationToken ct); + } +} + +/// +/// Production : caches +/// one per endpoint (keepalive from +/// , mirroring SiteStreamGrpcClient) and +/// issues the unary PullSiteCallsAsync call. The cache is keyed by +/// endpoint string, so a changed site address (NodeA→NodeB failover flip / an +/// edited gRPC address) is reached as soon as the resolver hands the new endpoint +/// to . The channel for a previous address lingers idle +/// until (idle channels hold no streams — a minor cache +/// footprint cost, not a correctness or liveness gap). Sibling of +/// . +/// +public sealed class GrpcPullSiteCallsInvoker + : GrpcPullSiteCallsClient.IPullSiteCallsInvoker, IDisposable +{ + private readonly ConcurrentDictionary _channels = new(StringComparer.Ordinal); + private readonly CommunicationOptions _options; + + /// Creates the invoker using default . 
+    public GrpcPullSiteCallsInvoker()
+        : this(new CommunicationOptions())
+    {
+    }
+
+    /// <summary>
+    /// Creates the invoker, applying the configured gRPC keepalive settings to
+    /// every channel it opens.
+    /// </summary>
+    /// <param name="options">Communication options supplying gRPC keepalive timings.</param>
+    public GrpcPullSiteCallsInvoker(CommunicationOptions options)
+    {
+        _options = options ?? throw new ArgumentNullException(nameof(options));
+    }
+
+    /// <inheritdoc />
+    public async Task InvokeAsync(
+        string endpoint, ProtoPullRequest request, CancellationToken ct)
+    {
+        // Reuse the cached channel for this endpoint; the generated client is a
+        // thin wrapper over it, so it is created per call.
+        var channel = GetOrCreateChannel(endpoint);
+        var client = new SiteStreamService.SiteStreamServiceClient(channel);
+
+        // The call object is disposed once the response has been awaited.
+        using var call = client.PullSiteCallsAsync(request, cancellationToken: ct);
+        return await call.ResponseAsync.ConfigureAwait(false);
+    }
+
+    // Race-safe channel cache (create-then-GetOrAdd-then-dispose-if-lost): two
+    // concurrent first dials of the same endpoint can both build a GrpcChannel;
+    // only the channel actually installed survives, the loser is disposed.
+    // Mirrors SiteStreamGrpcClientFactory / GrpcPullAuditEventsInvoker.
+    private GrpcChannel GetOrCreateChannel(string endpoint)
+    {
+        if (!_channels.TryGetValue(endpoint, out var channel))
+        {
+            var created = CreateChannel(endpoint);
+            channel = _channels.GetOrAdd(endpoint, created);
+            if (!ReferenceEquals(channel, created))
+            {
+                created.Dispose();
+            }
+        }
+        return channel;
+    }
+
+    // Builds a channel whose SocketsHttpHandler carries the configured keepalive
+    // timings. DisposeHttpClient = true hands ownership of that handler to the
+    // channel: GrpcChannelOptions.DisposeHttpClient defaults to false, in which
+    // case GrpcChannel.Dispose() leaves a caller-supplied HttpHandler (and its
+    // connection pool) undisposed, so neither the race-loser dispose in
+    // GetOrCreateChannel nor Dispose() would release the handler.
+    private GrpcChannel CreateChannel(string endpoint) =>
+        GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
+        {
+            HttpHandler = new SocketsHttpHandler
+            {
+                KeepAlivePingDelay = _options.GrpcKeepAlivePingDelay,
+                KeepAlivePingTimeout = _options.GrpcKeepAlivePingTimeout,
+                KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always,
+            },
+            DisposeHttpClient = true,
+        });
+
+    /// Disposes all cached channels.
+ public void Dispose() + { + foreach (var channel in _channels.Values) + { + channel.Dispose(); + } + _channels.Clear(); + } +} diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/IPullSiteCallsClient.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/IPullSiteCallsClient.cs new file mode 100644 index 00000000..c22d5706 --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/IPullSiteCallsClient.cs @@ -0,0 +1,57 @@ +using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; + +/// +/// Mockable abstraction over the central-side PullSiteCalls gRPC client +/// surface used by the Site Call Audit (#22) reconciliation tick to fetch the +/// next batch of cached-call operational rows from a specific site — the +/// documented periodic self-heal pull that backfills the eventually-consistent +/// central SiteCalls mirror when best-effort push telemetry is lost. +/// Extracted so the (separate, follow-up) reconciliation actor can be +/// unit-tested against an in-memory stub without standing up a real +/// GrpcChannel per site. +/// +/// +/// +/// The home is ZB.MOM.WW.ScadaBridge.AuditLog.Central rather than the +/// ZB.MOM.WW.ScadaBridge.SiteCallAudit project so it can reuse the +/// / endpoint-resolution +/// abstraction that already lives here (and that the sibling +/// uses) — SiteCallAudit does not reference +/// AuditLog, so hosting the client there would mean duplicating the enumerator. +/// This mirrors the decision to keep in +/// ZB.MOM.WW.ScadaBridge.Communication. +/// +/// +/// Implementations MUST NOT throw on transport faults the reconciliation tick +/// can tolerate (connection refused, deadline exceeded, cancellation) — one +/// offline site must never sink the rest of the tick. 
The +/// are returned oldest-first by +/// UpdatedAtUtc with the SourceSite re-stamped from the dialed +/// site id (the site leaves it empty, being unaware of its own id), and a +/// MoreAvailable flag the caller uses to decide whether to fire another +/// pull immediately. +/// +/// +public interface IPullSiteCallsClient +{ + /// + /// Issues a PullSiteCalls RPC against the site whose gRPC endpoint is + /// registered against . Returns the next batch of + /// rows + /// ordered oldest-first (with SourceSite re-stamped from + /// ) AND a MoreAvailable flag the caller uses + /// to decide whether to fire another pull immediately. + /// + /// The identifier of the site to pull cached-call operational rows from. + /// Only rows with an UpdatedAtUtc at or after this cursor time are returned. + /// Maximum number of rows to return per call. + /// Cancellation token. + /// A task that resolves to the next reconciliation batch with a MoreAvailable flag. + Task PullAsync( + string siteId, + DateTime sinceUtc, + int batchSize, + CancellationToken ct); +} diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs index cc8cae1f..25a5a4c7 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/ISiteEnumerator.cs @@ -9,11 +9,12 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; /// /// The production implementation wraps ISiteRepository.GetAllSitesAsync /// and projects each Site to a using the -/// site's configured GrpcNodeAAddress (falling back to -/// GrpcNodeBAddress when NodeA is unset). Sites with NO gRPC address -/// configured are silently skipped — the reconciliation pull cannot reach -/// them, but absence of an address is a configuration decision, not a runtime -/// error. +/// site's configured GrpcNodeAAddress. 
This is a NodeA-only first cut: +/// sites with a blank GrpcNodeAAddress are silently SKIPPED — the +/// reconciliation pull cannot reach them, but absence of an address is a +/// configuration decision, not a runtime error. NodeB-fallback endpoint +/// selection (dial NodeB when NodeA is unset/unreachable) is a follow-up +/// (mirrors the comment in SiteEnumerator.cs). /// public interface ISiteEnumerator { diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs index fb08bc57..8c6a297b 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationActor.cs @@ -182,6 +182,10 @@ public class SiteAuditReconciliationActor : ReceiveActor IReadOnlyList sites; try { + // No ambient CancellationToken in a ReceiveActor message handler — + // CancellationToken.None (the EnumerateAsync default) is intentional. + // The work is bounded by the 5-min reconciliation tick plus the + // 10s graceful-stop drain on PhaseClusterLeave. sites = await _sites.EnumerateAsync().ConfigureAwait(false); } catch (Exception ex) diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs index 31796ad9..b58b3fb4 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteAuditReconciliationOptions.cs @@ -31,18 +31,45 @@ public sealed class SiteAuditReconciliationOptions /// /// Test-only override for finer control over the tick cadence than /// whole-second resolution allows. When non-null, takes precedence over - /// . Not bound from config — - /// production config exposes - /// only. + /// AND bypasses the + /// minimum clamp (so tests can use + /// millisecond cadences). 
Production config exposes + /// only and never sets this + /// knob — but because the options class is Bind-ed wholesale, a + /// config value at AuditLog:Reconciliation:ReconciliationIntervalOverride + /// WOULD bind if present; operators must not set it. /// public TimeSpan? ReconciliationIntervalOverride { get; set; } /// - /// Resolves the effective tick interval, honouring the test override when - /// set. Falls back to . + /// Minimum interval the config-bound + /// can resolve to. Clamps a misconfigured ReconciliationIntervalSeconds: 0 + /// (or a negative value) away from , which would make + /// Akka's ScheduleTellRepeatedlyCancelable spin. The test-only + /// bypasses this clamp so unit tests + /// can still drop the cadence to milliseconds. /// - public TimeSpan ReconciliationInterval => - ReconciliationIntervalOverride ?? TimeSpan.FromSeconds(ReconciliationIntervalSeconds); + private static readonly TimeSpan MinConfiguredInterval = TimeSpan.FromSeconds(1); + + /// + /// Resolves the effective tick interval, honouring the test override when + /// set. Falls back to , clamped to at + /// least so a zero/negative config value can + /// never yield (which would spin the scheduler). + /// + public TimeSpan ReconciliationInterval + { + get + { + if (ReconciliationIntervalOverride is { } overrideValue) + { + return overrideValue; + } + + var resolved = TimeSpan.FromSeconds(ReconciliationIntervalSeconds); + return resolved < MinConfiguredInterval ? 
MinConfiguredInterval : resolved; + } + } /// /// Maximum number of diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs new file mode 100644 index 00000000..357ee4ee --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/Central/SiteEnumerator.cs @@ -0,0 +1,77 @@ +using Microsoft.Extensions.DependencyInjection; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central; + +/// +/// Production backing the central +/// . Enumerates the configured sites +/// from the config DB via and +/// projects each site to a using the site's +/// SiteIdentifier as the cursor key and its GrpcNodeAAddress as +/// the dial target. +/// +/// +/// +/// Scope-per-call. is a SCOPED EF Core +/// service (registered by AddConfigurationDatabase); resolving it from +/// the root provider would fail DI scope validation. The enumerator therefore +/// takes the root and opens one +/// CreateAsyncScope per call — mirroring the +/// per-tick scope pattern in . +/// +/// +/// Blank-address skip. Sites with no GrpcNodeAAddress configured +/// are silently skipped: the reconciliation pull cannot dial them, but absence +/// of an address is a configuration decision, not a runtime error (per the +/// contract). +/// +/// +/// NodeA-only first cut. This implementation always uses NodeA's gRPC +/// address. NodeA/NodeB failover endpoint selection (dial NodeB when NodeA is +/// unreachable) is a follow-up — the shape already +/// carries a single endpoint, so failover will live in the puller/client, not +/// here. +/// +/// +public sealed class SiteEnumerator : ISiteEnumerator +{ + private readonly IServiceProvider _services; + + /// + /// Initializes the enumerator with the root service provider used to open a + /// fresh DI scope per enumeration call. + /// + /// Root service provider for resolving the scoped . 
+ public SiteEnumerator(IServiceProvider services) + { + ArgumentNullException.ThrowIfNull(services); + _services = services; + } + + /// + public async Task> EnumerateAsync(CancellationToken ct = default) + { + await using var scope = _services.CreateAsyncScope(); + var repository = scope.ServiceProvider.GetRequiredService(); + + var sites = await repository.GetAllSitesAsync(ct).ConfigureAwait(false); + + var entries = new List(sites.Count); + foreach (var site in sites) + { + // First cut: NodeA's gRPC address is the dial target. NodeA/NodeB + // failover endpoint selection is a follow-up. + if (string.IsNullOrWhiteSpace(site.GrpcNodeAAddress)) + { + continue; + } + + // The IsNullOrWhiteSpace guard above proves GrpcNodeAAddress is + // non-null here; explicit null-forgiving for clarity. + entries.Add(new SiteEntry(site.SiteIdentifier, site.GrpcNodeAAddress!)); + } + + return entries; + } +} diff --git a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs index 6b1e0255..631200a1 100644 --- a/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.AuditLog/ServiceCollectionExtensions.cs @@ -50,6 +50,12 @@ public static class ServiceCollectionExtensions /// Configuration section bound to . public const string PartitionMaintenanceSectionName = "AuditLog:PartitionMaintenance"; + /// Configuration section bound to . + public const string PurgeSectionName = "AuditLog:Purge"; + + /// Configuration section bound to . 
+ public const string ReconciliationSectionName = "AuditLog:Reconciliation"; + /// /// Registers the Audit Log (#23) component services: options, the site /// SQLite writer chain (primary + ring fallback + failure-counter sink), @@ -327,6 +333,24 @@ public static class ServiceCollectionExtensions .Bind(config.GetSection(PartitionMaintenanceSectionName)); services.AddHostedService(); + // I1 (review): bind the two central-singleton options HERE rather than in + // AddAuditLogCentralReconciliationClient. AkkaHostedService.RegisterCentralActors + // resolves IOptions / + // via GetRequiredService when it wires the AuditLogPurgeActor + + // SiteAuditReconciliationActor singletons; AddAuditLogCentralMaintenance is + // ALWAYS called on the central path (the reconciliation-client helper is the + // one that could in principle be dropped), so binding the options here means + // the singletons get a valid IOptions even if the gRPC-client helper is not + // wired — instead of a cryptic InvalidOperationException at GetRequiredService. + // Defaults are fine when the section is absent (24 h purge cadence / + // 5 min reconciliation tick); production exposes IntervalHours / + // ReconciliationIntervalSeconds only — the test-only *Override knobs are + // not intended to be set from config (see the options classes' remarks). + services.AddOptions() + .Bind(config.GetSection(PurgeSectionName)); + services.AddOptions() + .Bind(config.GetSection(ReconciliationSectionName)); + // M6 Bundle E (T8 + T9): central health snapshot — a single object // that owns the CentralAuditWriteFailures + AuditRedactionFailure // Interlocked counters AND surfaces them on @@ -362,4 +386,118 @@ public static class ServiceCollectionExtensions return services; } + + /// + /// Audit Log (#23) M6 — central-only registration of the production + /// () + /// and its unary-call invoker () used + /// by to pull reconciliation + /// batches from each site over the PullAuditEvents gRPC RPC. 
+ /// + /// + /// + /// Kept out of — which also runs on site + /// composition roots — because the client dials sites and resolves + /// (a central-only collaborator wired + /// alongside the reconciliation singleton). Folding it into + /// would register a site-dialing client on every + /// site host, violating the "every Add* call is safe from any + /// composition root" invariant. This helper is the central analogue of + /// . + /// + /// + /// The binds with default + /// + /// keepalive unless an IOptions<CommunicationOptions> is + /// already registered, in which case the configured timings flow through — + /// matching how SiteStreamGrpcClientFactory takes its keepalive from + /// the same options. + /// + /// + /// The production (, + /// wrapping the scoped ISiteRepository) IS registered here — so the + /// singleton wired in the Host can + /// resolve its enumerator + gRPC client from this central-only helper. Keeping + /// the enumerator on this central path preserves the "every Add* call is + /// safe from any composition root" invariant: a site host never calls this + /// helper, so it never registers a site-dialing enumerator. The + /// + + /// bindings live in instead (I1 + /// review fix) — that helper is unconditionally called on the central path, so + /// the two maintenance singletons get a valid IOptions even if this + /// gRPC-client helper is ever dropped. + /// + /// + /// The service collection to register into. + /// Application configuration used to bind the gRPC client's communication options (purge + reconciliation options are bound by ). + /// The same for chaining. + public static IServiceCollection AddAuditLogCentralReconciliationClient( + this IServiceCollection services, + IConfiguration config) + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(config); + + // Production ISiteEnumerator: projects the config-DB Site rows into the + // reconciliation targets the SiteAuditReconciliationActor polls. 
Scoped + // ISiteRepository is resolved per call inside the enumerator, so the + // singleton takes the ROOT provider (mirrors the per-tick scope pattern + // in SiteAuditReconciliationActor / AuditLogPurgeActor). + services.TryAddSingleton(sp => new SiteEnumerator(sp)); + + // I1 (review): the AuditLogPurgeOptions / SiteAuditReconciliationOptions + // bindings moved to AddAuditLogCentralMaintenance — that helper is always + // called on the central path, so the two maintenance singletons resolve a + // valid IOptions even if this gRPC-client helper is ever dropped. Keep the + // ISiteEnumerator + gRPC client registrations here (they dial sites and are + // central-only by design). + + // The invoker owns the per-endpoint GrpcChannel cache, so it must be a + // singleton — a fresh invoker per resolution would leak channels. + // Resolve CommunicationOptions if present (the central Host binds it), + // otherwise fall back to defaults so this helper stays standalone. + services.TryAddSingleton(sp => + { + var options = sp + .GetService>(); + return options is null + ? new GrpcPullAuditEventsInvoker() + : new GrpcPullAuditEventsInvoker(options.Value); + }); + services.TryAddSingleton( + sp => sp.GetRequiredService()); + + services.TryAddSingleton(sp => new GrpcPullAuditEventsClient( + sp.GetRequiredService(), + sp.GetRequiredService(), + sp.GetRequiredService>())); + + // Site Call Audit (#22) reconciliation pull client — central-only, the + // sibling of the audit pull client above. Lives here (not in the + // SiteCallAudit project) so it can reuse the central-only + // ISiteEnumerator registered just above; SiteCallAudit does not + // reference AuditLog. The invoker owns the per-endpoint GrpcChannel + // cache, so it must be a singleton (a fresh invoker per resolution + // would leak channels). CommunicationOptions flow through when bound by + // the central Host, else defaults — mirrors the audit invoker. 
+ services.TryAddSingleton(sp => + { + var options = sp + .GetService>(); + return options is null + ? new GrpcPullSiteCallsInvoker() + : new GrpcPullSiteCallsInvoker(options.Value); + }); + services.TryAddSingleton( + sp => sp.GetRequiredService()); + + services.TryAddSingleton(sp => new GrpcPullSiteCallsClient( + sp.GetRequiredService(), + sp.GetRequiredService(), + sp.GetRequiredService>())); + + return services; + } } diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs index b85bff5c..1d69023a 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Interfaces/Services/IOperationTrackingStore.cs @@ -118,4 +118,40 @@ public interface IOperationTrackingStore Task PurgeTerminalAsync( DateTime olderThanUtc, CancellationToken ct = default); + + /// + /// Reconciliation read (Site Call Audit #22): return tracking rows whose + /// UpdatedAtUtc is at or after as + /// projections, ordered by + /// UpdatedAtUtc ascending and capped at . + /// This is the site-side feed for central's PullSiteCalls RPC — the + /// documented periodic self-heal pull that backfills the eventually-consistent + /// central SiteCalls mirror when best-effort push telemetry is lost. + /// + /// + /// + /// The lower bound is inclusive so a caller can resume from the last + /// returned UpdatedAtUtc without skipping a row that shares that + /// instant; central ingest is insert-if-not-exists then upsert-on-newer, so + /// re-reading the boundary row is a harmless no-op. The oldest-first cap lets + /// the caller advance the cursor monotonically across follow-up pulls. + /// + /// + /// is left as the empty string: + /// the site id is not a tracking-store column, and the central client re-stamps + /// it from the siteId it dialed (the only authority that knows which + /// site the rows came from). 
is + /// projected from the row's Kind (DbWriteCached → DbOutbound, + /// otherwise ApiOutbound) and + /// from TargetSummary. + /// + /// + /// Inclusive lower bound on UpdatedAtUtc; reads from the start. + /// Maximum number of rows to return (oldest first). + /// Cancellation token. + /// The matching rows projected to , oldest-first, capped at . + Task> ReadChangedSinceAsync( + DateTime sinceUtc, + int batchSize, + CancellationToken ct = default); } diff --git a/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Integration/PullSiteCallsResponse.cs b/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Integration/PullSiteCallsResponse.cs new file mode 100644 index 00000000..fa3949cb --- /dev/null +++ b/src/ZB.MOM.WW.ScadaBridge.Commons/Messages/Integration/PullSiteCallsResponse.cs @@ -0,0 +1,17 @@ +using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; + +namespace ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration; + +/// +/// Site Call Audit (#22) periodic reconciliation pull response: the next batch of +/// site cached-call operational rows (the eventually-consistent SiteCalls +/// mirror's self-heal feed) plus a flag signalling +/// the caller to advance the watermark and pull again. Mirrors +/// ; carries the central +/// entity the ingest path upserts. See Component-SiteCallAudit.md. +/// +/// The next batch of operational rows, ordered oldest-first by . +/// True when the site saturated the requested batch size — the caller should advance the cursor and pull again. 
+public sealed record PullSiteCallsResponse( + IReadOnlyList SiteCalls, + bool MoreAvailable); diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs index ec7a0dbd..265a37eb 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteCallDtoMapper.cs @@ -1,5 +1,6 @@ using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; using ZB.MOM.WW.ScadaBridge.Commons.Types; +using Timestamp = Google.Protobuf.WellKnownTypes.Timestamp; namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc; @@ -20,10 +21,15 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc; /// Mirrors the sibling . /// /// -/// Only the DTO→entity direction is provided: nothing in the system maps a -/// back onto the wire (sites emit the operational state -/// from SiteCallOperational, never from the central -/// entity), so an entity→DTO method would be dead code. +/// Two directions are provided. rehydrates the central +/// entity central writes into the SiteCalls table. +/// projects a site-local +/// onto the wire — used by the Site Call Audit (#22) PullSiteCalls +/// reconciliation handler (the central→site self-heal pull). The +/// entity itself is never mapped back onto the wire: +/// sites emit operational state from , never +/// from the central , so a SiteCall→DTO method +/// would be dead code. /// /// /// String nullability convention: proto3 scalar strings cannot be absent, so the @@ -70,4 +76,54 @@ public static class SiteCallDtoMapper IngestedAtUtc = DateTime.UtcNow, // overwritten by AuditLogIngestActor }; } + + /// + /// Projects a site-local onto its + /// wire-format DTO for the Site Call Audit (#22) PullSiteCalls + /// reconciliation RPC. 
The inverse of ; null + /// / + /// collapse to empty strings (proto3 scalar strings cannot be absent), while + /// the nullable HttpStatus and TerminalAtUtc stay unset on the + /// wire so true-null semantics survive the round-trip back through + /// . + /// + /// The site-local operational state to project to wire format. + /// A populated ready for transmission. + public static SiteCallOperationalDto ToDto(SiteCallOperational operational) + { + ArgumentNullException.ThrowIfNull(operational); + + var dto = new SiteCallOperationalDto + { + TrackedOperationId = operational.TrackedOperationId.ToString(), + Channel = operational.Channel, + Target = operational.Target, + SourceSite = operational.SourceSite, + SourceNode = operational.SourceNode ?? string.Empty, + Status = operational.Status, + RetryCount = operational.RetryCount, + LastError = operational.LastError ?? string.Empty, + CreatedAtUtc = Timestamp.FromDateTime(EnsureUtc(operational.CreatedAtUtc)), + UpdatedAtUtc = Timestamp.FromDateTime(EnsureUtc(operational.UpdatedAtUtc)), + }; + + if (operational.HttpStatus.HasValue) + { + dto.HttpStatus = operational.HttpStatus.Value; + } + + if (operational.TerminalAtUtc.HasValue) + { + dto.TerminalAtUtc = Timestamp.FromDateTime(EnsureUtc(operational.TerminalAtUtc.Value)); + } + + return dto; + } + + // All ScadaBridge timestamps are UTC by invariant; Timestamp.FromDateTime + // requires UTC kind. Specify (never convert) so a row read back from SQLite + // with Kind=Utc passes through and a defensively-unspecified value is + // treated as the UTC it already is. Mirrors AuditEventDtoMapper.EnsureUtc. + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc ? 
value : DateTime.SpecifyKind(value, DateTimeKind.Utc); } diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs index 7cb82444..7aedd140 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/Grpc/SiteStreamGrpcServer.cs @@ -5,7 +5,9 @@ using Grpc.Core; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ZB.MOM.WW.Audit; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services; +using ZB.MOM.WW.ScadaBridge.Commons.Types; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit; using ZB.MOM.WW.ScadaBridge.Commons.Observability; using GrpcStatus = Grpc.Core.Status; @@ -48,6 +50,14 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase // the missing queue as "nothing to ship" and returns an empty response so // central retries on its next reconciliation cycle. private ISiteAuditQueue? _siteAuditQueue; + // Site Call Audit (#22): site-local operation-tracking store handed in by + // AkkaHostedService on site roles so the central reconciliation puller's + // PullSiteCalls RPC can read tracking rows changed since a cursor. Null + // when not wired (central-only host or test composing the server in + // isolation) — the handler treats the missing store as "nothing to ship" + // and returns an empty response so central retries on its next cycle. + // Mirrors _siteAuditQueue. + private IOperationTrackingStore? 
_operationTrackingStore; /// /// Test-only constructor — kept internal so the DI container sees a @@ -137,6 +147,21 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase _siteAuditQueue = queue; } + /// + /// Hands the site-local (the same + /// OperationTrackingStore singleton that backs + /// Tracking.Status(id) on the script thread) to the gRPC server so + /// the Site Call Audit (#22) RPC can serve + /// central's reconciliation pulls. Mirrors : + /// wired post-construction because the store and the gRPC server are both + /// DI singletons brought up in independent orders on site startup. + /// + /// The site operation-tracking store for serving reconciliation pulls. + public void SetOperationTrackingStore(IOperationTrackingStore store) + { + _operationTrackingStore = store; + } + /// /// Host-017 / REQ-HOST-7: signals the gRPC server to begin its part of the /// site shutdown sequence — refuse new @@ -432,7 +457,9 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase // sinceUtc defaults to DateTime.MinValue when the wrapper is absent — // i.e. "pull from the beginning of recorded history", which is the // intended behaviour for the very first reconciliation cycle. - var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue; + var since = request.SinceUtc is not null + ? 
DateTime.SpecifyKind(request.SinceUtc.ToDateTime(), DateTimeKind.Utc) + : DateTime.MinValue; IReadOnlyList events; try @@ -488,6 +515,69 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase return response; } + /// + public override async Task PullSiteCalls( + PullSiteCallsRequest request, + ServerCallContext context) + { + var store = _operationTrackingStore; + if (store is null) + { + _logger.LogWarning( + "PullSiteCalls invoked before SetOperationTrackingStore was called; returning empty response."); + return new PullSiteCallsResponse(); + } + + if (request.BatchSize <= 0) + { + // Mirrors PullAuditEvents: reject malformed requests cleanly with + // InvalidArgument so the caller doesn't see a generic RpcException + // from the underlying SQLite parameter validation. + throw new RpcException(new GrpcStatus( + StatusCode.InvalidArgument, "batch_size must be > 0")); + } + + // since_utc defaults to DateTime.MinValue when the wrapper is absent — + // i.e. "pull from the beginning of recorded history", the intended + // behaviour for the very first reconciliation cycle. + var since = request.SinceUtc is not null + ? DateTime.SpecifyKind(request.SinceUtc.ToDateTime(), DateTimeKind.Utc) + : DateTime.MinValue; + + IReadOnlyList operationals; + try + { + operationals = await store.ReadChangedSinceAsync( + since, request.BatchSize, context.CancellationToken); + } + catch (Exception ex) + { + // Best-effort, like PullAuditEvents: a read fault must never abort + // the reconciliation tick — central retries on its next cycle. + _logger.LogError(ex, + "ReadChangedSinceAsync failed for since={Since} batch={Batch}; returning empty response.", + since, request.BatchSize); + return new PullSiteCallsResponse(); + } + + var response = new PullSiteCallsResponse + { + // batch_size saturated → tell central to issue a follow-up pull with + // an advanced cursor. 
The site doesn't compute the cursor — central + // walks it forward from the last returned UpdatedAtUtc. Unlike + // PullAuditEvents there is no MarkReconciled step: the tracking store + // is the operational source of truth and the central SiteCalls mirror + // is upsert-on-newer, so re-reading rows is a harmless no-op. + MoreAvailable = operationals.Count >= request.BatchSize, + }; + foreach (var op in operationals) + { + response.Operationals.Add(SiteCallDtoMapper.ToDto(op)); + } + + return response; + } + /// /// Tracks a single active stream so cleanup only removes its own entry. /// diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto b/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto index 6beae55c..df9ee7af 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/Protos/sitestream.proto @@ -10,6 +10,7 @@ service SiteStreamService { rpc IngestAuditEvents(AuditEventBatch) returns (IngestAck); rpc IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck); rpc PullAuditEvents(PullAuditEventsRequest) returns (PullAuditEventsResponse); + rpc PullSiteCalls(PullSiteCallsRequest) returns (PullSiteCallsResponse); } message InstanceStreamRequest { @@ -157,3 +158,20 @@ message PullAuditEventsResponse { repeated AuditEventDto events = 1; bool more_available = 2; } + +// Site Call Audit (#22) reconciliation pull: central→site request for any +// site-local operation-tracking rows whose UpdatedAtUtc >= since_utc — the +// self-heal feed that backfills the eventually-consistent central SiteCalls +// mirror when best-effort push telemetry is lost. Mirrors PullAuditEvents +// but is a SEPARATE RPC (the tracking store is the operational source of +// truth, distinct from the site audit queue). more_available signals +// batch_size was saturated so the caller advances since_utc and pulls again. 
+message PullSiteCallsRequest { + google.protobuf.Timestamp since_utc = 1; + int32 batch_size = 2; +} + +message PullSiteCallsResponse { + repeated SiteCallOperationalDto operationals = 1; + bool more_available = 2; +} diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs index cebfccab..a0e79003 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/Sitestream.cs @@ -81,23 +81,30 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { "dWVzdBItCglzaW5jZV91dGMYASABKAsyGi5nb29nbGUucHJvdG9idWYuVGlt", "ZXN0YW1wEhIKCmJhdGNoX3NpemUYAiABKAUiXAoXUHVsbEF1ZGl0RXZlbnRz", "UmVzcG9uc2USKQoGZXZlbnRzGAEgAygLMhkuc2l0ZXN0cmVhbS5BdWRpdEV2", - "ZW50RHRvEhYKDm1vcmVfYXZhaWxhYmxlGAIgASgIKlwKB1F1YWxpdHkSFwoT", - "UVVBTElUWV9VTlNQRUNJRklFRBAAEhAKDFFVQUxJVFlfR09PRBABEhUKEVFV", - "QUxJVFlfVU5DRVJUQUlOEAISDwoLUVVBTElUWV9CQUQQAypdCg5BbGFybVN0", - "YXRlRW51bRIbChdBTEFSTV9TVEFURV9VTlNQRUNJRklFRBAAEhYKEkFMQVJN", - "X1NUQVRFX05PUk1BTBABEhYKEkFMQVJNX1NUQVRFX0FDVElWRRACKoUBCg5B", - "bGFybUxldmVsRW51bRIUChBBTEFSTV9MRVZFTF9OT05FEAASEwoPQUxBUk1f", - "TEVWRUxfTE9XEAESFwoTQUxBUk1fTEVWRUxfTE9XX0xPVxACEhQKEEFMQVJN", - "X0xFVkVMX0hJR0gQAxIZChVBTEFSTV9MRVZFTF9ISUdIX0hJR0gQBDLhAgoR", - "U2l0ZVN0cmVhbVNlcnZpY2USVQoRU3Vic2NyaWJlSW5zdGFuY2USIS5zaXRl", - "c3RyZWFtLkluc3RhbmNlU3RyZWFtUmVxdWVzdBobLnNpdGVzdHJlYW0uU2l0", - "ZVN0cmVhbUV2ZW50MAESRwoRSW5nZXN0QXVkaXRFdmVudHMSGy5zaXRlc3Ry", - "ZWFtLkF1ZGl0RXZlbnRCYXRjaBoVLnNpdGVzdHJlYW0uSW5nZXN0QWNrElAK", - "FUluZ2VzdENhY2hlZFRlbGVtZXRyeRIgLnNpdGVzdHJlYW0uQ2FjaGVkVGVs", - "ZW1ldHJ5QmF0Y2gaFS5zaXRlc3RyZWFtLkluZ2VzdEFjaxJaCg9QdWxsQXVk", - "aXRFdmVudHMSIi5zaXRlc3RyZWFtLlB1bGxBdWRpdEV2ZW50c1JlcXVlc3Qa", - "Iy5zaXRlc3RyZWFtLlB1bGxBdWRpdEV2ZW50c1Jlc3BvbnNlQiuqAihaQi5N", - "T00uV1cuU2NhZGFCcmlkZ2UuQ29tbXVuaWNhdGlvbi5HcnBjYgZwcm90bzM=")); + 
"ZW50RHRvEhYKDm1vcmVfYXZhaWxhYmxlGAIgASgIIlkKFFB1bGxTaXRlQ2Fs", + "bHNSZXF1ZXN0Ei0KCXNpbmNlX3V0YxgBIAEoCzIaLmdvb2dsZS5wcm90b2J1", + "Zi5UaW1lc3RhbXASEgoKYmF0Y2hfc2l6ZRgCIAEoBSJpChVQdWxsU2l0ZUNh", + "bGxzUmVzcG9uc2USOAoMb3BlcmF0aW9uYWxzGAEgAygLMiIuc2l0ZXN0cmVh", + "bS5TaXRlQ2FsbE9wZXJhdGlvbmFsRHRvEhYKDm1vcmVfYXZhaWxhYmxlGAIg", + "ASgIKlwKB1F1YWxpdHkSFwoTUVVBTElUWV9VTlNQRUNJRklFRBAAEhAKDFFV", + "QUxJVFlfR09PRBABEhUKEVFVQUxJVFlfVU5DRVJUQUlOEAISDwoLUVVBTElU", + "WV9CQUQQAypdCg5BbGFybVN0YXRlRW51bRIbChdBTEFSTV9TVEFURV9VTlNQ", + "RUNJRklFRBAAEhYKEkFMQVJNX1NUQVRFX05PUk1BTBABEhYKEkFMQVJNX1NU", + "QVRFX0FDVElWRRACKoUBCg5BbGFybUxldmVsRW51bRIUChBBTEFSTV9MRVZF", + "TF9OT05FEAASEwoPQUxBUk1fTEVWRUxfTE9XEAESFwoTQUxBUk1fTEVWRUxf", + "TE9XX0xPVxACEhQKEEFMQVJNX0xFVkVMX0hJR0gQAxIZChVBTEFSTV9MRVZF", + "TF9ISUdIX0hJR0gQBDK3AwoRU2l0ZVN0cmVhbVNlcnZpY2USVQoRU3Vic2Ny", + "aWJlSW5zdGFuY2USIS5zaXRlc3RyZWFtLkluc3RhbmNlU3RyZWFtUmVxdWVz", + "dBobLnNpdGVzdHJlYW0uU2l0ZVN0cmVhbUV2ZW50MAESRwoRSW5nZXN0QXVk", + "aXRFdmVudHMSGy5zaXRlc3RyZWFtLkF1ZGl0RXZlbnRCYXRjaBoVLnNpdGVz", + "dHJlYW0uSW5nZXN0QWNrElAKFUluZ2VzdENhY2hlZFRlbGVtZXRyeRIgLnNp", + "dGVzdHJlYW0uQ2FjaGVkVGVsZW1ldHJ5QmF0Y2gaFS5zaXRlc3RyZWFtLklu", + "Z2VzdEFjaxJaCg9QdWxsQXVkaXRFdmVudHMSIi5zaXRlc3RyZWFtLlB1bGxB", + "dWRpdEV2ZW50c1JlcXVlc3QaIy5zaXRlc3RyZWFtLlB1bGxBdWRpdEV2ZW50", + "c1Jlc3BvbnNlElQKDVB1bGxTaXRlQ2FsbHMSIC5zaXRlc3RyZWFtLlB1bGxT", + "aXRlQ2FsbHNSZXF1ZXN0GiEuc2l0ZXN0cmVhbS5QdWxsU2l0ZUNhbGxzUmVz", + "cG9uc2VCK6oCKFpCLk1PTS5XVy5TY2FkYUJyaWRnZS5Db21tdW5pY2F0aW9u", + "LkdycGNiBnByb3RvMw==")); descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData, new pbr::FileDescriptor[] { global::Google.Protobuf.WellKnownTypes.TimestampReflection.Descriptor, global::Google.Protobuf.WellKnownTypes.WrappersReflection.Descriptor, }, new pbr::GeneratedClrTypeInfo(new[] {typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.Quality), typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.AlarmStateEnum), 
typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.AlarmLevelEnum), }, null, new pbr::GeneratedClrTypeInfo[] { @@ -112,7 +119,9 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryPacket), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryPacket.Parser, new[]{ "AuditEvent", "Operational" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryBatch), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.CachedTelemetryBatch.Parser, new[]{ "Packets" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest.Parser, new[]{ "SinceUtc", "BatchSize" }, null, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse.Parser, new[]{ "Events", "MoreAvailable" }, null, null, null, null) + new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse.Parser, new[]{ "Events", "MoreAvailable" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest.Parser, new[]{ "SinceUtc", "BatchSize" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse), global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse.Parser, new[]{ "Operationals", "MoreAvailable" }, null, null, null, null) })); } #endregion @@ -5064,6 +5073,483 @@ namespace 
ZB.MOM.WW.ScadaBridge.Communication.Grpc { } + /// + /// Site Call Audit (#22) reconciliation pull: central→site request for any + /// site-local operation-tracking rows whose UpdatedAtUtc >= since_utc — the + /// self-heal feed that backfills the eventually-consistent central SiteCalls + /// mirror when best-effort push telemetry is lost. Mirrors PullAuditEvents + /// but is a SEPARATE RPC (the tracking store is the operational source of + /// truth, distinct from the site audit queue). more_available signals + /// batch_size was saturated so the caller advances since_utc and pulls again. + /// + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullSiteCallsRequest : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullSiteCallsRequest()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[12]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsRequest() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + 
[global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsRequest(PullSiteCallsRequest other) : this() { + sinceUtc_ = other.sinceUtc_ != null ? other.sinceUtc_.Clone() : null; + batchSize_ = other.batchSize_; + _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsRequest Clone() { + return new PullSiteCallsRequest(this); + } + + /// Field number for the "since_utc" field. + public const int SinceUtcFieldNumber = 1; + private global::Google.Protobuf.WellKnownTypes.Timestamp sinceUtc_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public global::Google.Protobuf.WellKnownTypes.Timestamp SinceUtc { + get { return sinceUtc_; } + set { + sinceUtc_ = value; + } + } + + /// Field number for the "batch_size" field. + public const int BatchSizeFieldNumber = 2; + private int batchSize_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int BatchSize { + get { return batchSize_; } + set { + batchSize_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullSiteCallsRequest); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullSiteCallsRequest other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if (!object.Equals(SinceUtc, other.SinceUtc)) return false; + if (BatchSize != other.BatchSize) return false; + return Equals(_unknownFields, other._unknownFields); + } + + 
[global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + if (sinceUtc_ != null) hash ^= SinceUtc.GetHashCode(); + if (BatchSize != 0) hash ^= BatchSize.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + if (sinceUtc_ != null) { + size += 1 + pb::CodedOutputStream.ComputeMessageSize(SinceUtc); + } + if (BatchSize != 0) { + size += 1 + 
pb::CodedOutputStream.ComputeInt32Size(BatchSize); + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullSiteCallsRequest other) { + if (other == null) { + return; + } + if (other.sinceUtc_ != null) { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + SinceUtc.MergeFrom(other.SinceUtc); + } + if (other.BatchSize != 0) { + BatchSize = other.BatchSize; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. 
+ return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + } + #endif + + } + + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullSiteCallsResponse : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullSiteCallsResponse()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[13]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsResponse() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsResponse(PullSiteCallsResponse other) : this() { + operationals_ = other.operationals_.Clone(); + moreAvailable_ = other.moreAvailable_; + _unknownFields = 
pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullSiteCallsResponse Clone() { + return new PullSiteCallsResponse(this); + } + + /// Field number for the "operationals" field. + public const int OperationalsFieldNumber = 1; + private static readonly pb::FieldCodec _repeated_operationals_codec + = pb::FieldCodec.ForMessage(10, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteCallOperationalDto.Parser); + private readonly pbc::RepeatedField operationals_ = new pbc::RepeatedField(); + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public pbc::RepeatedField Operationals { + get { return operationals_; } + } + + /// Field number for the "more_available" field. + public const int MoreAvailableFieldNumber = 2; + private bool moreAvailable_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool MoreAvailable { + get { return moreAvailable_; } + set { + moreAvailable_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullSiteCallsResponse); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullSiteCallsResponse other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if(!operationals_.Equals(other.operationals_)) return false; + if (MoreAvailable != other.MoreAvailable) return false; + return Equals(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + 
[global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + hash ^= operationals_.GetHashCode(); + if (MoreAvailable != false) hash ^= MoreAvailable.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + operationals_.WriteTo(output, _repeated_operationals_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + operationals_.WriteTo(ref output, _repeated_operationals_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + size += operationals_.CalculateSize(_repeated_operationals_codec); + if (MoreAvailable != false) { + size += 1 + 1; + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + 
[global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullSiteCallsResponse other) { + if (other == null) { + return; + } + operationals_.Add(other.operationals_); + if (other.MoreAvailable != false) { + MoreAvailable = other.MoreAvailable; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + operationals_.AddEntriesFrom(input, _repeated_operationals_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. 
+ return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + operationals_.AddEntriesFrom(ref input, _repeated_operationals_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + } + #endif + + } + #endregion } diff --git a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs index 8993b16a..b57de38e 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Communication/SiteStreamGrpc/SitestreamGrpc.cs @@ -59,6 +59,10 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsRequest = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsResponse = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullSiteCallsRequest = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullSiteCallsResponse = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, 
global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Method __Method_SubscribeInstance = new grpc::Method( @@ -92,6 +96,14 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { __Marshaller_sitestream_PullAuditEventsRequest, __Marshaller_sitestream_PullAuditEventsResponse); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Method __Method_PullSiteCalls = new grpc::Method( + grpc::MethodType.Unary, + __ServiceName, + "PullSiteCalls", + __Marshaller_sitestream_PullSiteCallsRequest, + __Marshaller_sitestream_PullSiteCallsResponse); + /// Service descriptor public static global::Google.Protobuf.Reflection.ServiceDescriptor Descriptor { @@ -126,6 +138,12 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::System.Threading.Tasks.Task PullSiteCalls(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::ServerCallContext context) + { + throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); + } + } /// Client for SiteStreamService @@ -225,6 +243,26 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { { return CallInvoker.AsyncUnaryCall(__Method_PullAuditEvents, null, options, request); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse PullSiteCalls(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::Metadata headers = null, global::System.DateTime? 
deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullSiteCalls(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse PullSiteCalls(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::CallOptions options) + { + return CallInvoker.BlockingUnaryCall(__Method_PullSiteCalls, null, options, request); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullSiteCallsAsync(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::Metadata headers = null, global::System.DateTime? deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullSiteCallsAsync(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullSiteCallsAsync(global::ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest request, grpc::CallOptions options) + { + return CallInvoker.AsyncUnaryCall(__Method_PullSiteCalls, null, options, request); + } /// Creates a new instance of client from given ClientBaseConfiguration. 
[global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] protected override SiteStreamServiceClient NewInstance(ClientBaseConfiguration configuration) @@ -242,7 +280,8 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { .AddMethod(__Method_SubscribeInstance, serviceImpl.SubscribeInstance) .AddMethod(__Method_IngestAuditEvents, serviceImpl.IngestAuditEvents) .AddMethod(__Method_IngestCachedTelemetry, serviceImpl.IngestCachedTelemetry) - .AddMethod(__Method_PullAuditEvents, serviceImpl.PullAuditEvents).Build(); + .AddMethod(__Method_PullAuditEvents, serviceImpl.PullAuditEvents) + .AddMethod(__Method_PullSiteCalls, serviceImpl.PullSiteCalls).Build(); } /// Register service method with a service binder with or without implementation. Useful when customizing the service binding logic. @@ -256,6 +295,7 @@ namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc { serviceBinder.AddMethod(__Method_IngestAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestAuditEvents)); serviceBinder.AddMethod(__Method_IngestCachedTelemetry, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestCachedTelemetry)); serviceBinder.AddMethod(__Method_PullAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.PullAuditEvents)); + serviceBinder.AddMethod(__Method_PullSiteCalls, serviceImpl == null ? 
null : new grpc::UnaryServerMethod(serviceImpl.PullSiteCalls)); } } diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs index ab93ce56..2c45478f 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs @@ -588,6 +588,117 @@ akka {{ _logger.LogInformation( "SiteCallAuditActor singleton created and registered with CentralCommunicationActor"); + // Audit Log (#23) M6 Bundle B/C — start the two central-only maintenance + // singletons that were fully implemented but never instantiated: the + // daily AuditLog partition-switch purge (AuditLogPurgeActor) and the + // periodic per-site audit-event reconciliation pull + // (SiteAuditReconciliationActor). Both mirror the SiteCallAudit / + // NotificationOutbox singleton pattern above: a ClusterSingletonManager + // pins the actor to the active central node, a ClusterSingletonProxy + // gives a stable address, and a PhaseClusterLeave graceful-stop task + // drains the in-flight tick before handover. Options + the production + // ISiteEnumerator + IPullAuditEventsClient come from + // AddAuditLogCentralReconciliationClient (central composition root only). + // Both actors take the root IServiceProvider and open their own per-tick + // DI scope because IAuditLogRepository / ISiteRepository are scoped EF + // Core services. 
+ var auditPurgeLogger = _serviceProvider.GetRequiredService() + .CreateLogger(); + var auditPurgeOptions = _serviceProvider + .GetRequiredService>(); + var auditLogOptions = _serviceProvider + .GetRequiredService>(); + + var auditPurgeSingletonProps = ClusterSingletonManager.Props( + singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.AuditLogPurgeActor( + _serviceProvider, + auditPurgeOptions, + auditLogOptions, + auditPurgeLogger)), + terminationMessage: PoisonPill.Instance, + settings: ClusterSingletonManagerSettings.Create(_actorSystem!) + .WithSingletonName("audit-log-purge")); + var auditPurgeSingletonManager = + _actorSystem!.ActorOf(auditPurgeSingletonProps, "audit-log-purge-singleton"); + + var auditPurgeShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem); + auditPurgeShutdown.AddTask( + Akka.Actor.CoordinatedShutdown.PhaseClusterLeave, + "drain-audit-log-purge-singleton", + async () => + { + try + { + await auditPurgeSingletonManager.GracefulStop(TimeSpan.FromSeconds(10)); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "AuditLogPurge singleton did not drain within the graceful-stop " + + "timeout; falling through to PoisonPill handover"); + } + return Akka.Done.Instance; + }); + + var auditPurgeProxyProps = ClusterSingletonProxy.Props( + singletonManagerPath: "/user/audit-log-purge-singleton", + settings: ClusterSingletonProxySettings.Create(_actorSystem) + .WithSingletonName("audit-log-purge")); + _actorSystem.ActorOf(auditPurgeProxyProps, "audit-log-purge-proxy"); + _logger.LogInformation("AuditLogPurgeActor singleton created"); + + // SiteAuditReconciliationActor — self-healing fallback puller. Resolves + // its production ISiteEnumerator (config-DB Site projection) and + // IPullAuditEventsClient (gRPC) from the central reconciliation-client + // helper registered in Program.cs. 
+ var auditReconLogger = _serviceProvider.GetRequiredService() + .CreateLogger(); + var auditReconOptions = _serviceProvider + .GetRequiredService>(); + var auditReconSites = _serviceProvider + .GetRequiredService(); + var auditReconClient = _serviceProvider + .GetRequiredService(); + + var auditReconSingletonProps = ClusterSingletonManager.Props( + singletonProps: Props.Create(() => new ZB.MOM.WW.ScadaBridge.AuditLog.Central.SiteAuditReconciliationActor( + auditReconSites, + auditReconClient, + _serviceProvider, + auditReconOptions, + auditReconLogger)), + terminationMessage: PoisonPill.Instance, + settings: ClusterSingletonManagerSettings.Create(_actorSystem!) + .WithSingletonName("site-audit-reconciliation")); + var auditReconSingletonManager = + _actorSystem!.ActorOf(auditReconSingletonProps, "site-audit-reconciliation-singleton"); + + var auditReconShutdown = Akka.Actor.CoordinatedShutdown.Get(_actorSystem); + auditReconShutdown.AddTask( + Akka.Actor.CoordinatedShutdown.PhaseClusterLeave, + "drain-site-audit-reconciliation-singleton", + async () => + { + try + { + await auditReconSingletonManager.GracefulStop(TimeSpan.FromSeconds(10)); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "SiteAuditReconciliation singleton did not drain within the graceful-stop " + + "timeout; falling through to PoisonPill handover"); + } + return Akka.Done.Instance; + }); + + var auditReconProxyProps = ClusterSingletonProxy.Props( + singletonManagerPath: "/user/site-audit-reconciliation-singleton", + settings: ClusterSingletonProxySettings.Create(_actorSystem) + .WithSingletonName("site-audit-reconciliation")); + _actorSystem.ActorOf(auditReconProxyProps, "site-audit-reconciliation-proxy"); + _logger.LogInformation("SiteAuditReconciliationActor singleton created"); + _logger.LogInformation("Central actors registered. 
CentralCommunicationActor created."); } @@ -898,6 +1009,18 @@ akka {{ // direction one-way (Host knows both; Communication doesn't reach back // into AuditLog). grpcServer?.SetSiteAuditQueue(siteAuditQueue); + // Site Call Audit (#22): hand the site-local OperationTrackingStore to + // the gRPC server so the PullSiteCalls reconciliation RPC can serve + // central's self-heal pulls. siteTrackingStore is resolved above with + // GetService — present on site composition roots, null on central — so + // wire the seam only when the store exists. Like SetSiteAuditQueue, both + // the store and the gRPC server are singletons; wiring here keeps the + // dependency direction one-way (Host knows both; Communication doesn't + // reach back into SiteRuntime). + if (siteTrackingStore is not null) + { + grpcServer?.SetOperationTrackingStore(siteTrackingStore); + } grpcServer?.SetReady(_actorSystem!); } } diff --git a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs index 8d3f36aa..5af0f3b5 100644 --- a/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs +++ b/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs @@ -97,6 +97,13 @@ try // pf_AuditLog_Month forward monthly. Depends on IPartitionMaintenance // (registered below by AddConfigurationDatabase). builder.Services.AddAuditLogCentralMaintenance(builder.Configuration); + // #23 M6 Bundle B/C — central-only registration backing the two + // maintenance singletons started in AkkaHostedService: the production + // ISiteEnumerator + IPullAuditEventsClient (gRPC) used by the + // SiteAuditReconciliationActor, plus the AuditLogPurgeOptions / + // SiteAuditReconciliationOptions bindings consumed by both singletons. + // Central-only by design (it dials sites), kept out of AddAuditLog. + builder.Services.AddAuditLogCentralReconciliationClient(builder.Configuration); // Site Call Audit (#22) — central node owns the SiteCallAuditActor // singleton (M3 Bundle F). 
The extension itself currently registers // nothing — actor Props are constructed inline in AkkaHostedService — diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs index 320a7227..b89ae01a 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs @@ -1,6 +1,7 @@ using Akka.Actor; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit; @@ -24,13 +25,17 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit; /// /// /// Implemented: direct telemetry ingest, -/// query, detail and KPI handlers (Task 4), and the central→site Retry/Discard -/// relay (Task 5 — the relay handlers live in this actor). Deferred (per -/// CLAUDE.md scope discipline — both land in a later follow-up): the periodic -/// per-site reconciliation puller that backfills lost telemetry, and the daily -/// terminal-row purge scheduler (the repository exposes -/// PurgeTerminalAsync but nothing in this module currently invokes it -/// on a schedule). +/// query, detail and KPI handlers (Task 4), the central→site Retry/Discard +/// relay (Task 5 — the relay handlers live in this actor), the periodic +/// per-site reconciliation puller that backfills lost telemetry (Piece A — +/// , the documented self-heal pull), and +/// the daily terminal-row purge scheduler (Piece B — +/// , which invokes +/// on a timer). Both +/// background timers are started in and gate on the +/// reconciliation collaborators ( + +/// ) being available — the repo-only test ctor +/// injects neither, so neither timer runs there. 
/// /// /// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" — @@ -68,6 +73,36 @@ public class SiteCallAuditActor : ReceiveActor private readonly SiteCallAuditOptions _options; private readonly ILogger _logger; + /// + /// Reconciliation collaborators (Piece A). The per-site self-heal pull + /// () and the site list + /// (). On the production path these are + /// resolved once from the root (central + /// singletons registered by AddAuditLogCentralReconciliationClient); + /// in the test path they are injected directly. They are null when + /// the actor was built via the repo-only test ctor — in that case the + /// reconciliation tick is NOT started (see ); + /// the purge tick gates on the same collaborators (see ). + /// + private readonly IPullSiteCallsClient? _pullClient; + private readonly ISiteEnumerator? _siteEnumerator; + + /// + /// Per-site reconciliation watermark — the highest + /// seen for that site on a previous + /// tick. The next tick asks for rows at or after this cursor; idempotent + /// monotonic swallows any + /// duplicate-with-same-timestamp rows. In-memory for the singleton's + /// lifetime — a failover / restart resets every cursor to + /// , which is conservative but correct + /// (the next tick re-pulls and idempotent upsert dedupes). Mirrors + /// SiteAuditReconciliationActor. + /// + private readonly Dictionary _reconciliationCursors = new(); + + private ICancelable? _reconciliationTimer; + private ICancelable? _purgeTimer; + /// /// Task 5 (#22): the central→site command transport — the /// CentralCommunicationActor, which owns the per-site @@ -87,6 +122,11 @@ public class SiteCallAuditActor : ReceiveActor /// across every message. Used by Bundle C's MSSQL-backed TestKit fixture. /// An optional lets a test pin the stuck/KPI /// windows; when omitted the production defaults apply. 
+ /// + /// This ctor injects NO reconciliation client/enumerator, so the + /// reconciliation tick is gated off (see ) + /// — the MSSQL-backed read/upsert tests must not fire phantom pulls. + /// /// /// Concrete repository instance to use for all messages. /// Logger for diagnostics and error reporting. @@ -106,6 +146,49 @@ public class SiteCallAuditActor : ReceiveActor RegisterHandlers(); } + /// + /// Test-mode constructor for the reconciliation tick (Piece A) — injects a + /// concrete repository PLUS the two reconciliation collaborators directly, + /// so the per-site self-heal pull is unit-testable in-memory without a DI + /// container or a live gRPC channel. Because the client + enumerator are + /// present, the reconciliation tick IS started; the purge tick is also + /// started (both gate on the collaborators being available — see + /// / ). + /// + /// Concrete repository instance used for upserts and purges. + /// Enumerates the sites to reconcile each tick. + /// Pull client used to fetch changed rows from each site. + /// Logger for diagnostics and error reporting. + /// Optional configuration overrides; production defaults apply when null. + /// + /// Public (not internal) because Akka's default ActivatorProducer + /// instantiates the actor via reflection with public-only binding flags — + /// an internal ctor yields a MissingMethodException at actor + /// creation. Distinguished from the production + /// ctor by its concrete-collaborator parameter list; only the test project + /// (or a host that hand-resolves the collaborators) constructs it this way. + /// + public SiteCallAuditActor( + ISiteCallAuditRepository repository, + ISiteEnumerator siteEnumerator, + IPullSiteCallsClient pullClient, + ILogger logger, + SiteCallAuditOptions? 
options = null) + { + ArgumentNullException.ThrowIfNull(repository); + ArgumentNullException.ThrowIfNull(siteEnumerator); + ArgumentNullException.ThrowIfNull(pullClient); + ArgumentNullException.ThrowIfNull(logger); + + _injectedRepository = repository; + _siteEnumerator = siteEnumerator; + _pullClient = pullClient; + _logger = logger; + _options = options ?? new SiteCallAuditOptions(); + + RegisterHandlers(); + } + /// /// Production constructor — resolves /// from a fresh DI scope per message because the repository is a scoped EF @@ -129,6 +212,17 @@ public class SiteCallAuditActor : ReceiveActor _options = options; _logger = logger; + // Reconciliation collaborators (Piece A) are central-only singletons + // registered by AddAuditLogCentralReconciliationClient — always on the + // central composition root (Program.cs). Resolve them once here (the + // actor itself is a long-lived singleton; the repository is the only + // scoped service and is still resolved per-tick/per-message). GetService + // (not GetRequiredService) so a host that somehow omits the helper + // degrades to "no reconciliation tick" rather than a startup crash — + // the tick startup gates on both being non-null. + _pullClient = serviceProvider.GetService(); + _siteEnumerator = serviceProvider.GetService(); + RegisterHandlers(); } @@ -154,6 +248,75 @@ public class SiteCallAuditActor : ReceiveActor }); Receive(HandleRetrySiteCall); Receive(HandleDiscardSiteCall); + + // Piece A/B (#22): self-ticks for the periodic reconciliation pull and + // the daily terminal-row purge. Handlers stay alive across faults via + // their own per-site / per-tick try/catch (mirroring the ingest path); + // the timers are only started when their collaborators are available. 
+ ReceiveAsync(_ => OnReconciliationTickAsync()); + ReceiveAsync(_ => OnPurgeTickAsync()); + } + + /// + protected override void PreStart() + { + base.PreStart(); + StartReconciliationTimer(); + StartPurgeTimer(); + } + + /// + protected override void PostStop() + { + _reconciliationTimer?.Cancel(); + _purgeTimer?.Cancel(); + base.PostStop(); + } + + /// + /// Starts the periodic reconciliation tick — but ONLY when both the pull + /// client and the site enumerator are available. The repo-only test ctor + /// injects neither, so the tick is gated off there (the MSSQL read/upsert + /// tests must not fire phantom pulls); the reconciliation test ctor and the + /// production ctor (which resolves both from the SP) start it. + /// + private void StartReconciliationTimer() + { + if (_pullClient is null || _siteEnumerator is null) + { + return; + } + + var interval = _options.ResolvedReconciliationInterval; + _reconciliationTimer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: ReconciliationTick.Instance, + sender: Self); + } + + /// + /// Starts the daily purge tick — gated on the same collaborator presence as + /// the reconciliation tick. The purge itself only needs the repository, but + /// gating both schedulers together keeps the repo-only test ctor (no + /// client/enumerator) free of BOTH background timers, so the MSSQL read/ + /// upsert tests see no scheduled side effects. 
+ /// + private void StartPurgeTimer() + { + if (_pullClient is null || _siteEnumerator is null) + { + return; + } + + var interval = _options.ResolvedPurgeInterval; + _purgeTimer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: PurgeTick.Instance, + sender: Self); } /// @@ -212,6 +375,228 @@ public class SiteCallAuditActor : ReceiveActor } } + // ── Piece A: periodic per-site reconciliation pull (self-heal) ── + + /// + /// One reconciliation pass: enumerate every known site and, per site, pull + /// changed rows since that site's cursor and upsert + /// them idempotently — the documented self-heal when best-effort gRPC push + /// telemetry is lost. This is a mirror, NOT a dispatcher: cached-call + /// delivery stays site-local; upserting reconciled rows only refreshes the + /// eventually-consistent central SiteCalls mirror. + /// + /// + /// Mirrors SiteAuditReconciliationActor's structure (per-site cursor, + /// per-site try/catch failure isolation, advance the cursor by the max + /// observed ) but is deliberately simpler: + /// no stalled-detection EventStream machinery — just cursor + pull + upsert + /// + advance. One DI scope per tick is opened and the same repository reused + /// across every site in that tick. + /// + private async Task OnReconciliationTickAsync() + { + // The collaborators are guaranteed non-null: the tick is only scheduled + // when both are present (StartReconciliationTimer). Assert via the + // local copies so a future refactor that drops the gate fails loudly. + var enumerator = _siteEnumerator!; + var client = _pullClient!; + + IReadOnlyList sites; + try + { + // No ambient CancellationToken in a ReceiveActor handler — None is + // intentional; the work is bounded by the reconciliation interval + // plus the singleton's graceful-stop drain on PhaseClusterLeave. 
+ sites = await enumerator.EnumerateAsync().ConfigureAwait(false); + } + catch (Exception ex) + { + _logger.LogError(ex, "SiteCallAudit site enumeration failed; skipping reconciliation tick."); + return; + } + + if (sites.Count == 0) + { + return; + } + + // AuditLog-003: open the scope INLINE with CreateAsyncScope + await using + // so the scoped EF Core repository (an IAsyncDisposable DbContext) disposes + // asynchronously at end of tick rather than blocking the Akka dispatcher + // thread on a synchronous Dispose() of pending connection cleanup — the tick + // holds the scope across many awaited UpsertAsync calls. Mirrors the sibling + // SiteAuditReconciliationActor.OnTickAsync. ResolveRepository() (sync Dispose) + // is retained for the synchronous message-handler paths. In the injected- + // repository test path there is no scope to open and the test repo is reused. + if (_injectedRepository is not null) + { + await ReconcileSitesAsync(sites, client, _injectedRepository).ConfigureAwait(false); + return; + } + + await using var scope = _serviceProvider!.CreateAsyncScope(); + var repository = scope.ServiceProvider.GetRequiredService(); + await ReconcileSitesAsync(sites, client, repository).ConfigureAwait(false); + } + + /// + /// Reconciles every site in the tick against a single resolved repository, + /// isolating per-site faults so one bad site never sinks the rest of the + /// pass (the failing site's cursor is left at its previous value so the next + /// tick retries the same window). + /// + private async Task ReconcileSitesAsync( + IReadOnlyList sites, IPullSiteCallsClient client, ISiteCallAuditRepository repository) + { + foreach (var site in sites) + { + try + { + await ReconcileSiteAsync(site, client, repository).ConfigureAwait(false); + } + catch (Exception ex) + { + // Failure-isolation invariant: one site's fault (transport, + // repository write) must NOT sink the rest of the tick. 
The + // failing site's cursor is left at its previous value so the + // next tick retries the same window. + _logger.LogWarning( + ex, + "SiteCallAudit reconciliation pull failed for site {SiteId}; other sites continue.", + site.SiteId); + } + } + } + + /// + /// Issues one PullSiteCalls RPC against the site, upserts the + /// returned rows idempotently, and advances the site's cursor to the maximum + /// observed. The pull client returns rows + /// oldest-first with SourceSite already re-stamped from the dialed + /// site id, so the actor upserts them verbatim (re-stamping + /// IngestedAtUtc at central persist time, as the telemetry path does). + /// + /// + /// + /// Coarse per-site retry — a deliberate divergence from + /// SiteAuditReconciliationActor. That sibling (AuditLog-004) tracks + /// a per-EventId attempt counter and permanently abandons a row after a + /// threshold so a single un-insertable row cannot block a site's cursor + /// forever. This actor deliberately does NOT: any throw inside the loop + /// propagates to 's per-site catch, + /// which leaves the site's cursor at its previous value, so the next tick + /// re-pulls the whole batch from since. A persistently-bad row therefore + /// holds the site's cursor and re-pulls the batch every tick. This is + /// acceptable here because is + /// monotonic and idempotent — re-pulling already-ingested rows is a cheap + /// no-op — and the SiteCalls table is an eventually-consistent mirror, + /// not the source of truth, so a slow site simply lags rather than corrupts. + /// + /// + /// Inclusive cursor boundary. The cursor is advanced to the maximum + /// seen, and the pull asks for rows at or + /// after it (since is >=, not >). The row whose + /// timestamp equals the cursor is therefore re-pulled on the next tick and + /// deduplicated by the idempotent monotonic upsert — the same inclusive-boundary + /// contract as SiteAuditReconciliationActor's cursor. 
+ /// + /// + private async Task ReconcileSiteAsync( + SiteEntry site, IPullSiteCallsClient client, ISiteCallAuditRepository repository) + { + var since = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue; + var response = await client + .PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None) + .ConfigureAwait(false); + + var maxUpdated = since; + var nowUtc = DateTime.UtcNow; + foreach (var row in response.SiteCalls) + { + // IngestedAtUtc is the "central ingested (or last refreshed) this + // row" stamp — owned by the central actor, exactly as OnUpsertAsync + // does for the telemetry path. Monotonic UpsertAsync makes a row + // already present (from a prior push) a silent no-op. + var siteCall = row with { IngestedAtUtc = nowUtc }; + await repository.UpsertAsync(siteCall).ConfigureAwait(false); + + if (row.UpdatedAtUtc > maxUpdated) + { + maxUpdated = row.UpdatedAtUtc; + } + } + + // Advance the cursor to the newest row seen. A MoreAvailable response + // means the site saturated the batch; the next tick continues draining + // from the advanced cursor (no immediate re-pull loop — the natural + // tick cadence drains the backlog, matching SiteAuditReconciliationActor). + _reconciliationCursors[site.SiteId] = maxUpdated; + } + + // ── Piece B: daily terminal-row purge scheduler ── + + /// + /// One purge pass: drops terminal SiteCalls rows whose + /// is older than + /// UtcNow - RetentionDays via + /// . Non-terminal + /// rows are never purged (enforced in the repository). The threshold is + /// computed each tick so an operator who lowers RetentionDays sees it + /// applied on the next purge without an actor restart. Mirrors + /// AuditLogPurgeActor's daily cadence + continue-on-error posture: a + /// purge fault is logged and swallowed so the singleton stays alive. 
+ /// + private async Task OnPurgeTickAsync() + { + var threshold = DateTime.UtcNow - TimeSpan.FromDays(_options.RetentionDays); + + // AuditLog-003: open the scope INLINE with CreateAsyncScope + await using + // so the scoped EF Core repository (an IAsyncDisposable DbContext) disposes + // asynchronously rather than blocking the Akka dispatcher thread on a + // synchronous Dispose(). Mirrors SiteAuditReconciliationActor; the + // injected-repository test path reuses the test repo with no scope. + if (_injectedRepository is not null) + { + await PurgeWithRepositoryAsync(_injectedRepository, threshold).ConfigureAwait(false); + return; + } + + await using var scope = _serviceProvider!.CreateAsyncScope(); + var repository = scope.ServiceProvider.GetRequiredService(); + await PurgeWithRepositoryAsync(repository, threshold).ConfigureAwait(false); + } + + /// + /// Runs one terminal-row purge against the resolved repository, logging and + /// swallowing any fault (continue-on-error) so a transient SQL failure or + /// contention never crashes the central singleton — the next tick retries + /// the same window. + /// + private async Task PurgeWithRepositoryAsync(ISiteCallAuditRepository repository, DateTime threshold) + { + try + { + var rowsDeleted = await repository.PurgeTerminalAsync(threshold).ConfigureAwait(false); + if (rowsDeleted > 0) + { + _logger.LogInformation( + "SiteCallAudit purged {RowsDeleted} terminal SiteCalls rows older than {ThresholdUtc:o}.", + rowsDeleted, + threshold); + } + } + catch (Exception ex) + { + // Continue-on-error: a purge fault (transient SQL failure, + // contention) must NOT crash the central singleton. The next tick + // retries the same window. 
+ _logger.LogError( + ex, + "SiteCallAudit terminal-row purge failed (threshold {ThresholdUtc:o}); will retry next tick.", + threshold); + } + } + // ── Task 4: read-side (query / detail / KPI) ── /// @@ -693,6 +1078,20 @@ public class SiteCallAuditActor : ReceiveActor { return string.IsNullOrWhiteSpace(value) ? null : value; } + + /// Self-tick triggering a reconciliation pass across all sites (Piece A). + internal sealed class ReconciliationTick + { + public static readonly ReconciliationTick Instance = new(); + private ReconciliationTick() { } + } + + /// Self-tick triggering a terminal-row purge pass (Piece B). + internal sealed class PurgeTick + { + public static readonly PurgeTick Instance = new(); + private PurgeTick() { } + } } /// diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs index a5db3102..317b29f9 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditOptions.cs @@ -1,11 +1,13 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit; /// -/// Configuration options for the Site Call Audit (#22) read-side: stuck-call -/// detection and KPI windowing. Mirrors the KPI-relevant subset of -/// NotificationOutboxOptions — the reconciliation, purge and dispatch -/// cadence options the Notification Outbox carries are not part of the Site -/// Call Audit read-side backend and are deliberately omitted here. +/// Configuration options for the Site Call Audit (#22): stuck-call detection + +/// KPI windowing for the read-side, plus the cadence/retention knobs for the +/// two central-singleton schedulers — the periodic per-site reconciliation +/// pull (self-heal for lost telemetry) and the daily terminal-row purge. +/// Mirrors the KPI-relevant subset of NotificationOutboxOptions and the +/// scheduler-cadence shape of SiteAuditReconciliationOptions / +/// AuditLogPurgeOptions. 
/// public class SiteCallAuditOptions { @@ -44,4 +46,99 @@ public class SiteCallAuditOptions /// /// public TimeSpan RelayTimeout { get; set; } = TimeSpan.FromSeconds(10); + + // ── Reconciliation tick (#22): periodic per-site self-heal pull ── + + /// + /// Period of the reconciliation tick. Each tick visits every known site + /// once, pulls changed SiteCall rows since a per-site cursor, and + /// upserts them idempotently — the documented self-heal when best-effort + /// push telemetry is lost. Default 5 minutes, matching the sibling + /// SiteAuditReconciliationOptions (#23) cadence. Clamped to at least + /// via . + /// + public TimeSpan ReconciliationInterval { get; set; } = TimeSpan.FromMinutes(5); + + /// + /// Test-only override for the reconciliation tick cadence — bypasses the + /// clamp so unit tests can drop the + /// cadence to milliseconds. Production config never sets this; leave null. + /// + public TimeSpan? ReconciliationIntervalOverride { get; set; } + + /// + /// Maximum number of SiteCall rows requested per PullSiteCalls + /// RPC. Default 500. A MoreAvailable=true response signals the cursor + /// advanced and the next tick should keep draining the backlog. + /// + public int ReconciliationBatchSize { get; set; } = 500; + + /// + /// Minimum interval the config-bound can + /// resolve to. Clamps a misconfigured 0 (or negative) value away from + /// , which would make Akka's + /// ScheduleTellRepeatedlyCancelable spin — the exact footgun flagged in + /// a prior review of the sibling reconciliation options. + /// + private static readonly TimeSpan MinReconciliationInterval = TimeSpan.FromSeconds(1); + + /// + /// Resolves the effective reconciliation tick interval: the test override + /// when set (bypassing the clamp), otherwise + /// clamped to at least so a + /// zero/negative config value can never yield . + /// + public TimeSpan ResolvedReconciliationInterval => + ReconciliationIntervalOverride is { } o + ? 
o + : ReconciliationInterval < MinReconciliationInterval + ? MinReconciliationInterval + : ReconciliationInterval; + + // ── Purge scheduler (#22): daily terminal-row purge ── + + /// + /// Period of the purge tick. Each tick drops terminal SiteCalls rows + /// older than the retention window via + /// . + /// Default 24 hours, matching AuditLogPurgeOptions. Clamped to at + /// least via . + /// + public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24); + + /// + /// Test-only override for the purge tick cadence — bypasses the + /// clamp so unit tests can drop the cadence + /// to milliseconds. Production config never sets this; leave null. + /// + public TimeSpan? PurgeIntervalOverride { get; set; } + + /// + /// Retention window for terminal rows. On each purge tick a row whose + /// TerminalAtUtc is older than UtcNow - RetentionDays is + /// deleted; non-terminal rows are never purged. Default 365 days, matching + /// the central audit-store retention policy. + /// + public int RetentionDays { get; set; } = 365; + + /// + /// Minimum interval the config-bound can resolve + /// to. Clamps a misconfigured 0 (or negative) value away from + /// for the same scheduler-spin reason as + /// ; the purge is daily so the floor + /// is a more generous 1 minute. + /// + private static readonly TimeSpan MinPurgeInterval = TimeSpan.FromMinutes(1); + + /// + /// Resolves the effective purge tick interval: the test override when set + /// (bypassing the clamp), otherwise clamped to at + /// least . + /// + public TimeSpan ResolvedPurgeInterval => + PurgeIntervalOverride is { } o + ? o + : PurgeInterval < MinPurgeInterval + ? 
MinPurgeInterval + : PurgeInterval; } diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj index cca4f3eb..c2de6728 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj +++ b/src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj @@ -29,6 +29,15 @@ the same transport every other central→site command uses. SiteEnvelope is defined in ZB.MOM.WW.ScadaBridge.Communication (no cycle: Communication does not reference SiteCallAudit). --> + + diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs index 07fc9ea4..630822ef 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/ISiteEventLogger.cs @@ -11,7 +11,7 @@ public interface ISiteEventLogger /// completes once the event is durably persisted and faults if /// the write fails, so callers that await it observe success or failure. 
/// - /// Category: script, alarm, deployment, connection, store_and_forward, instance_lifecycle + /// Category: script, alarm, deployment, connection, store_and_forward, instance_lifecycle, notification /// Info, Warning, or Error /// Optional instance ID associated with the event /// Source identifier, e.g., "ScriptActor:MonitorSpeed" diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs index bafd20d9..ca9e2db6 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/AlarmActor.cs @@ -1,10 +1,12 @@ using Akka.Actor; using Microsoft.CodeAnalysis.Scripting; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.HealthMonitoring; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; using System.Globalization; using System.Text.Json; @@ -37,6 +39,25 @@ public class AlarmActor : ReceiveActor private readonly SiteRuntimeOptions _options; private readonly ILogger _logger; private readonly ISiteHealthCollector? _healthCollector; + private readonly IServiceProvider? _serviceProvider; + + /// + /// M1.5: the optional site operational-event log, resolved once from + /// at construction and cached. The + /// registration is process-lifetime (a singleton), so resolving once on + /// the actor's own thread is both correct and cheaper than a per-event + /// GetService on the hot path. null when no provider was + /// supplied (the test/no-logging path) — then + /// no-ops. + /// + private readonly ISiteEventLogger? _siteEventLogger; + + /// + /// M1.5: priority at or above which a computed-alarm raise is logged as + /// Error to the site event log; below it, raises log as Warning. 
+ /// Mirrors the 0–1000 alarm-severity scale. + /// + private const int ErrorPriorityThreshold = 700; private AlarmState _currentState = AlarmState.Normal; /// @@ -83,6 +104,9 @@ public class AlarmActor : ReceiveActor /// Pre-compiled trigger expression, or null for non-expression triggers. /// Seed attribute snapshot so static attributes evaluate correctly at startup. /// Optional health collector for surfacing alarm execution metrics. + /// Optional DI service provider used to resolve the optional + /// for M1.5 alarm operational events. Fire-and-forget; + /// a logging failure never affects alarm evaluation. public AlarmActor( string alarmName, string instanceName, @@ -94,7 +118,8 @@ public class AlarmActor : ReceiveActor ILogger logger, Script? compiledTriggerExpression = null, IReadOnlyDictionary? initialAttributes = null, - ISiteHealthCollector? healthCollector = null) + ISiteHealthCollector? healthCollector = null, + IServiceProvider? serviceProvider = null) { _alarmName = alarmName; _instanceName = instanceName; @@ -103,6 +128,10 @@ public class AlarmActor : ReceiveActor _options = options; _logger = logger; _healthCollector = healthCollector; + _serviceProvider = serviceProvider; + // M1.5: resolve the optional site event logger once and cache it, + // rather than calling GetService on every alarm transition. + _siteEventLogger = serviceProvider?.GetService(); _priority = alarmConfig.PriorityLevel; _onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName; _onTriggerCompiledScript = onTriggerCompiledScript; @@ -208,6 +237,9 @@ public class AlarmActor : ReceiveActor _instanceName, _alarmName, AlarmState.Active, _priority, DateTimeOffset.UtcNow); _instanceActor.Tell(alarmChanged); + // M1.5: operational `alarm` event — raise. Severity by priority. 
+ LogAlarmEvent(RaiseSeverity(_priority), $"Alarm {_alarmName} activated (priority {_priority})"); + // Spawn AlarmExecutionActor if on-trigger script defined if (_onTriggerCompiledScript != null) { @@ -225,6 +257,9 @@ public class AlarmActor : ReceiveActor var alarmChanged = new AlarmStateChanged( _instanceName, _alarmName, AlarmState.Normal, _priority, DateTimeOffset.UtcNow); _instanceActor.Tell(alarmChanged); + + // M1.5: operational `alarm` event — return to normal. + LogAlarmEvent("Info", $"Alarm {_alarmName} cleared"); } } catch (Exception ex) @@ -265,6 +300,24 @@ public class AlarmActor : ReceiveActor }; _instanceActor.Tell(alarmChanged); + // M1.5: operational `alarm` event. Entering a band from Normal is a raise + // (severity by the band's priority); returning to None is a clear; a + // level-to-level escalation/de-escalation is an informational transition. + if (newLevel == AlarmLevel.None) + { + LogAlarmEvent("Info", $"Alarm {_alarmName} cleared ({previousLevel} → Normal)"); + } + else if (previousLevel == AlarmLevel.None) + { + LogAlarmEvent(RaiseSeverity(priority), + $"Alarm {_alarmName} activated at {newLevel} (priority {priority})"); + } + else + { + LogAlarmEvent("Info", + $"Alarm {_alarmName} transitioned {previousLevel} → {newLevel} (priority {priority})"); + } + if (previousLevel == AlarmLevel.None && newLevel != AlarmLevel.None && _onTriggerCompiledScript != null) @@ -273,6 +326,28 @@ public class AlarmActor : ReceiveActor } } + /// + /// M1.5: maps an alarm priority (0–1000) to a site-event severity for a + /// raise transition — Error at or above + /// , otherwise Warning. Clears and + /// inter-band transitions always log as Info. + /// + private static string RaiseSeverity(int priority) => + priority >= ErrorPriorityThreshold ? "Error" : "Warning"; + + /// + /// M1.5: fire-and-forget an alarm operational event to the optional + /// (resolved once at construction and cached + /// in ). 
Never awaited so a logging failure + /// cannot affect alarm evaluation (matching the established + /// ScriptActor/ScriptExecutionActor pattern). + /// + private void LogAlarmEvent(string severity, string message) + { + _ = _siteEventLogger?.LogEventAsync( + "alarm", severity, _instanceName, $"AlarmActor:{_alarmName}", message); + } + /// /// Returns the per-setpoint priority for the given level. Falls back to /// the alarm-level when the HiLo config did not diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs index 0e9dab79..10a6aae6 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs @@ -1,4 +1,5 @@ using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Artifacts; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView; @@ -10,6 +11,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management; using ZB.MOM.WW.ScadaBridge.Commons.Messages.ScriptExecution; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.HealthMonitoring; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; @@ -456,6 +458,10 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers { if (result.Success) { + // M1.6: operational `deployment` event — deploy succeeded. 
+ LogDeploymentEvent("Info", result.InstanceName, + $"Instance {result.InstanceName} deployed (deploymentId={result.DeploymentId})"); + result.OriginalSender.Tell(new DeploymentStatusResponse( result.DeploymentId, result.InstanceName, @@ -469,6 +475,11 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers "Failed to persist deployment {DeploymentId} for {Instance}: {Error}", result.DeploymentId, result.InstanceName, result.Error); + // M1.6: operational `deployment` event — deploy failed. + LogDeploymentEvent("Error", result.InstanceName, + $"Instance {result.InstanceName} deploy failed (deploymentId={result.DeploymentId})", + result.Error); + // Persistence failed — undo the optimistic actor creation and counter bump so // the site does not advertise an instance it cannot durably recover. if (_instanceActors.Remove(result.InstanceName, out var orphan)) @@ -504,7 +515,17 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _storage.SetInstanceEnabledAsync(instanceName, false).ContinueWith(t => { if (t.IsCompletedSuccessfully) + { _replicationActor?.Tell(new ReplicateConfigSetEnabled(instanceName, false)); + // M1.6: operational `deployment` event — disable succeeded. + LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} disabled"); + } + else + { + LogDeploymentEvent("Error", instanceName, + $"Instance {instanceName} disable failed", + t.Exception?.GetBaseException().Message); + } return new InstanceLifecycleResponse( command.CommandId, @@ -551,6 +572,9 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers if (result.Error != null || result.Config == null) { var error = result.Error ?? $"No deployed config found for {instanceName}"; + // M1.6: operational `deployment` event — enable failed. 
+ LogDeploymentEvent("Error", instanceName, + $"Instance {instanceName} enable failed", error); result.OriginalSender.Tell(new InstanceLifecycleResponse( result.Command.CommandId, instanceName, false, error, DateTimeOffset.UtcNow)); return; @@ -562,6 +586,9 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers } UpdateInstanceCounts(); + // M1.6: operational `deployment` event — enable succeeded. + LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} enabled"); + result.OriginalSender.Tell(new InstanceLifecycleResponse( result.Command.CommandId, instanceName, true, null, DateTimeOffset.UtcNow)); @@ -588,7 +615,17 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _storage.RemoveDeployedConfigAsync(instanceName).ContinueWith(t => { if (t.IsCompletedSuccessfully) + { _replicationActor?.Tell(new ReplicateConfigRemove(instanceName)); + // M1.6: operational `deployment` event — delete succeeded. + LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} deleted"); + } + else + { + LogDeploymentEvent("Error", instanceName, + $"Instance {instanceName} delete failed", + t.Exception?.GetBaseException().Message); + } return new InstanceLifecycleResponse( command.CommandId, @@ -601,6 +638,30 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _logger.LogInformation("Instance {Instance} deleted", instanceName); } + /// + /// M1.6: fire-and-forget a deployment operational event to the optional + /// on a deploy/enable/disable/delete outcome. + /// Resolved optionally and never awaited so a logging failure cannot affect the + /// deployment pipeline (matching the established ScriptActor/ScriptExecutionActor + /// pattern). + /// + /// Thread-safety: the disable () and delete + /// () paths call this from a + /// + /// continuation that runs on a thread-pool thread, NOT on the actor thread — + /// so it must touch only immutable, thread-safe state. 
It does: the only + /// field it reads is the readonly _serviceProvider captured at + /// construction (the resolved is a process + /// singleton). No actor-private mutable state is referenced, which is what + /// makes calling it off the actor thread safe. + /// + /// + private void LogDeploymentEvent(string severity, string instanceName, string message, string? details = null) + { + _ = _serviceProvider?.GetService()?.LogEventAsync( + "deployment", severity, instanceName, "DeploymentManagerActor", message, details); + } + /// /// DeploymentManager-006: answers a central query for the instance's /// currently-applied deployment identity. The site's deployed-config store diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs index 75909a8e..e6f9a3fc 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/InstanceActor.cs @@ -1,5 +1,6 @@ using Akka.Actor; using Microsoft.CodeAnalysis.Scripting; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DataConnection; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView; @@ -9,6 +10,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.HealthMonitoring; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming; @@ -164,6 +166,11 @@ public class InstanceActor : ReceiveActor base.PreStart(); _logger.LogInformation("InstanceActor started for {Instance}", _instanceUniqueName); + // M1.6: operational `instance_lifecycle` event — instance started. 
+ // An instance starts on deploy, on enable (DeploymentManager re-creates + // the actor), and on failover/restart; this single point covers them all. + LogLifecycleEvent($"Instance {_instanceUniqueName} started"); + // Asynchronously load static overrides from SQLite and pipe to self var self = Self; _storage.GetStaticOverridesAsync(_instanceUniqueName).ContinueWith(t => @@ -180,6 +187,29 @@ public class InstanceActor : ReceiveActor SubscribeToDcl(); } + /// + protected override void PostStop() + { + // M1.6: operational `instance_lifecycle` event — instance stopped. An + // instance stops on disable, delete, redeployment, and graceful shutdown; + // this single point covers them all. + LogLifecycleEvent($"Instance {_instanceUniqueName} stopped"); + base.PostStop(); + } + + /// + /// M1.6: fire-and-forget an instance_lifecycle operational event to the + /// optional . Resolved optionally and never + /// awaited so a logging failure cannot affect the instance lifecycle + /// (matching the established ScriptActor/ScriptExecutionActor pattern). 
+ /// + private void LogLifecycleEvent(string message) + { + _ = _serviceProvider?.GetService()?.LogEventAsync( + "instance_lifecycle", "Info", _instanceUniqueName, + $"InstanceActor:{_instanceUniqueName}", message); + } + /// protected override SupervisorStrategy SupervisorStrategy() { @@ -763,7 +793,8 @@ public class InstanceActor : ReceiveActor _logger, triggerExpression, attributeSnapshot, - _healthCollector)); + _healthCollector, + _serviceProvider)); var actorRef = Context.ActorOf(props, $"alarm-{alarm.CanonicalName}"); _alarmActors[alarm.CanonicalName] = actorRef; @@ -793,7 +824,8 @@ public class InstanceActor : ReceiveActor _storage, _options, _logger, - nativeKind)); + nativeKind, + _serviceProvider)); var actorRef = Context.ActorOf(props, $"native-alarm-{nativeSource.CanonicalName}"); _nativeAlarmActors[nativeSource.CanonicalName] = actorRef; diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs index 28747cea..63a354ed 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/NativeAlarmActor.cs @@ -1,11 +1,13 @@ using System.Text.Json; using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using ZB.MOM.WW.ScadaBridge.Commons.Messages.DataConnection; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming; using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; @@ -35,6 +37,14 @@ public class NativeAlarmActor : ReceiveActor private readonly SiteRuntimeOptions _options; private readonly ILogger _logger; private readonly AlarmKind _nativeKind; + private readonly IServiceProvider? 
_serviceProvider; + + /// + /// M1.5: severity at or above which a native-alarm raise is logged as + /// Error to the site event log; below it, raises log as Warning. + /// Mirrors the 0–1000 condition-severity scale. + /// + private const int ErrorSeverityThreshold = 700; /// Current mirrored conditions, keyed by source reference. private readonly Dictionary _alarms = new(); @@ -54,6 +64,9 @@ public class NativeAlarmActor : ReceiveActor /// Logger for diagnostics. /// Alarm kind to stamp on emitted events (OPC UA vs MxAccess); set by the /// Instance Actor from the connection protocol. Defaults to . + /// Optional DI service provider used to resolve the optional + /// for M1.5 alarm operational events. Fire-and-forget; + /// a logging failure never affects the mirror. public NativeAlarmActor( ResolvedNativeAlarmSource source, string instanceName, @@ -62,7 +75,8 @@ public class NativeAlarmActor : ReceiveActor SiteStorageService storage, SiteRuntimeOptions options, ILogger logger, - AlarmKind nativeKind = AlarmKind.NativeOpcUa) + AlarmKind nativeKind = AlarmKind.NativeOpcUa, + IServiceProvider? serviceProvider = null) { _source = source; _instanceName = instanceName; @@ -72,6 +86,7 @@ public class NativeAlarmActor : ReceiveActor _options = options; _logger = logger; _nativeKind = nativeKind; + _serviceProvider = serviceProvider; Receive(HandleRehydration); Receive(HandleTransition); @@ -150,7 +165,10 @@ public class NativeAlarmActor : ReceiveActor condition, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, null, row.LastTransitionAt, string.Empty, string.Empty); _alarms[row.SourceReference] = t; - Emit(t, t.Condition); + // M1.5: rehydration replays last-known state on (re)start — surface it + // upward for the DebugView but do NOT re-log it as a fresh operational + // event (it is not a live transition). 
+ Emit(t, t.Condition, logSiteEvent: false); } } @@ -194,7 +212,14 @@ public class NativeAlarmActor : ReceiveActor { _alarms[sourceRef] = t; PersistUpsert(t); - Emit(t, t.Condition); + // M1.5: a snapshot replay is a re-sync of the source's current + // active set on (re)subscribe, NOT a live transition — surface it + // upward for the DebugView but do NOT re-log an `alarm` operational + // event. Otherwise every DCL reconnect would re-emit an `alarm` + // event for every already-active native condition (the + // synthesised return-to-normal above IS a real state change and + // keeps logSiteEvent: true). + Emit(t, t.Condition, logSiteEvent: false); } _snapshotBuffer.Clear(); @@ -277,8 +302,16 @@ public class NativeAlarmActor : ReceiveActor } } - /// Builds and tells the parent an enriched for a condition. - private void Emit(NativeAlarmTransition t, AlarmConditionState condition) + /// + /// Builds and tells the parent an enriched for a condition. + /// + /// The mirrored transition. + /// The condition state to surface (may differ from 's + /// own condition, e.g. a synthesised return-to-normal on snapshot swap). + /// M1.5: when true (live + snapshot transitions), emit an + /// alarm operational event. Suppressed for SQLite rehydration so a node restart does not + /// re-log every last-known condition. + private void Emit(NativeAlarmTransition t, AlarmConditionState condition, bool logSiteEvent = true) { var change = new AlarmStateChanged( _instanceName, @@ -301,6 +334,49 @@ public class NativeAlarmActor : ReceiveActor }; _instanceActor.Tell(change); + + if (logSiteEvent) + { + LogAlarmEvent(t, condition); + } + } + + /// + /// M1.5: fire-and-forget an alarm operational event mirroring a native + /// condition transition. An active condition is a raise (severity by the + /// condition's severity); an inactive condition is a return-to-normal; an + /// acknowledge transition is informational. 
Resolved optionally and never + /// awaited so a logging failure cannot affect the mirror (matching the + /// established ScriptActor/ScriptExecutionActor pattern). + /// + private void LogAlarmEvent(NativeAlarmTransition t, AlarmConditionState condition) + { + var logger = _serviceProvider?.GetService(); + if (logger == null) + { + return; + } + + string severity; + string message; + if (t.Kind == AlarmTransitionKind.Acknowledge) + { + severity = "Info"; + message = $"Native alarm {t.SourceReference} acknowledged"; + } + else if (condition.Active) + { + severity = condition.Severity >= ErrorSeverityThreshold ? "Error" : "Warning"; + message = $"Native alarm {t.SourceReference} active (severity {condition.Severity})"; + } + else + { + severity = "Info"; + message = $"Native alarm {t.SourceReference} returned to normal"; + } + + _ = logger.LogEventAsync( + "alarm", severity, _instanceName, $"NativeAlarmActor:{_source.CanonicalName}", message); } private void PersistUpsert(NativeAlarmTransition t) diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs index 77d2e58f..22cc6034 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/ScriptExecutionActor.cs @@ -217,6 +217,13 @@ public class ScriptExecutionActor : ReceiveActor Scope = scope }; + // M1.8: operational `script` event — execution started. Fire-and-forget + // (the `_ =` discards the task) so the event log can never block or + // fault the script's own run; mirrors the existing Error-path emit. 
+ _ = siteEventLogger?.LogEventAsync( + "script", "Info", instanceName, $"ScriptActor:{scriptName}", + $"Script '{scriptName}' on instance '{instanceName}' started"); + var state = await compiledScript.RunAsync(globals, cts.Token); // Send result to requester if this was an Ask-based call @@ -225,6 +232,11 @@ public class ScriptExecutionActor : ReceiveActor replyTo.Tell(new ScriptCallResult(correlationId, true, state.ReturnValue, null)); } + // M1.8: operational `script` event — execution completed successfully. + _ = siteEventLogger?.LogEventAsync( + "script", "Info", instanceName, $"ScriptActor:{scriptName}", + $"Script '{scriptName}' on instance '{instanceName}' completed"); + // Notify parent of completion parent.Tell(new ScriptActor.ScriptExecutionCompleted(scriptName, true, null)); } diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs index 8cec600b..c48964b9 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/OperationTrackingStore.cs @@ -91,6 +91,8 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable, ); CREATE INDEX IF NOT EXISTS IX_OperationTracking_Status_Updated ON OperationTracking (Status, UpdatedAtUtc); + CREATE INDEX IF NOT EXISTS IX_OperationTracking_UpdatedAt + ON OperationTracking (UpdatedAtUtc); """; cmd.ExecuteNonQuery(); @@ -360,6 +362,84 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable, } } + /// + public async Task> ReadChangedSinceAsync( + DateTime sinceUtc, + int batchSize, + CancellationToken ct = default) + { + ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposeState) != 0, this); + + // SiteRuntime-024: like GetStatusAsync, the reconciliation pull opens a + // fresh, ungated read connection so a long-running write never blocks + // central's PullSiteCalls. 
The query is a bounded, ordered scan served by + // the standalone IX_OperationTracking_UpdatedAt index — UpdatedAtUtc is + // the cursor. (The composite (Status, UpdatedAtUtc) index cannot satisfy a + // status-less UpdatedAtUtc range scan; this dedicated index does.) + await using var readConnection = new SqliteConnection(_connectionString); + await readConnection.OpenAsync(ct).ConfigureAwait(false); + + await using var cmd = readConnection.CreateCommand(); + // Inclusive lower bound on UpdatedAtUtc (>=) so a caller resuming from + // the last returned timestamp does not skip a row sharing that instant; + // central ingest is insert-if-not-exists + upsert-on-newer, so the + // boundary row re-read is a no-op. ORDER BY ... ASC + LIMIT yields the + // OLDEST matching rows so the cursor advances monotonically. + cmd.CommandText = """ + SELECT TrackedOperationId, Kind, TargetSummary, Status, + RetryCount, LastError, HttpStatus, + CreatedAtUtc, UpdatedAtUtc, TerminalAtUtc, SourceNode + FROM OperationTracking + WHERE UpdatedAtUtc >= $since + ORDER BY UpdatedAtUtc ASC + LIMIT $batchSize; + """; + // Force UTC kind before formatting so the cursor's "o" text matches the + // 'Z'-suffixed round-trip form the write path persists (DateTime.UtcNow + // .ToString("o")). A first-cycle DateTime.MinValue arrives Unspecified — + // without this its "o" rendering would lack the 'Z', and the SQLite text + // compare against 'Z'-suffixed stored values would be subtly inconsistent. 
+ var sinceText = DateTime + .SpecifyKind(sinceUtc, DateTimeKind.Utc) + .ToString("o", CultureInfo.InvariantCulture); + cmd.Parameters.AddWithValue("$since", sinceText); + cmd.Parameters.AddWithValue("$batchSize", batchSize); + + var rows = new List(); + await using var reader = await cmd.ExecuteReaderAsync(ct).ConfigureAwait(false); + while (await reader.ReadAsync(ct).ConfigureAwait(false)) + { + var kind = reader.GetString(1); + rows.Add(new SiteCallOperational( + TrackedOperationId: TrackedOperationId.Parse(reader.GetString(0)), + Channel: KindToChannel(kind), + Target: reader.IsDBNull(2) ? string.Empty : reader.GetString(2), + // The site id is not a tracking-store column; the central client + // re-stamps SourceSite from the siteId it dialed. + SourceSite: string.Empty, + SourceNode: reader.IsDBNull(10) ? null : reader.GetString(10), + Status: reader.GetString(3), + RetryCount: reader.GetInt32(4), + LastError: reader.IsDBNull(5) ? null : reader.GetString(5), + HttpStatus: reader.IsDBNull(6) ? null : reader.GetInt32(6), + CreatedAtUtc: ParseUtc(reader.GetString(7)), + UpdatedAtUtc: ParseUtc(reader.GetString(8)), + TerminalAtUtc: reader.IsDBNull(9) ? null : ParseUtc(reader.GetString(9)))); + } + + return rows; + } + + // Cached-call Kind → SiteCalls Channel. Only ApiCallCached / DbWriteCached + // ever reach the tracking store (RecordEnqueueAsync is the cached-call + // entry point); DbWriteCached maps to DbOutbound, everything else to the + // ApiOutbound default. Mirrors CachedCallLifecycleBridge's channel handling. 
+ private static string KindToChannel(string kind) => kind switch + { + nameof(Commons.Types.Enums.AuditKind.DbWriteCached) => nameof(Commons.Types.Enums.AuditChannel.DbOutbound), + _ => nameof(Commons.Types.Enums.AuditChannel.ApiOutbound), + }; + private static DateTime ParseUtc(string raw) { return DateTime.Parse( diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs index e0cf2a46..163a0cb8 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ServiceCollectionExtensions.cs @@ -2,6 +2,7 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; namespace ZB.MOM.WW.ScadaBridge.StoreAndForward; @@ -49,13 +50,19 @@ public static class ServiceCollectionExtensions // observable in the central audit log instead of producing a // silent empty-string SourceSite. var siteId = siteContext?.SiteId ?? string.Empty; + // M1.7: optional site operational-event log. Resolved through + // GetService so a host (or test) that has not called + // AddSiteEventLogging simply gets null and the S&F activity stays + // a no-op for site-event purposes. 
+ var siteEventLogger = sp.GetService(); return new StoreAndForwardService( storage, options, logger, replication, cachedCallObserver, - siteId); + siteId, + siteEventLogger); }); services.AddSingleton(sp => diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs index 6b082579..b1c3b9e2 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs @@ -3,6 +3,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services; using ZB.MOM.WW.ScadaBridge.Commons.Observability; using ZB.MOM.WW.ScadaBridge.Commons.Types; using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; namespace ZB.MOM.WW.ScadaBridge.StoreAndForward; @@ -44,6 +45,15 @@ public class StoreAndForwardService /// private readonly ICachedCallLifecycleObserver? _cachedCallObserver; /// + /// M1.7: optional site operational-event log. When non-null the service maps + /// its own buffer/retry/park activity (the same activity that drives + /// ) onto site events — store_and_forward for the + /// cached-call categories and notification for the site's + /// forward-to-central notification path. Best-effort and fire-and-forget so a + /// failing logger never affects delivery bookkeeping. + /// + private readonly ISiteEventLogger? _siteEventLogger; + /// /// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the /// cached-call attempt context so the audit bridge can build the /// half of the telemetry packet. @@ -72,6 +82,18 @@ public class StoreAndForwardService /// recognisable instead of an unattributable empty string. /// public const string UnknownSiteSentinel = "$unknown-site"; + + /// + /// M1.7: the detail-string prefix written by + /// when an immediate forward attempt throws and the message is buffered for + /// the retry sweep. 
matches on this same prefix + /// to distinguish a forward failure (logged) from a routine + /// no-handler enqueue (not logged), so both the construction site and the + /// check reference this single constant rather than duplicating the + /// literal — keeping the two ends from drifting apart. + /// + private const string BufferedForRetryDetailPrefix = "Buffered for retry"; + private Timer? _retryTimer; private int _retryInProgress; @@ -173,13 +195,20 @@ public class StoreAndForwardService /// Optional replication service for standby synchronization. /// Optional observer for cached call lifecycle events. /// The site identifier this service belongs to. + /// + /// M1.7: optional site operational-event log. When non-null, buffer/retry/park + /// activity is mirrored to site events (store_and_forward / + /// notification by category). Optional with a null default so the + /// many direct-construction tests still compile unchanged. + /// public StoreAndForwardService( StoreAndForwardStorage storage, StoreAndForwardOptions options, ILogger logger, ReplicationService? replication = null, ICachedCallLifecycleObserver? cachedCallObserver = null, - string siteId = "") + string siteId = "", + ISiteEventLogger? siteEventLogger = null) { _storage = storage; _options = options; @@ -191,6 +220,92 @@ public class StoreAndForwardService // audit pipeline keying off SourceSite) never see an empty string and // a misconfigured host is recognisable in the central log. _siteId = string.IsNullOrWhiteSpace(siteId) ? UnknownSiteSentinel : siteId; + _siteEventLogger = siteEventLogger; + + // M1.7: ride the existing activity hook to emit site operational events. + // RaiseActivity already isolates a throwing subscriber, so a failing + // event log can never be misclassified as a transient delivery failure + // (StoreAndForward-009). Only subscribe when a logger is wired so the + // legacy (test/central) construction path stays a no-op. 
+ if (_siteEventLogger != null) + { + OnActivity += EmitSiteEvent; + } + } + + /// + /// M1.7: maps one store-and-forward activity to a site operational event, + /// following the Site Event Logging spec's per-category scope + /// (Component-SiteEventLogging.md §"Events Logged"): + /// + /// Cached-call categories + /// ( / + /// ) log under + /// store_and_forward for queued / retried / parked / retry-delivered + /// activity. + /// The site's notification forward-to-central path + /// () logs under + /// notification ONLY on a forward FAILURE (buffered after the + /// immediate forward threw) or a park (long-buffered / retries exhausted). + /// Routine enqueue and forward-success are deliberately NOT logged — central's + /// Notifications table is the record of audit; the site only fills the + /// in-transit blind spot when central is unreachable. + /// + /// A successful immediate cached-call Delivered is the normal hot path and + /// is not logged. + /// + private void EmitSiteEvent(string action, StoreAndForwardCategory category, string detail) + { + var logger = _siteEventLogger; + if (logger == null) + { + return; + } + + // An immediate-delivery success is the normal hot path, not an + // operational event. A retry-loop success (detail "Delivered to … after + // N retries") IS logged for cached calls — it records a recovery. + if (action == "Delivered" && detail.StartsWith("Immediate", StringComparison.Ordinal)) + { + return; + } + + if (category == StoreAndForwardCategory.Notification) + { + // Spec: log only forward-failure (the immediate forward threw and the + // notification was buffered for retry — detail prefixed + // BufferedForRetryDetailPrefix) and park. A routine "No handler + // registered, buffered" enqueue and a forward-success "Delivered" + // are deliberately NOT logged. 
+ var isForwardFailure = action == "Queued" + && detail.StartsWith(BufferedForRetryDetailPrefix, StringComparison.Ordinal); + if (!isForwardFailure && action != "Parked") + { + return; + } + + var notifSeverity = action == "Parked" ? "Error" : "Warning"; + _ = logger.LogEventAsync( + "notification", notifSeverity, instanceId: null, + source: "StoreAndForwardService", + message: $"Notification {action.ToLowerInvariant()}: {detail}"); + return; + } + + // Cached-call categories: queued / retried / parked / retry-delivered. + // Severity: parking is an Error (delivery abandoned for retry purposes); + // queue/retry/requeue are Warning; a retry-loop Delivered is Info. + var severity = action switch + { + "Parked" => "Error", + "Delivered" => "Info", + _ => "Warning", + }; + + _ = logger.LogEventAsync( + "store_and_forward", severity, instanceId: null, + source: "StoreAndForwardService", + message: $"Operation {action.ToLowerInvariant()}: {detail}"); } /// @@ -434,7 +549,7 @@ public class StoreAndForwardService message.LastError = ex.Message; await BufferAsync(message); - RaiseActivity("Queued", category, $"Buffered for retry: {target} ({ex.Message})"); + RaiseActivity("Queued", category, $"{BufferedForRetryDetailPrefix}: {target} ({ex.Message})"); return new StoreAndForwardResult(true, message.Id, true); } } @@ -451,7 +566,7 @@ public class StoreAndForwardService await BufferAsync(message); RaiseActivity("Queued", category, attemptImmediateDelivery ? 
$"No handler registered, buffered: {target}" - : $"Buffered for retry: {target}"); + : $"{BufferedForRetryDetailPrefix}: {target}"); return new StoreAndForwardResult(true, message.Id, true); } diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj index c997e571..4f568eb4 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/ZB.MOM.WW.ScadaBridge.StoreAndForward.csproj @@ -17,6 +17,7 @@ + diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs new file mode 100644 index 00000000..7664b36c --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullAuditEventsClientTests.cs @@ -0,0 +1,215 @@ +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.Audit; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Communication.Grpc; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; +using Google.Protobuf.WellKnownTypes; +using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsRequest; +using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullAuditEventsResponse; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Tests.Central; + +/// +/// Bundle (M6) tests for — the +/// production that dials a site over gRPC +/// and issues the PullAuditEvents unary RPC for the reconciliation loop. +/// The real GrpcChannel is replaced by an injected +/// seam so the +/// client's mapping / ordering / fault-swallowing behaviour can be asserted +/// without standing up a Kestrel HTTP/2 endpoint. 
+/// +public class GrpcPullAuditEventsClientTests +{ + private static readonly DateTime BaseTime = + new(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + + /// Static enumerator returning a fixed site→endpoint map. + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + /// + /// Test invoker: records the endpoint + request it was asked to dial, then + /// returns a scripted proto response (or throws a scripted exception so the + /// fault-swallowing path can be exercised). + /// + private sealed class FakeInvoker : GrpcPullAuditEventsClient.IPullAuditEventsInvoker + { + public string? Endpoint { get; private set; } + public ProtoPullRequest? Request { get; private set; } + public int CallCount { get; private set; } + + private readonly ProtoPullResponse? _response; + private readonly Exception? _throw; + + private FakeInvoker(ProtoPullResponse? response, Exception? 
toThrow) + { + _response = response; + _throw = toThrow; + } + + public static FakeInvoker Returning(ProtoPullResponse response) => new(response, null); + public static FakeInvoker Throwing(Exception ex) => new(null, ex); + + public Task InvokeAsync( + string endpoint, ProtoPullRequest request, CancellationToken ct) + { + CallCount++; + Endpoint = endpoint; + Request = request; + if (_throw is not null) + { + throw _throw; + } + return Task.FromResult(_response!); + } + } + + private static AuditEventDto Dto(Guid id, DateTime occurredAtUtc) => + AuditEventDtoMapper.ToDto(ScadaBridgeAuditEventFactory.Create( + eventId: id, + occurredAtUtc: occurredAtUtc, + channel: AuditChannel.ApiOutbound, + kind: AuditKind.ApiCall, + status: AuditStatus.Delivered, + sourceSiteId: "site-a")); + + [Fact] + public async Task PullAsync_dials_the_resolved_endpoint_and_maps_events_oldest_first() + { + var older = Guid.NewGuid(); + var newer = Guid.NewGuid(); + + // Wire is delivered newest-first on purpose to prove the client sorts. + var proto = new ProtoPullResponse { MoreAvailable = true }; + proto.Events.Add(Dto(newer, BaseTime.AddMinutes(5))); + proto.Events.Add(Dto(older, BaseTime)); + + var invoker = FakeInvoker.Returning(proto); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + // Endpoint resolution + request shaping. + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + Assert.Equal(256, invoker.Request!.BatchSize); + Assert.Equal(BaseTime, invoker.Request.SinceUtc.ToDateTime()); + + // Mapping + ordering + MoreAvailable surface. 
+ Assert.True(result.MoreAvailable); + Assert.Equal(2, result.Events.Count); + Assert.Equal(older, result.Events[0].EventId); + Assert.Equal(newer, result.Events[1].EventId); + } + + [Fact] + public async Task PullAsync_returns_empty_when_site_endpoint_is_unknown() + { + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(), // no sites registered + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + Assert.Equal(0, invoker.CallCount); // never dialled — nothing to dial + } + + [Theory] + [InlineData(StatusCode.Unavailable)] // connection refused / site offline + [InlineData(StatusCode.DeadlineExceeded)] // slow site / network blip + [InlineData(StatusCode.Cancelled)] + public async Task PullAsync_swallows_tolerable_transport_faults_to_empty_response(StatusCode code) + { + var invoker = FakeInvoker.Throwing(new RpcException(new Status(code, "transport fault"))); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + // MUST NOT throw — per the IPullAuditEventsClient contract. + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_swallows_connection_layer_faults_to_empty_response() + { + // A bare HttpRequestException (e.g. DNS / refused socket before a gRPC + // status is established) is also tolerable. 
+ var invoker = FakeInvoker.Throwing(new HttpRequestException("connection refused")); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_swallows_unexpected_faults_to_empty_response() + { + // I3(a): the catch-all path. A non-transport fault (e.g. a mapping/ + // protocol error surfacing as InvalidOperationException) must still be + // swallowed to empty — audit reconciliation is best-effort and a throw + // would only get re-caught by the actor's per-site guard. + var invoker = FakeInvoker.Throwing(new InvalidOperationException("boom")); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_with_minvalue_unspecified_cursor_does_not_throw_and_dials() + { + // I3(b) / guards I2: the reconciliation cursor starts at DateTime.MinValue + // with Kind=Unspecified. EnsureUtc must treat it AS UTC (per the system-wide + // "all timestamps are UTC" invariant) and NOT call ToUniversalTime() — on a + // host with a positive UTC offset that underflows and Timestamp.FromDateTime + // throws ArgumentOutOfRangeException, crashing the FIRST pull for every site. 
+ var minUnspecified = default(DateTime); // DateTime.MinValue, Kind=Unspecified + Assert.Equal(DateTimeKind.Unspecified, minUnspecified.Kind); + + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullAuditEventsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + // MUST NOT throw — must dial successfully. + var result = await sut.PullAsync("site-a", minUnspecified, batchSize: 256, CancellationToken.None); + + Assert.Equal(1, invoker.CallCount); + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + // The unspecified-MinValue cursor is carried through verbatim as UTC + // MinValue (no local-TZ conversion). + Assert.Equal(DateTime.MinValue, invoker.Request!.SinceUtc.ToDateTime()); + Assert.Empty(result.Events); + Assert.False(result.MoreAvailable); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs new file mode 100644 index 00000000..650b4a15 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/GrpcPullSiteCallsClientTests.cs @@ -0,0 +1,251 @@ +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Communication.Grpc; +using ProtoPullRequest = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsRequest; +using ProtoPullResponse = ZB.MOM.WW.ScadaBridge.Communication.Grpc.PullSiteCallsResponse; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Tests.Central; + +/// +/// Tests for — the production +/// that dials a site over gRPC and issues the +/// PullSiteCalls unary RPC for the Site Call Audit (#22) reconciliation +/// loop. 
The real GrpcChannel is replaced by an injected +/// seam so the +/// client's mapping / ordering / SourceSite-restamp / fault-swallowing behaviour +/// can be asserted without standing up a Kestrel HTTP/2 endpoint. Mirrors +/// . +/// +public class GrpcPullSiteCallsClientTests +{ + private static readonly DateTime BaseTime = + new(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + private sealed class FakeInvoker : GrpcPullSiteCallsClient.IPullSiteCallsInvoker + { + public string? Endpoint { get; private set; } + public ProtoPullRequest? Request { get; private set; } + public int CallCount { get; private set; } + + private readonly ProtoPullResponse? _response; + private readonly Exception? _throw; + + private FakeInvoker(ProtoPullResponse? response, Exception? toThrow) + { + _response = response; + _throw = toThrow; + } + + public static FakeInvoker Returning(ProtoPullResponse response) => new(response, null); + public static FakeInvoker Throwing(Exception ex) => new(null, ex); + + public Task InvokeAsync( + string endpoint, ProtoPullRequest request, CancellationToken ct) + { + CallCount++; + Endpoint = endpoint; + Request = request; + if (_throw is not null) + { + throw _throw; + } + return Task.FromResult(_response!); + } + } + + // The site leaves SourceSite empty (it is not a tracking-store column); the + // client re-stamps it from the dialed siteId. Mint DTOs with empty SourceSite + // to prove that re-stamp. 
+ private static SiteCallOperationalDto Dto(Guid id, DateTime updatedAtUtc) => + new() + { + TrackedOperationId = id.ToString(), + Channel = "ApiOutbound", + Target = "ERP.GetOrder", + SourceSite = string.Empty, + SourceNode = "node-a", + Status = "Attempted", + RetryCount = 1, + LastError = string.Empty, + CreatedAtUtc = Timestamp.FromDateTime(BaseTime), + UpdatedAtUtc = Timestamp.FromDateTime(updatedAtUtc), + }; + + [Fact] + public async Task PullAsync_dials_resolved_endpoint_maps_oldest_first_and_restamps_source_site() + { + var older = Guid.NewGuid(); + var newer = Guid.NewGuid(); + + // Wire delivered newest-first on purpose to prove the client sorts. + var proto = new ProtoPullResponse { MoreAvailable = true }; + proto.Operationals.Add(Dto(newer, BaseTime.AddMinutes(5))); + proto.Operationals.Add(Dto(older, BaseTime)); + + var invoker = FakeInvoker.Returning(proto); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + // Endpoint resolution + request shaping. + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + Assert.Equal(256, invoker.Request!.BatchSize); + Assert.Equal(BaseTime, invoker.Request.SinceUtc.ToDateTime()); + + // Mapping + ordering + MoreAvailable surface. + Assert.True(result.MoreAvailable); + Assert.Equal(2, result.SiteCalls.Count); + Assert.Equal(older, result.SiteCalls[0].TrackedOperationId.Value); + Assert.Equal(newer, result.SiteCalls[1].TrackedOperationId.Value); + + // SourceSite re-stamped from the dialed siteId (DTO carried empty). + Assert.Equal("site-a", result.SiteCalls[0].SourceSite); + Assert.Equal("site-a", result.SiteCalls[1].SourceSite); + + // Round-tripped fields survive FromDto. 
+ Assert.Equal("ApiOutbound", result.SiteCalls[0].Channel); + Assert.Equal("node-a", result.SiteCalls[0].SourceNode); + Assert.Equal(1, result.SiteCalls[0].RetryCount); + } + + [Fact] + public async Task PullAsync_returns_empty_when_site_endpoint_is_unknown() + { + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(), // no sites registered + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + Assert.Equal(0, invoker.CallCount); // never dialled — nothing to dial + } + + [Theory] + [InlineData(StatusCode.Unavailable)] + [InlineData(StatusCode.DeadlineExceeded)] + [InlineData(StatusCode.Cancelled)] + public async Task PullAsync_swallows_tolerable_transport_faults_to_empty_response(StatusCode code) + { + var invoker = FakeInvoker.Throwing(new RpcException(new Status(code, "transport fault"))); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_swallows_connection_layer_faults_to_empty_response() + { + var invoker = FakeInvoker.Throwing(new HttpRequestException("connection refused")); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_swallows_unexpected_faults_to_empty_response() + { + var invoker = FakeInvoker.Throwing(new 
InvalidOperationException("boom")); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_skips_poison_row_and_returns_the_good_rows() + { + // Poison-row resilience: one malformed operational (an unparseable + // TrackedOperationId fails SiteCallDtoMapper.FromDto → Guid.Parse) must be + // skipped+logged PER ROW rather than sinking the whole batch through the + // outer catch-all. The two good rows survive, re-stamped + oldest-first. + var older = Guid.NewGuid(); + var newer = Guid.NewGuid(); + + var proto = new ProtoPullResponse { MoreAvailable = false }; + proto.Operationals.Add(Dto(newer, BaseTime.AddMinutes(5))); + // Malformed row in the middle of the batch. + var bad = Dto(Guid.NewGuid(), BaseTime.AddMinutes(2)); + bad.TrackedOperationId = "not-a-guid"; + proto.Operationals.Add(bad); + proto.Operationals.Add(Dto(older, BaseTime)); + + var invoker = FakeInvoker.Returning(proto); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + // Must NOT throw — the bad row is dropped, the good rows are returned. + var result = await sut.PullAsync("site-a", BaseTime, batchSize: 256, CancellationToken.None); + + Assert.Equal(2, result.SiteCalls.Count); + // Survivors are oldest-first and SourceSite re-stamped from the dialed siteId. 
+ Assert.Equal(older, result.SiteCalls[0].TrackedOperationId.Value); + Assert.Equal(newer, result.SiteCalls[1].TrackedOperationId.Value); + Assert.Equal("site-a", result.SiteCalls[0].SourceSite); + Assert.Equal("site-a", result.SiteCalls[1].SourceSite); + Assert.False(result.MoreAvailable); + } + + [Fact] + public async Task PullAsync_with_minvalue_unspecified_cursor_does_not_throw_and_dials() + { + // The reconciliation cursor starts at DateTime.MinValue with + // Kind=Unspecified. EnsureUtc must treat it AS UTC (per the system-wide + // invariant) and NOT call ToUniversalTime() — on a host with a positive + // UTC offset that underflows and Timestamp.FromDateTime throws, crashing + // the FIRST pull for every site. + var minUnspecified = default(DateTime); + Assert.Equal(DateTimeKind.Unspecified, minUnspecified.Kind); + + var invoker = FakeInvoker.Returning(new ProtoPullResponse()); + var sut = new GrpcPullSiteCallsClient( + new StaticEnumerator(new SiteEntry("site-a", "http://site-a:8083")), + invoker, + NullLogger.Instance); + + var result = await sut.PullAsync("site-a", minUnspecified, batchSize: 256, CancellationToken.None); + + Assert.Equal(1, invoker.CallCount); + Assert.Equal("http://site-a:8083", invoker.Endpoint); + Assert.NotNull(invoker.Request); + Assert.Equal(DateTime.MinValue, invoker.Request!.SinceUtc.ToDateTime()); + Assert.Empty(result.SiteCalls); + Assert.False(result.MoreAvailable); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/SiteEnumeratorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/SiteEnumeratorTests.cs new file mode 100644 index 00000000..d5a8951b --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/SiteEnumeratorTests.cs @@ -0,0 +1,91 @@ +using Microsoft.Extensions.DependencyInjection; +using NSubstitute; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; +using SiteEntity = 
ZB.MOM.WW.ScadaBridge.Commons.Entities.Sites.Site; + +namespace ZB.MOM.WW.ScadaBridge.AuditLog.Tests.Central; + +/// +/// Unit tests for the production — the central +/// reconciliation-singleton collaborator that projects the config-DB +/// rows into the targets the +/// polls. +/// +/// +/// The enumerator opens a fresh DI scope per +/// call (mirroring the per-tick scope pattern in the reconciliation actor) +/// because is a SCOPED EF Core service. The tests +/// register a substituted repository as a scoped service so the enumerator's +/// CreateAsyncScope resolves it and the projection / blank-address +/// filtering can be exercised without an MSSQL container. +/// +public class SiteEnumeratorTests +{ + private static SiteEntity SiteWith(string identifier, string? grpcNodeA, string? grpcNodeB = null) + { + var site = new SiteEntity($"Display {identifier}", identifier) + { + GrpcNodeAAddress = grpcNodeA, + GrpcNodeBAddress = grpcNodeB, + }; + return site; + } + + private static IServiceProvider BuildProvider(ISiteRepository repository) + { + var services = new ServiceCollection(); + // Scoped to match the production lifetime (EF Core); the enumerator + // must open a scope to resolve it. 
+ services.AddScoped(_ => repository); + return services.BuildServiceProvider(); + } + + [Fact] + public async Task EnumerateAsync_ProjectsSitesWithNodeAAddress_AndSkipsBlankOnes() + { + var repository = Substitute.For(); + repository.GetAllSitesAsync(Arg.Any()).Returns(new List + { + SiteWith("site-a", "http://site-a:8083"), + SiteWith("site-b", grpcNodeA: " "), // blank NodeA -> skipped + }); + + var enumerator = new SiteEnumerator(BuildProvider(repository)); + + var result = await enumerator.EnumerateAsync(); + + var entry = Assert.Single(result); + Assert.Equal("site-a", entry.SiteId); + Assert.Equal("http://site-a:8083", entry.GrpcEndpoint); + } + + [Fact] + public async Task EnumerateAsync_SkipsNullNodeAAddress() + { + var repository = Substitute.For(); + repository.GetAllSitesAsync(Arg.Any()).Returns(new List + { + SiteWith("site-null", grpcNodeA: null), + }); + + var enumerator = new SiteEnumerator(BuildProvider(repository)); + + var result = await enumerator.EnumerateAsync(); + + Assert.Empty(result); + } + + [Fact] + public async Task EnumerateAsync_ReturnsEmpty_WhenNoSites() + { + var repository = Substitute.For(); + repository.GetAllSitesAsync(Arg.Any()).Returns(new List()); + + var enumerator = new SiteEnumerator(BuildProvider(repository)); + + var result = await enumerator.EnumerateAsync(); + + Assert.Empty(result); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs new file mode 100644 index 00000000..45104a84 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.Communication.Tests/SiteStreamPullSiteCallsTests.cs @@ -0,0 +1,221 @@ +using Akka.TestKit.Xunit2; +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using NSubstitute; +using NSubstitute.ExceptionExtensions; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces; +using ZB.MOM.WW.ScadaBridge.Commons.Types; +using 
ZB.MOM.WW.ScadaBridge.Communication.Grpc; + +namespace ZB.MOM.WW.ScadaBridge.Communication.Tests; + +/// +/// Tests for (Site Call Audit +/// #22 reconciliation handler). Verifies the request → +/// → response +/// round-trip through the gRPC handler. The store is an NSubstitute stub so the +/// tests never touch SQLite. Mirrors +/// — but there is no MarkReconciled step (the tracking store is the operational +/// source of truth; the central SiteCalls mirror is upsert-on-newer). +/// +public class SiteStreamPullSiteCallsTests : TestKit +{ + private readonly ISiteStreamSubscriber _subscriber = Substitute.For(); + + private SiteStreamGrpcServer CreateServer() => + new(_subscriber, NullLogger.Instance); + + private static ServerCallContext NewContext(CancellationToken ct = default) + { + var context = Substitute.For(); + context.CancellationToken.Returns(ct); + return context; + } + + private static SiteCallOperational NewOperational() => + new( + TrackedOperationId: TrackedOperationId.New(), + Channel: "ApiOutbound", + Target: "ERP.GetOrder", + SourceSite: string.Empty, + SourceNode: "node-a", + Status: "Attempted", + RetryCount: 1, + LastError: null, + HttpStatus: 503, + CreatedAtUtc: DateTime.SpecifyKind(new DateTime(2026, 5, 20, 10, 0, 0), DateTimeKind.Utc), + UpdatedAtUtc: DateTime.SpecifyKind(new DateTime(2026, 5, 20, 10, 1, 0), DateTimeKind.Utc), + TerminalAtUtc: null); + + [Fact] + public async Task PullSiteCalls_NoStoreWired_ReturnsEmptyResponse() + { + var server = CreateServer(); + // Intentionally do NOT call SetOperationTrackingStore — simulates a + // central-only host or a wiring-incomplete startup window. 
+ + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddMinutes(-5)), + BatchSize = 100, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + } + + [Fact] + public async Task PullSiteCalls_With5Rows_ReturnsAllFiveDtos() + { + var store = Substitute.For(); + var rows = Enumerable.Range(0, 5).Select(_ => NewOperational()).ToList(); + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)rows); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, // larger than returned count so MoreAvailable should be false + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Equal(5, response.Operationals.Count); + Assert.False(response.MoreAvailable); // 5 < 100 + var expectedIds = rows.Select(r => r.TrackedOperationId.ToString()).ToHashSet(); + Assert.True(expectedIds.SetEquals(response.Operationals.Select(d => d.TrackedOperationId).ToHashSet())); + } + + [Fact] + public async Task PullSiteCalls_PassesSinceUtcThroughVerbatim() + { + var store = Substitute.For(); + var capturedSince = DateTime.MinValue; + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(call => + { + capturedSince = call.ArgAt(0); + return (IReadOnlyList)Array.Empty(); + }); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var since = DateTime.SpecifyKind(new DateTime(2026, 5, 20, 9, 30, 0), DateTimeKind.Utc); + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(since), + BatchSize = 50, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + Assert.Equal(since, 
capturedSince); + } + + [Fact] + public async Task PullSiteCalls_SinceUtcUnset_PassesDateTimeMinValue() + { + // First reconciliation cycle: central has no cursor yet, so the request's + // SinceUtc wrapper is absent (null). The handler must default to + // DateTime.MinValue ("pull from the beginning of recorded history") + // without a null-deref — this proves the very first cycle doesn't crash. + var store = Substitute.For(); + var captured = new DateTime(2099, 1, 1, 0, 0, 0, DateTimeKind.Utc); // sentinel + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(call => + { + captured = call.ArgAt(0); + return (IReadOnlyList)Array.Empty(); + }); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + // SinceUtc intentionally left unset (null) — the proto wrapper is absent. + var request = new PullSiteCallsRequest + { + BatchSize = 100, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + Assert.Equal(DateTime.MinValue, captured); + } + + [Fact] + public async Task PullSiteCalls_BatchSize3_Returns3Rows_MoreAvailableTrue() + { + var store = Substitute.For(); + var rows = Enumerable.Range(0, 3).Select(_ => NewOperational()).ToList(); + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)rows); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 3, + }; + + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Equal(3, response.Operationals.Count); + // saturated batch → central needs to know to issue a follow-up pull + Assert.True(response.MoreAvailable); + } + + [Fact] + public async Task PullSiteCalls_NonPositiveBatchSize_ThrowsInvalidArgument() + { + var store = Substitute.For(); + var server = CreateServer(); + 
server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 0, + }; + + var ex = await Assert.ThrowsAsync( + () => server.PullSiteCalls(request, NewContext())); + Assert.Equal(StatusCode.InvalidArgument, ex.StatusCode); + } + + [Fact] + public async Task PullSiteCalls_ReadThrows_ReturnsEmptyResponse() + { + // Best-effort: a read fault must never abort the reconciliation tick. + var store = Substitute.For(); + store.ReadChangedSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .ThrowsAsync(new InvalidOperationException("SQLite disposed mid-call")); + + var server = CreateServer(); + server.SetOperationTrackingStore(store); + + var request = new PullSiteCallsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, + }; + + // Must NOT throw — the handler swallows the fault to an empty response. + var response = await server.PullSiteCalls(request, NewContext()); + + Assert.Empty(response.Operationals); + Assert.False(response.MoreAvailable); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs index 83d32963..8d4bceb1 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/ActorPathTests.cs @@ -117,6 +117,22 @@ public class CentralActorPathTests : IAsyncLifetime public async Task CentralActors_NotificationOutboxProxy_Exists() => await AssertActorExists("/user/notification-outbox-proxy"); + [Fact] + public async Task CentralActors_AuditLogPurgeSingleton_Exists() + => await AssertActorExists("/user/audit-log-purge-singleton"); + + [Fact] + public async Task CentralActors_AuditLogPurgeProxy_Exists() + => await AssertActorExists("/user/audit-log-purge-proxy"); + + [Fact] + public async Task CentralActors_SiteAuditReconciliationSingleton_Exists() + => await 
AssertActorExists("/user/site-audit-reconciliation-singleton"); + + [Fact] + public async Task CentralActors_SiteAuditReconciliationProxy_Exists() + => await AssertActorExists("/user/site-audit-reconciliation-proxy"); + private async Task AssertActorExists(string path) { Assert.NotNull(_actorSystem); diff --git a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs index f0f101f7..7855d876 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.Host.Tests/AkkaHostedServiceAuditWiringTests.cs @@ -7,6 +7,7 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Options; using ZB.MOM.WW.ScadaBridge.AuditLog; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; using ZB.MOM.WW.ScadaBridge.AuditLog.Site; using ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry; using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure; @@ -238,6 +239,36 @@ public class CentralAuditWiringTests : IDisposable Assert.NotNull(forwarder); Assert.IsType(forwarder); } + + /// + /// I4 (review): the central composition root must register the production + /// reconciliation collaborators via + /// AddAuditLogCentralReconciliationClient. Asserting the concrete + /// implementations resolve here is a faster, clearer signal than a runtime + /// "actor not found" / cryptic GetRequiredService throw in + /// AkkaHostedService.RegisterCentralActors if that helper is ever + /// dropped from Program.cs. + /// + [Fact] + public void Central_Resolves_ISiteEnumerator_AsSiteEnumerator() + { + var enumerator = _factory.Services.GetService(); + Assert.NotNull(enumerator); + Assert.IsType(enumerator); + } + + /// + /// I4 (review): companion to + /// — the production gRPC pull client must resolve on the central composition + /// root so the SiteAuditReconciliationActor singleton can dial sites. 
+ /// + [Fact] + public void Central_Resolves_IPullAuditEventsClient_AsGrpcClient() + { + var client = _factory.Services.GetService(); + Assert.NotNull(client); + Assert.IsType(client); + } } /// diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs index e9e4d950..703b366c 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditOptionsTests.cs @@ -11,5 +11,108 @@ public class SiteCallAuditOptionsTests Assert.Equal(TimeSpan.FromMinutes(10), options.StuckAgeThreshold); // KPI interval mirrors NotificationOutboxOptions.DeliveredKpiWindow. Assert.Equal(TimeSpan.FromMinutes(1), options.KpiInterval); + + // Reconciliation tick cadence mirrors SiteAuditReconciliationOptions (#23). + Assert.Equal(TimeSpan.FromMinutes(5), options.ReconciliationInterval); + // Purge tick cadence mirrors AuditLogPurgeOptions. + Assert.Equal(TimeSpan.FromHours(24), options.PurgeInterval); + // Retention window mirrors the central audit-store retention policy. + Assert.Equal(365, options.RetentionDays); + } + + [Fact] + public void ResolvedReconciliationInterval_DefaultsToConfiguredValue() + { + var options = new SiteCallAuditOptions(); + + Assert.Equal(options.ReconciliationInterval, options.ResolvedReconciliationInterval); + } + + [Theory] + [InlineData(0)] + [InlineData(-5)] + public void ResolvedReconciliationInterval_ClampsZeroOrNegativeToMinimum(int configuredSeconds) + { + // A misconfigured 0 / negative interval must never resolve to TimeSpan.Zero + // (which would make Akka's ScheduleTellRepeatedlyCancelable spin). The + // documented floor is >= 1 second. 
+ var options = new SiteCallAuditOptions + { + ReconciliationInterval = TimeSpan.FromSeconds(configuredSeconds), + }; + + Assert.True( + options.ResolvedReconciliationInterval >= TimeSpan.FromSeconds(1), + $"expected the resolved interval to clamp to >= 1s, got {options.ResolvedReconciliationInterval}"); + Assert.Equal(TimeSpan.FromSeconds(1), options.ResolvedReconciliationInterval); + } + + [Fact] + public void ResolvedReconciliationInterval_OverrideBypassesClamp() + { + // The test-only override drops the cadence below the clamp floor so unit + // tests can run the tick at millisecond cadence. + var sub1Second = TimeSpan.FromMilliseconds(50); + var options = new SiteCallAuditOptions + { + ReconciliationInterval = TimeSpan.FromMinutes(5), + ReconciliationIntervalOverride = sub1Second, + }; + + Assert.Equal(sub1Second, options.ResolvedReconciliationInterval); + } + + [Fact] + public void ResolvedPurgeInterval_DefaultsToConfiguredValue() + { + var options = new SiteCallAuditOptions(); + + Assert.Equal(options.PurgeInterval, options.ResolvedPurgeInterval); + } + + [Theory] + [InlineData(0)] + [InlineData(-30)] + public void ResolvedPurgeInterval_ClampsZeroOrNegativeToMinimum(int configuredSeconds) + { + // A misconfigured 0 / negative purge interval clamps to the documented + // >= 1 minute floor (the purge is daily, so a more generous floor than + // the reconciliation tick). + var options = new SiteCallAuditOptions + { + PurgeInterval = TimeSpan.FromSeconds(configuredSeconds), + }; + + Assert.True( + options.ResolvedPurgeInterval >= TimeSpan.FromMinutes(1), + $"expected the resolved interval to clamp to >= 1min, got {options.ResolvedPurgeInterval}"); + Assert.Equal(TimeSpan.FromMinutes(1), options.ResolvedPurgeInterval); + } + + [Fact] + public void ResolvedPurgeInterval_BelowMinuteFloorClampsToMinimum() + { + // A positive-but-sub-minute config value still clamps to the 1-minute floor. 
+ var options = new SiteCallAuditOptions + { + PurgeInterval = TimeSpan.FromSeconds(5), + }; + + Assert.Equal(TimeSpan.FromMinutes(1), options.ResolvedPurgeInterval); + } + + [Fact] + public void ResolvedPurgeInterval_OverrideBypassesClamp() + { + // The test-only override drops the cadence below the clamp floor so unit + // tests can run the purge tick at millisecond cadence. + var subMinute = TimeSpan.FromMilliseconds(50); + var options = new SiteCallAuditOptions + { + PurgeInterval = TimeSpan.FromHours(24), + PurgeIntervalOverride = subMinute, + }; + + Assert.Equal(subMinute, options.ResolvedPurgeInterval); } } diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs new file mode 100644 index 00000000..6352ddec --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditPurgeTests.cs @@ -0,0 +1,175 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; +using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration; +using ZB.MOM.WW.ScadaBridge.Commons.Types; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit; + +namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests; + +/// +/// Purge-scheduler tests for (#22, Piece B). +/// Exercises the daily terminal-row purge tick in-memory — a recording +/// captures the +/// threshold the actor +/// computes, with no live MSSQL fixture. The reconciliation collaborators are +/// inert stubs (the purge tick doesn't use them, but they must be present to +/// arm the scheduler — both timers gate on the collaborators together). 
+/// +public class SiteCallAuditPurgeTests : TestKit +{ + private static SiteCallAuditOptions FastPurgeOptions(int retentionDays = 365) => new() + { + // Keep the reconciliation tick slow so it doesn't fight the purge tick + // for the test window; drop the purge tick to 100 ms via its override. + ReconciliationIntervalOverride = TimeSpan.FromMinutes(5), + PurgeIntervalOverride = TimeSpan.FromMilliseconds(100), + RetentionDays = retentionDays, + }; + + /// Empty enumerator — the purge path never touches it, but it must be present to arm the scheduler. + private sealed class EmptyEnumerator : ISiteEnumerator + { + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + /// No-op pull client — present only to arm the scheduler. + private sealed class NoOpPullClient : IPullSiteCallsClient + { + public Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) => + Task.FromResult(new PullSiteCallsResponse(Array.Empty(), MoreAvailable: false)); + } + + /// + /// Recording repository capturing every + /// threshold (and the configured deleted-row count it returns). 
+ /// + private sealed class RecordingRepo : ISiteCallAuditRepository + { + public List PurgeThresholds { get; } = new(); + public int RowsDeletedPerCall { get; set; } + + public Task PurgeTerminalAsync(DateTime olderThanUtc, CancellationToken ct = default) + { + PurgeThresholds.Add(olderThanUtc); + return Task.FromResult(RowsDeletedPerCall); + } + + public Task UpsertAsync(SiteCall siteCall, CancellationToken ct = default) => Task.CompletedTask; + + public Task GetAsync(TrackedOperationId id, CancellationToken ct = default) => + Task.FromResult(null); + + public Task> QueryAsync( + SiteCallQueryFilter filter, SiteCallPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + + public Task ComputeKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult(new SiteCallKpiSnapshot(0, 0, 0, 0, null, 0)); + + public Task> ComputePerSiteKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + /// Repository whose purge always throws — to prove continue-on-error keeps the singleton alive. 
+ private sealed class PurgeThrowingRepo : ISiteCallAuditRepository + { + public int PurgeCallCount; + + public Task PurgeTerminalAsync(DateTime olderThanUtc, CancellationToken ct = default) + { + Interlocked.Increment(ref PurgeCallCount); + throw new InvalidOperationException("simulated purge failure"); + } + + public Task UpsertAsync(SiteCall siteCall, CancellationToken ct = default) => Task.CompletedTask; + public Task GetAsync(TrackedOperationId id, CancellationToken ct = default) => Task.FromResult(null); + public Task> QueryAsync(SiteCallQueryFilter f, SiteCallPaging p, CancellationToken ct = default) => Task.FromResult>(Array.Empty()); + public Task ComputeKpisAsync(DateTime a, DateTime b, CancellationToken ct = default) => Task.FromResult(new SiteCallKpiSnapshot(0, 0, 0, 0, null, 0)); + public Task> ComputePerSiteKpisAsync(DateTime a, DateTime b, CancellationToken ct = default) => Task.FromResult>(Array.Empty()); + } + + private IActorRef CreateActor(ISiteCallAuditRepository repo, SiteCallAuditOptions options) => + Sys.ActorOf(Props.Create(() => new SiteCallAuditActor( + repo, + new EmptyEnumerator(), + new NoOpPullClient(), + NullLogger.Instance, + options))); + + // --------------------------------------------------------------------- + // 1. PurgeTick_CallsPurgeTerminal_WithRetentionThreshold + // --------------------------------------------------------------------- + + [Fact] + public void PurgeTick_CallsPurgeTerminalAsync_WithRetentionThreshold() + { + var repo = new RecordingRepo { RowsDeletedPerCall = 7 }; + // Non-default retention (30 days) so the assertion isn't accidentally + // satisfied by the 365-day default. 
+ CreateActor(repo, FastPurgeOptions(retentionDays: 30)); + + AwaitAssert( + () => Assert.True(repo.PurgeThresholds.Count >= 1, + $"expected >= 1 PurgeTerminalAsync call, got {repo.PurgeThresholds.Count}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + // The threshold the actor passed must be ~UtcNow - 30 days. 1-minute + // slack covers scheduling jitter between the tick firing and the assert. + var threshold = repo.PurgeThresholds[0]; + var expected = DateTime.UtcNow - TimeSpan.FromDays(30); + Assert.True( + Math.Abs((threshold - expected).TotalMinutes) < 1.0, + $"purge threshold {threshold:o} should be within 1 minute of {expected:o}"); + } + + // --------------------------------------------------------------------- + // 2. PurgeTick_UsesDefaultRetention_365Days + // --------------------------------------------------------------------- + + [Fact] + public void PurgeTick_DefaultRetention_Uses365DayThreshold() + { + var repo = new RecordingRepo(); + CreateActor(repo, FastPurgeOptions()); // default 365 days + + AwaitAssert( + () => Assert.True(repo.PurgeThresholds.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + var threshold = repo.PurgeThresholds[0]; + var expected = DateTime.UtcNow - TimeSpan.FromDays(365); + Assert.True( + Math.Abs((threshold - expected).TotalMinutes) < 1.0, + $"purge threshold {threshold:o} should be within 1 minute of {expected:o}"); + } + + // --------------------------------------------------------------------- + // 3. PurgeTick_RepoThrows_ActorStaysAlive_RetriesNextTick (continue-on-error) + // --------------------------------------------------------------------- + + [Fact] + public void PurgeTick_PurgeThrows_ActorStaysAlive_RetriesNextTick() + { + var repo = new PurgeThrowingRepo(); + CreateActor(repo, FastPurgeOptions()); + + // The singleton must NOT die on a purge fault — a second tick must still + // arrive (continue-on-error). 
Two purge calls prove the actor survived + // the first throw and the timer kept ticking. + AwaitAssert( + () => Assert.True(repo.PurgeCallCount >= 2, + $"expected >= 2 purge attempts (actor survived the throw), got {repo.PurgeCallCount}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs new file mode 100644 index 00000000..ac2f86b0 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests/SiteCallAuditReconciliationTests.cs @@ -0,0 +1,300 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.AuditLog.Central; +using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit; +using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; +using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration; +using ZB.MOM.WW.ScadaBridge.Commons.Types; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit; + +namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit.Tests; + +/// +/// Reconciliation-tick tests for (#22, Piece A). +/// These exercise the periodic per-site self-heal pull entirely in-memory — +/// fake + + a +/// recording — so they run in +/// milliseconds and do NOT depend on a live MSSQL fixture (unlike the +/// MSSQL-backed ). The actor is built via +/// the internal test ctor that injects all three collaborators; the +/// repo-only test ctor used by the MSSQL tests passes no client/enumerator, so +/// the reconciliation tick is gated off there (see +/// ). +/// +public class SiteCallAuditReconciliationTests : TestKit +{ + private static SiteCall NewRow( + TrackedOperationId id, + string sourceSite, + string status = "Submitted", + DateTime? updatedAtUtc = null) + { + var now = updatedAtUtc ?? 
DateTime.UtcNow; + return new SiteCall + { + TrackedOperationId = id, + Channel = "ApiOutbound", + Target = "ERP.GetOrder", + SourceSite = sourceSite, + SourceNode = null, + Status = status, + RetryCount = 0, + LastError = null, + HttpStatus = null, + CreatedAtUtc = now, + UpdatedAtUtc = now, + TerminalAtUtc = null, + IngestedAtUtc = now, + }; + } + + private static SiteCallAuditOptions FastTickOptions(int batchSize = 500) => new() + { + // 100 ms tick keeps each test under a second; AwaitAssert covers + // scheduler jitter so the tick has up to a few seconds to fire. + ReconciliationInterval = TimeSpan.FromMinutes(5), + ReconciliationIntervalOverride = TimeSpan.FromMilliseconds(100), + ReconciliationBatchSize = batchSize, + }; + + /// In-memory enumerator returning a static list of sites. + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + /// + /// Scripted pull client — returns the next queued response for the site on + /// each call (looping the last entry once exhausted) and records every + /// invocation so tests can assert call counts + the since cursor. 
+ /// + private sealed class ScriptedPullClient : IPullSiteCallsClient + { + public List<(string SiteId, DateTime SinceUtc, int BatchSize)> Calls { get; } = new(); + private readonly Dictionary> _scripted = new(); + private readonly Dictionary _throwOnSite = new(); + + public ScriptedPullClient Script(string siteId, params PullSiteCallsResponse[] responses) + { + _scripted[siteId] = new Queue(responses); + return this; + } + + public ScriptedPullClient ThrowFor(string siteId, Exception ex) + { + _throwOnSite[siteId] = ex; + return this; + } + + public Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) + { + Calls.Add((siteId, sinceUtc, batchSize)); + if (_throwOnSite.TryGetValue(siteId, out var ex)) + { + throw ex; + } + if (_scripted.TryGetValue(siteId, out var queue) && queue.Count > 0) + { + return Task.FromResult(queue.Dequeue()); + } + return Task.FromResult( + new PullSiteCallsResponse(Array.Empty(), MoreAvailable: false)); + } + } + + /// + /// Recording repository that captures every call + /// (keyed by id, last-write-wins on the captured row). The reconciliation + /// tick only ever calls ; the read/KPI members are + /// inert stubs. + /// + private sealed class RecordingRepo : ISiteCallAuditRepository + { + public Dictionary Upserted { get; } = new(); + public int UpsertCallCount { get; private set; } + + public Task UpsertAsync(SiteCall siteCall, CancellationToken ct = default) + { + UpsertCallCount++; + Upserted[siteCall.TrackedOperationId] = siteCall; + return Task.CompletedTask; + } + + public Task GetAsync(TrackedOperationId id, CancellationToken ct = default) => + Task.FromResult(Upserted.TryGetValue(id, out var row) ? 
row : null); + + public Task> QueryAsync( + SiteCallQueryFilter filter, SiteCallPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + + public Task PurgeTerminalAsync(DateTime olderThanUtc, CancellationToken ct = default) => + Task.FromResult(0); + + public Task ComputeKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult(new SiteCallKpiSnapshot(0, 0, 0, 0, null, 0)); + + public Task> ComputePerSiteKpisAsync( + DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + private IActorRef CreateActor( + ISiteEnumerator sites, + IPullSiteCallsClient client, + ISiteCallAuditRepository repo, + SiteCallAuditOptions options) => + Sys.ActorOf(Props.Create(() => new SiteCallAuditActor( + repo, + sites, + client, + NullLogger.Instance, + options))); + + // --------------------------------------------------------------------- + // 1. AbsentRow_PulledFromSite_IsUpserted + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_AbsentRow_IsUpsertedFromSitePull() + { + var siteId = "siteA"; + var id = TrackedOperationId.New(); + var row = NewRow(id, sourceSite: siteId, status: "Parked"); + + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://siteA:8083")); + var client = new ScriptedPullClient().Script(siteId, + new PullSiteCallsResponse(new[] { row }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert( + () => + { + Assert.True(repo.Upserted.ContainsKey(id), + "reconciliation tick should upsert the row present at the site but absent centrally"); + Assert.Equal("Parked", repo.Upserted[id].Status); + Assert.Equal(siteId, repo.Upserted[id].SourceSite); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // 
--------------------------------------------------------------------- + // 2. Cursor_Advances_ToMaxUpdatedAtUtc_NoRePullOfOldRows + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_SecondTick_AdvancesCursorPastAlreadyPulledRows() + { + var siteId = "siteA"; + var t1 = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var t2 = new DateTime(2026, 5, 20, 10, 1, 0, DateTimeKind.Utc); + var t3 = new DateTime(2026, 5, 20, 10, 2, 0, DateTimeKind.Utc); + var r1 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t1); + var r2 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t2); + var r3 = NewRow(TrackedOperationId.New(), siteId, updatedAtUtc: t3); + + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://siteA:8083")); + // First pull returns three rows (max UpdatedAtUtc = t3); subsequent + // pulls return empty. The second pull's `since` must be t3, proving the + // cursor advanced and old rows are not re-pulled from the start. + var client = new ScriptedPullClient().Script(siteId, + new PullSiteCallsResponse(new[] { r1, r2, r3 }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert( + () => Assert.True(client.Calls.Count >= 2, + $"need at least 2 pulls to assert cursor advancement, got {client.Calls.Count}"), + duration: TimeSpan.FromSeconds(5), + interval: TimeSpan.FromMilliseconds(50)); + + Assert.Equal(DateTime.MinValue, client.Calls[0].SinceUtc); + Assert.Equal(t3, client.Calls[1].SinceUtc); + // The batch size flows through from options. + Assert.Equal(500, client.Calls[0].BatchSize); + } + + // --------------------------------------------------------------------- + // 3. 
OneSiteThrows_OtherSitesStillProcessed (failure isolation) + // --------------------------------------------------------------------- + + [Fact] + public void ReconciliationTick_OneSiteThrows_OtherSitesStillReconciled() + { + var siteB = "siteB"; + var bId = TrackedOperationId.New(); + var bRow = NewRow(bId, sourceSite: siteB, status: "Delivered"); + + var sites = new StaticEnumerator( + new SiteEntry("siteA", "http://siteA:8083"), + new SiteEntry(siteB, "http://siteB:8083")); + var client = new ScriptedPullClient() + .ThrowFor("siteA", new InvalidOperationException("simulated transport failure")) + .Script(siteB, new PullSiteCallsResponse(new[] { bRow }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert( + () => + { + // siteA was attempted (and threw) yet siteB's row still landed — + // one offline site must not sink the rest of the tick. + Assert.Contains(client.Calls, c => c.SiteId == "siteA"); + Assert.True(repo.Upserted.ContainsKey(bId), + "siteB must be reconciled even though siteA threw"); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 4. RepoOnly test ctor does NOT start the reconciliation tick + // --------------------------------------------------------------------- + + [Fact] + public void TestCtor_RepositoryOnly_DoesNotStartReconciliationTick() + { + // The repo-only test ctor (used by the MSSQL-backed actor tests) injects + // no client/enumerator, so the tick must be gated OFF — otherwise those + // tests would fire phantom pulls. Build the actor via that ctor and + // confirm no pull ever happens. We can't observe a non-event directly, + // so we share a ScriptedPullClient with an isolated actor that DOES run + // the tick to bound the wait, then assert the repo-only actor's client + // (a separate instance) recorded nothing. 
+ var repo = new RecordingRepo(); + Sys.ActorOf(Props.Create(() => new SiteCallAuditActor( + repo, + NullLogger.Instance, + FastTickOptions()))); + + // Run a parallel actor with the full reconciliation ctor and a fast + // tick; once IT has pulled we know enough wall-clock elapsed that the + // repo-only actor would have ticked too, had it been wired. + var liveClient = new ScriptedPullClient(); + var liveRepo = new RecordingRepo(); + CreateActor( + new StaticEnumerator(new SiteEntry("siteX", "http://siteX:8083")), + liveClient, + liveRepo, + FastTickOptions()); + + AwaitAssert( + () => Assert.True(liveClient.Calls.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + // The repo-only actor never reconciles: it has no client to pull with, + // so it upserts nothing on its own. + Assert.Equal(0, repo.UpsertCallCount); + } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs index 150122a3..d7ee865d 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/AlarmActorTests.cs @@ -7,6 +7,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -877,6 +878,112 @@ public class AlarmActorTests : TestKit, IDisposable Assert.Equal(AlarmLevel.HighHigh, escalated.Level); } + // ── M1.5: site event log `alarm` category ────────────────────────────── + + [Fact] + public void AlarmActor_Raise_EmitsAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var sp = new SingleServiceProvider(siteLog); + + var alarmConfig = new ResolvedAlarm + { + CanonicalName = "HighTemp", 
+ TriggerType = "ValueMatch", + TriggerConfiguration = "{\"attributeName\":\"Status\",\"matchValue\":\"Critical\"}", + PriorityLevel = 800 + }; + + var instanceProbe = CreateTestProbe(); + var alarm = ActorOf(Props.Create(() => new AlarmActor( + "HighTemp", "Pump1", instanceProbe.Ref, alarmConfig, + null, _sharedLibrary, _options, + NullLogger.Instance, null, null, null, sp))); + + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Critical", "Good", DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + + // Background fire-and-forget; allow it to land. + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Single(rows); + var row = rows[0]; + Assert.Equal("Error", row.Severity); // priority 800 → Error + Assert.Equal("Pump1", row.InstanceId); + Assert.Equal("AlarmActor:HighTemp", row.Source); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void AlarmActor_RaiseLowPriority_EmitsWarningAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var sp = new SingleServiceProvider(siteLog); + + var alarmConfig = new ResolvedAlarm + { + CanonicalName = "MinorTemp", + TriggerType = "ValueMatch", + TriggerConfiguration = "{\"attributeName\":\"Status\",\"matchValue\":\"Warn\"}", + PriorityLevel = 100 + }; + + var instanceProbe = CreateTestProbe(); + var alarm = ActorOf(Props.Create(() => new AlarmActor( + "MinorTemp", "Pump1", instanceProbe.Ref, alarmConfig, + null, _sharedLibrary, _options, + NullLogger.Instance, null, null, null, sp))); + + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Warn", "Good", DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Single(rows); + Assert.Equal("Warning", rows[0].Severity); // priority 100 → Warning + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void AlarmActor_Clear_EmitsInfoAlarmSiteEvent() + { + var siteLog = new 
FakeSiteEventLogger(); + var sp = new SingleServiceProvider(siteLog); + + var alarmConfig = new ResolvedAlarm + { + CanonicalName = "HighTemp", + TriggerType = "ValueMatch", + TriggerConfiguration = "{\"attributeName\":\"Status\",\"matchValue\":\"Critical\"}", + PriorityLevel = 800 + }; + + var instanceProbe = CreateTestProbe(); + var alarm = ActorOf(Props.Create(() => new AlarmActor( + "HighTemp", "Pump1", instanceProbe.Ref, alarmConfig, + null, _sharedLibrary, _options, + NullLogger.Instance, null, null, null, sp))); + + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Critical", "Good", DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + alarm.Tell(new AttributeValueChanged( + "Pump1", "Status", "Status", "Normal", "Critical", DateTimeOffset.UtcNow)); + instanceProbe.ExpectMsg(TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Equal(2, rows.Count); // raise + clear + Assert.Equal("Error", rows[0].Severity); + Assert.Equal("Info", rows[1].Severity); // clear → Info + }, TimeSpan.FromSeconds(2)); + } + [Fact] public void AlarmActor_MalformedTriggerConfig_DoesNotCrash() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs index 2b2191ca..6b57e936 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerActorTests.cs @@ -10,6 +10,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; using System.Text.Json; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -44,7 +45,8 @@ public class 
DeploymentManagerActorTests : TestKit, IDisposable try { File.Delete(_dbFile); } catch { /* cleanup */ } } - private IActorRef CreateDeploymentManager(SiteRuntimeOptions? options = null) + private IActorRef CreateDeploymentManager( + SiteRuntimeOptions? options = null, IServiceProvider? serviceProvider = null) { options ??= new SiteRuntimeOptions(); return ActorOf(Props.Create(() => new DeploymentManagerActor( @@ -53,7 +55,12 @@ public class DeploymentManagerActorTests : TestKit, IDisposable _sharedScriptLibrary, null, // no stream manager in tests options, - NullLogger.Instance))); + NullLogger.Instance, + null, + null, + null, + serviceProvider, + null))); } private static string MakeConfigJson(string instanceName) @@ -171,6 +178,70 @@ public class DeploymentManagerActorTests : TestKit, IDisposable Assert.Equal("NewPump", response.InstanceUniqueName); } + // ── M1.6: site event log `deployment` category ───────────────────────── + + [Fact] + public async Task DeploymentManager_DeploySuccess_EmitsDeploymentSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var actor = CreateDeploymentManager(serviceProvider: new SingleServiceProvider(siteLog)); + + await Task.Delay(500); // wait for empty startup + + actor.Tell(new DeployInstanceCommand( + "dep-evt-1", "AuditedPump", "sha256:xyz", + MakeConfigJson("AuditedPump"), "admin", DateTimeOffset.UtcNow)); + var response = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal(DeploymentStatus.Success, response.Status); + + AwaitAssert(() => + { + var rows = siteLog.OfType("deployment"); + Assert.Contains(rows, r => + r.Severity == "Info" && + r.InstanceId == "AuditedPump" && + r.Source == "DeploymentManagerActor" && + r.Message.Contains("deploy", StringComparison.OrdinalIgnoreCase)); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public async Task DeploymentManager_DisableEnableDelete_EmitDeploymentSiteEvents() + { + var siteLog = new FakeSiteEventLogger(); + var actor = CreateDeploymentManager(serviceProvider: new 
SingleServiceProvider(siteLog)); + + await Task.Delay(500); + + actor.Tell(new DeployInstanceCommand( + "dep-evt-2", "EvtPump", "sha256:abc", + MakeConfigJson("EvtPump"), "admin", DateTimeOffset.UtcNow)); + ExpectMsg(TimeSpan.FromSeconds(5)); + await Task.Delay(1000); + + // The deployment site events are emitted fire-and-forget off the actor + // thread (LogDeploymentEvent runs in a ContinueWith continuation), so + // poll for each event with AwaitAssert rather than a bare Task.Delay — + // a fixed sleep is racy under CI load. + actor.Tell(new DisableInstanceCommand("cmd-de1", "EvtPump", DateTimeOffset.UtcNow)); + Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); + AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"), + r => r.Message.Contains("disabled", StringComparison.OrdinalIgnoreCase)), + TimeSpan.FromSeconds(2)); + + actor.Tell(new EnableInstanceCommand("cmd-en1", "EvtPump", DateTimeOffset.UtcNow)); + Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); + AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"), + r => r.Message.Contains("enabled", StringComparison.OrdinalIgnoreCase)), + TimeSpan.FromSeconds(2)); + + actor.Tell(new DeleteInstanceCommand("cmd-del-evt", "EvtPump", DateTimeOffset.UtcNow)); + Assert.True(ExpectMsg(TimeSpan.FromSeconds(5)).Success); + AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"), + r => r.Message.Contains("deleted", StringComparison.OrdinalIgnoreCase)), + TimeSpan.FromSeconds(2)); + } + [Fact] public async Task DeploymentManager_Lifecycle_DisableEnableDelete() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs index 8d28de6d..22fe87c6 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/ExecutionActorTests.cs @@ -8,6 +8,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; 
using ZB.MOM.WW.ScadaBridge.Commons.Types.Scripts; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -71,6 +72,71 @@ public class ExecutionActorTests : TestKit, IDisposable ExpectTerminated(exec, TimeSpan.FromSeconds(5)); } + // ── M1.8: site event log `script` started/completed ──────────────────── + + [Fact] + public void ScriptExecutionActor_Success_EmitsScriptStartedAndCompletedInfoEvents() + { + var compiled = CompileScript("return 7 * 6;"); + var replyTo = CreateTestProbe(); + var instanceActor = CreateTestProbe(); + var siteLog = new FakeSiteEventLogger(); + + var exec = ActorOf(Props.Create(() => new ScriptExecutionActor( + "Answer", "Inst1", compiled, null, 0, + instanceActor.Ref, _sharedLibrary, Options(), + replyTo.Ref, "corr-evt-1", NullLogger.Instance, + ScriptScope.Root, null, new SingleServiceProvider(siteLog)))); + + Watch(exec); + replyTo.ExpectMsg(TimeSpan.FromSeconds(10)); + ExpectTerminated(exec, TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("script"); + // started + completed, both Info, in order. 
+ Assert.Equal(2, rows.Count); + Assert.All(rows, r => + { + Assert.Equal("Info", r.Severity); + Assert.Equal("Inst1", r.InstanceId); + Assert.Equal("ScriptActor:Answer", r.Source); + }); + Assert.Contains("started", rows[0].Message, StringComparison.OrdinalIgnoreCase); + Assert.Contains("completed", rows[1].Message, StringComparison.OrdinalIgnoreCase); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void ScriptExecutionActor_Failure_EmitsStartedInfoThenErrorEvent() + { + var compiled = CompileScript("throw new InvalidOperationException(\"boom\");"); + var replyTo = CreateTestProbe(); + var instanceActor = CreateTestProbe(); + var siteLog = new FakeSiteEventLogger(); + + var exec = ActorOf(Props.Create(() => new ScriptExecutionActor( + "Bad", "Inst1", compiled, null, 0, + instanceActor.Ref, _sharedLibrary, Options(), + replyTo.Ref, "corr-evt-2", NullLogger.Instance, + ScriptScope.Root, null, new SingleServiceProvider(siteLog)))); + + Watch(exec); + replyTo.ExpectMsg(TimeSpan.FromSeconds(10)); + ExpectTerminated(exec, TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("script"); + // started (Info) + failed (Error) — no completed. 
+ Assert.Equal(2, rows.Count); + Assert.Equal("Info", rows[0].Severity); + Assert.Contains("started", rows[0].Message, StringComparison.OrdinalIgnoreCase); + Assert.Equal("Error", rows[1].Severity); + }, TimeSpan.FromSeconds(2)); + } + [Fact] public void ScriptExecutionActor_ScriptThrows_RepliesFailureAndStops() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs index b83ca94c..7ea11242 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/InstanceActorTests.cs @@ -10,6 +10,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; using System.Text.Json; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -58,6 +59,82 @@ public class InstanceActorTests : TestKit, IDisposable try { File.Delete(_dbFile); } catch { /* cleanup */ } } + // ── M1.6: site event log `instance_lifecycle` category ────────────────── + + [Fact] + public void InstanceActor_Start_EmitsInstanceLifecycleSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var config = new FlattenedConfiguration + { + InstanceUniqueName = "LifecyclePump", + Attributes = [new ResolvedAttribute { CanonicalName = "T", Value = "1", DataType = "Int32" }] + }; + + ActorOf(Props.Create(() => new InstanceActor( + "LifecyclePump", + JsonSerializer.Serialize(config), + _storage, + _compilationService, + _sharedScriptLibrary, + null, + _options, + NullLogger.Instance, + null, + null, + new SingleServiceProvider(siteLog)))); + + AwaitAssert(() => + { + var rows = siteLog.OfType("instance_lifecycle"); + Assert.Contains(rows, r => + r.Severity == "Info" && + r.InstanceId == "LifecyclePump" && + 
r.Source == "InstanceActor:LifecyclePump" && + r.Message.Contains("started", StringComparison.OrdinalIgnoreCase)); + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public void InstanceActor_Stop_EmitsInstanceLifecycleSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var config = new FlattenedConfiguration + { + InstanceUniqueName = "StoppedPump", + Attributes = [new ResolvedAttribute { CanonicalName = "T", Value = "1", DataType = "Int32" }] + }; + + var actor = ActorOf(Props.Create(() => new InstanceActor( + "StoppedPump", + JsonSerializer.Serialize(config), + _storage, + _compilationService, + _sharedScriptLibrary, + null, + _options, + NullLogger.Instance, + null, + null, + new SingleServiceProvider(siteLog)))); + + // Let PreStart land its started event, then stop the actor. + AwaitAssert(() => Assert.NotEmpty(siteLog.OfType("instance_lifecycle")), + TimeSpan.FromSeconds(2)); + Watch(actor); + actor.Tell(PoisonPill.Instance); + ExpectTerminated(actor, TimeSpan.FromSeconds(5)); + + AwaitAssert(() => + { + var rows = siteLog.OfType("instance_lifecycle"); + Assert.Contains(rows, r => + r.Severity == "Info" && + r.InstanceId == "StoppedPump" && + r.Message.Contains("stopped", StringComparison.OrdinalIgnoreCase)); + }, TimeSpan.FromSeconds(2)); + } + [Fact] public void InstanceActor_LoadsAttributesFromConfig() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs index ac9614b9..abcd0766 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/NativeAlarmActorTests.cs @@ -1,3 +1,4 @@ +using System.Text.Json; using Akka.Actor; using Akka.TestKit.Xunit2; using Microsoft.Extensions.Logging.Abstractions; @@ -9,6 +10,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening; using ZB.MOM.WW.ScadaBridge.SiteRuntime; using 
ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors; using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence; +using ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors; @@ -41,9 +43,10 @@ public class NativeAlarmActorTests : TestKit, IDisposable new(sourceRef, "T01", "AnalogLimit.Hi", kind, condition, "Process", "hi", "hi", "", "", null, time ?? DateTimeOffset.UtcNow, "92", "90"); - private IActorRef Spawn(IActorRef instanceActor, IActorRef dclManager) => + private IActorRef Spawn(IActorRef instanceActor, IActorRef dclManager, IServiceProvider? serviceProvider = null) => ActorOf(Props.Create(() => new NativeAlarmActor( - Source(), "inst", instanceActor, dclManager, _storage, _options, NullLogger.Instance))); + Source(), "inst", instanceActor, dclManager, _storage, _options, + NullLogger.Instance, AlarmKind.NativeOpcUa, serviceProvider))); [Fact] public void SubscribeOnStart_SendsRequestForSourceBinding() @@ -121,6 +124,158 @@ public class NativeAlarmActorTests : TestKit, IDisposable instance.ExpectNoMsg(TimeSpan.FromMilliseconds(300)); } + // ── M1.5: site event log `alarm` category ────────────────────────────── + + [Fact] + public void Raise_EmitsAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Single(rows); + var row = rows[0]; + Assert.Equal("Error", row.Severity); // severity 800 → Error + Assert.Equal("inst", row.InstanceId); + Assert.Equal("NativeAlarmActor:Pressure", row.Source); + }, TimeSpan.FromSeconds(2)); + } + + 
[Fact] + public void Clear_EmitsInfoAlarmSiteEvent() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + var t0 = DateTimeOffset.UtcNow; + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + + // Clear (inactive but not yet acked → stays mirrored, return-to-normal emit). + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Clear, + new AlarmConditionState(false, false, null, AlarmShelveState.Unshelved, false, 0), t0.AddSeconds(5)))); + instance.ExpectMsg(m => m.State == AlarmState.Normal); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Equal(2, rows.Count); // raise + clear + Assert.Equal("Error", rows[0].Severity); + Assert.Equal("Info", rows[1].Severity); // return-to-normal → Info + }, TimeSpan.FromSeconds(2)); + } + + [Fact] + public async Task Rehydration_DoesNotEmitSiteEvent() + { + // Pre-populate SQLite with an active condition so the actor rehydrates + // it on PreStart. Rehydration replays last-known state — it is NOT a + // live transition, so it must surface upward (for the DebugView) but + // must NOT re-log an `alarm` operational event. + var condition = new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800); + await _storage.UpsertNativeAlarmAsync( + "inst", "Pressure", "T01.Hi", + JsonSerializer.Serialize(condition), DateTimeOffset.UtcNow); + + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + + // The rehydrated condition is surfaced upward... 
+ var emitted = instance.ExpectMsg(TimeSpan.FromSeconds(2)); + Assert.Equal("T01.Hi", emitted.SourceReference); + Assert.Equal(AlarmState.Active, emitted.State); + dcl.ExpectMsg(); + + // ...but no `alarm` operational event is logged for it. + AwaitAssert( + () => Assert.Empty(siteLog.OfType("alarm")), + TimeSpan.FromSeconds(1)); + } + + [Fact] + public void SnapshotSwap_ExistingActiveCondition_DoesNotReEmit() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + // Live raise — the one and only `alarm` event we expect. + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + AwaitAssert(() => Assert.Single(siteLog.OfType("alarm")), TimeSpan.FromSeconds(2)); + + // A reconnect snapshot that RE-INCLUDES the same still-active condition is + // a re-sync, not a live transition. It must NOT re-log a second `alarm` + // event (regression for the spurious-reconnect-event bug). + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Snapshot, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.SnapshotComplete, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800)))); + + // The snapshot still surfaces the condition upward (DebugView re-sync)... + instance.ExpectMsg(m => m.SourceReference == "T01.Hi" && m.State == AlarmState.Active); + + // ...but the `alarm` event count stays at exactly 1 — no re-emit. 
+ Thread.Sleep(200); // give any spurious fire-and-forget log time to land + Assert.Single(siteLog.OfType("alarm")); + } + + [Fact] + public void Acknowledge_EmitsInfoAlarmSiteEventMentioningAcknowledged() + { + var siteLog = new FakeSiteEventLogger(); + var instance = CreateTestProbe(); + var dcl = CreateTestProbe(); + var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog)); + dcl.ExpectMsg(); + + var t0 = DateTimeOffset.UtcNow; + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Raise, + new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0))); + instance.ExpectMsg(m => m.State == AlarmState.Active); + + // Operator acknowledges the still-active condition. The Acknowledge + // branch of LogAlarmEvent logs Info and mentions "acknowledged". + actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition( + "T01.Hi", AlarmTransitionKind.Acknowledge, + new AlarmConditionState(true, true, null, AlarmShelveState.Unshelved, false, 800), t0.AddSeconds(5)))); + instance.ExpectMsg(); + + AwaitAssert(() => + { + var rows = siteLog.OfType("alarm"); + Assert.Equal(2, rows.Count); // raise + acknowledge + var ack = rows[1]; + Assert.Equal("Info", ack.Severity); + Assert.Contains("acknowledged", ack.Message, StringComparison.OrdinalIgnoreCase); + }, TimeSpan.FromSeconds(2)); + } + void IDisposable.Dispose() { Shutdown(); diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs new file mode 100644 index 00000000..7617fe14 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/TestSupport/FakeSiteEventLogger.cs @@ -0,0 +1,83 @@ +using System.Collections.Concurrent; +using Microsoft.Extensions.DependencyInjection; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; + +namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.TestSupport; + +/// +/// M1 Site Event 
Logging categories: a capturing fake +/// used by the actor tests to assert that the right operational events are emitted. +/// Thread-safe — the actors fire-and-forget LogEventAsync from background +/// tasks, so multiple captures can land concurrently. +/// +public sealed class FakeSiteEventLogger : ISiteEventLogger +{ + /// One captured invocation. + public sealed record Entry( + string EventType, + string Severity, + string? InstanceId, + string Source, + string Message, + string? Details); + + private readonly ConcurrentQueue _entries = new(); + + /// All captured events, in arrival order. + public IReadOnlyList Entries => _entries.ToArray(); + + /// Captured events filtered to a single category. + public IReadOnlyList OfType(string eventType) => + _entries.Where(e => e.EventType == eventType).ToArray(); + + /// + public Task LogEventAsync( + string eventType, + string severity, + string? instanceId, + string source, + string message, + string? details = null) + { + _entries.Enqueue(new Entry(eventType, severity, instanceId, source, message, details)); + return Task.CompletedTask; + } + + /// + public long FailedWriteCount => 0; +} + +/// +/// Minimal that resolves a single +/// — enough for the actors' optional +/// _serviceProvider?.GetService<ISiteEventLogger>() resolution +/// without pulling a full DI container into the actor tests. +/// +/// Also serves (returning a scope that just +/// re-exposes this provider) so callers that do +/// serviceProvider.CreateScope() — e.g. ScriptExecutionActor — +/// don't throw before they reach the logging hot path. +/// +/// +public sealed class SingleServiceProvider(ISiteEventLogger logger) + : IServiceProvider, IServiceScopeFactory, IServiceScope +{ + private readonly ISiteEventLogger _logger = logger; + + /// + public object? 
GetService(Type serviceType) + { + if (serviceType == typeof(ISiteEventLogger)) return _logger; + if (serviceType == typeof(IServiceScopeFactory)) return this; + return null; + } + + /// + public IServiceScope CreateScope() => this; + + /// + public IServiceProvider ServiceProvider => this; + + /// + public void Dispose() { } +} diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs index f9425ec8..a89fc398 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Tracking/OperationTrackingStoreTests.cs @@ -439,6 +439,138 @@ public class OperationTrackingStoreTests Assert.NotNull(await store.GetStatusAsync(cId)); // kept (non-terminal) } + // ── Site Call Audit #22: ReadChangedSinceAsync (reconciliation pull) ─── + + [Fact] + public async Task ReadChangedSinceAsync_ReturnsRowsAtOrAfterCursor_OldestFirst() + { + var (store, dataSource) = CreateStore(nameof(ReadChangedSinceAsync_ReturnsRowsAtOrAfterCursor_OldestFirst)); + await using var _store = store; + + // Three rows with distinct UpdatedAtUtc, written out of chronological + // order to prove the read sorts by UpdatedAtUtc ascending. + var older = TrackedOperationId.New(); + var middle = TrackedOperationId.New(); + var newer = TrackedOperationId.New(); + await store.RecordEnqueueAsync(older, nameof(AuditKind.ApiCallCached), "ERP.A", null, null, "node-a"); + await store.RecordEnqueueAsync(middle, nameof(AuditKind.DbWriteCached), "DB.B", null, null, "node-b"); + await store.RecordEnqueueAsync(newer, nameof(AuditKind.ApiCallCached), "ERP.C", null, null, null); + + // Backdate UpdatedAtUtc so the ordering is deterministic and a cursor + // can be placed cleanly between rows. (Enqueue stamps DateTime.UtcNow; + // we cannot inject the clock, so set the timestamps directly.) 
+ var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + SetUpdatedAt(dataSource, older, t0); + SetUpdatedAt(dataSource, middle, t0.AddMinutes(10)); + SetUpdatedAt(dataSource, newer, t0.AddMinutes(20)); + + // Cursor at the middle row's UpdatedAtUtc: inclusive lower bound, so + // middle + newer come back, older is excluded. + var result = await store.ReadChangedSinceAsync(t0.AddMinutes(10), batchSize: 100, CancellationToken.None); + + Assert.Equal(2, result.Count); + Assert.Equal(middle, result[0].TrackedOperationId); + Assert.Equal(newer, result[1].TrackedOperationId); + Assert.True(result[0].UpdatedAtUtc <= result[1].UpdatedAtUtc); + } + + [Fact] + public async Task ReadChangedSinceAsync_FromMinValue_ReturnsAllRows() + { + var (store, _) = CreateStore(nameof(ReadChangedSinceAsync_FromMinValue_ReturnsAllRows)); + await using var _store = store; + + await store.RecordEnqueueAsync(TrackedOperationId.New(), nameof(AuditKind.ApiCallCached), "A", null, null, null); + await store.RecordEnqueueAsync(TrackedOperationId.New(), nameof(AuditKind.ApiCallCached), "B", null, null, null); + + var result = await store.ReadChangedSinceAsync(DateTime.MinValue, batchSize: 100, CancellationToken.None); + + Assert.Equal(2, result.Count); + } + + [Fact] + public async Task ReadChangedSinceAsync_IsBatchCapped() + { + var (store, dataSource) = CreateStore(nameof(ReadChangedSinceAsync_IsBatchCapped)); + await using var _store = store; + + var ids = new List(); + var t0 = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + for (var i = 0; i < 5; i++) + { + var id = TrackedOperationId.New(); + ids.Add(id); + await store.RecordEnqueueAsync(id, nameof(AuditKind.ApiCallCached), $"T{i}", null, null, null); + SetUpdatedAt(dataSource, id, t0.AddMinutes(i)); + } + + var result = await store.ReadChangedSinceAsync(DateTime.MinValue, batchSize: 3, CancellationToken.None); + + // Capped to 3 — and the cap takes the OLDEST 3 (asc order) so the + // caller can advance the cursor 
monotonically across follow-up pulls. + Assert.Equal(3, result.Count); + Assert.Equal(ids[0], result[0].TrackedOperationId); + Assert.Equal(ids[1], result[1].TrackedOperationId); + Assert.Equal(ids[2], result[2].TrackedOperationId); + } + + [Fact] + public async Task ReadChangedSinceAsync_MapsTrackingRowOntoSiteCallOperational() + { + var (store, _) = CreateStore(nameof(ReadChangedSinceAsync_MapsTrackingRowOntoSiteCallOperational)); + await using var _store = store; + + var apiId = TrackedOperationId.New(); + var dbId = TrackedOperationId.New(); + await store.RecordEnqueueAsync(apiId, nameof(AuditKind.ApiCallCached), "ERP.GetOrder", "inst-1", "ScriptActor:OnTick", "node-a"); + await store.RecordEnqueueAsync(dbId, nameof(AuditKind.DbWriteCached), "Historian.Write", null, null, "node-b"); + await store.RecordAttemptAsync(apiId, nameof(AuditStatus.Attempted), 2, "HTTP 503", 503); + await store.RecordTerminalAsync(dbId, nameof(AuditStatus.Parked), "max retries", null); + + var result = await store.ReadChangedSinceAsync(DateTime.MinValue, batchSize: 100, CancellationToken.None); + var api = result.Single(r => r.TrackedOperationId == apiId); + var db = result.Single(r => r.TrackedOperationId == dbId); + + // Kind → Channel projection. + Assert.Equal("ApiOutbound", api.Channel); + Assert.Equal("DbOutbound", db.Channel); + + // TargetSummary → Target; SourceNode carried verbatim. + Assert.Equal("ERP.GetOrder", api.Target); + Assert.Equal("node-a", api.SourceNode); + Assert.Equal("node-b", db.SourceNode); + + // Status / RetryCount / LastError / HttpStatus carried through. + Assert.Equal(nameof(AuditStatus.Attempted), api.Status); + Assert.Equal(2, api.RetryCount); + Assert.Equal("HTTP 503", api.LastError); + Assert.Equal(503, api.HttpStatus); + + // SourceSite is left empty by the store (the site id is not a tracking + // column); the central client re-stamps it from the dialed siteId. 
+ Assert.Equal(string.Empty, api.SourceSite); + + // Terminal row carries TerminalAtUtc (UTC kind); active row leaves it null. + Assert.Null(api.TerminalAtUtc); + Assert.NotNull(db.TerminalAtUtc); + Assert.Equal(DateTimeKind.Utc, db.TerminalAtUtc!.Value.Kind); + + // Timestamps round-trip as UTC. + Assert.Equal(DateTimeKind.Utc, api.CreatedAtUtc.Kind); + Assert.Equal(DateTimeKind.Utc, api.UpdatedAtUtc.Kind); + } + + /// Directly sets a row's UpdatedAtUtc so cursor/ordering tests are deterministic. + private static void SetUpdatedAt(string dataSource, TrackedOperationId id, DateTime updatedAtUtc) + { + using var connection = OpenVerifierConnection(dataSource); + using var cmd = connection.CreateCommand(); + cmd.CommandText = "UPDATE OperationTracking SET UpdatedAtUtc = $u WHERE TrackedOperationId = $id;"; + cmd.Parameters.AddWithValue("$u", updatedAtUtc.ToString("o", System.Globalization.CultureInfo.InvariantCulture)); + cmd.Parameters.AddWithValue("$id", id.ToString()); + cmd.ExecuteNonQuery(); + } + // ── SiteRuntime-024: read/write split + sync-safe Dispose ────────────── [Fact] diff --git a/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/StoreAndForwardSiteEventTests.cs b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/StoreAndForwardSiteEventTests.cs new file mode 100644 index 00000000..2c3dcea7 --- /dev/null +++ b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/StoreAndForwardSiteEventTests.cs @@ -0,0 +1,168 @@ +using System.Collections.Concurrent; +using Microsoft.Data.Sqlite; +using Microsoft.Extensions.Logging.Abstractions; +using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums; +using ZB.MOM.WW.ScadaBridge.SiteEventLogging; + +namespace ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests; + +/// +/// M1.7: the StoreAndForwardService emits site operational events for its own +/// buffer/park activity — store_and_forward for cached-call categories +/// (ExternalSystem / CachedDbWrite) and notification for the site's +/// notification forward-to-central path. 
Emission rides the existing +/// OnActivity hook and is best-effort (a failing logger never affects +/// delivery bookkeeping). +/// +public class StoreAndForwardSiteEventTests : IAsyncLifetime, IDisposable +{ + private sealed record Entry(string EventType, string Severity, string Source, string Message); + + private sealed class FakeSiteEventLogger : ISiteEventLogger + { + private readonly ConcurrentQueue _entries = new(); + public IReadOnlyList Entries => _entries.ToArray(); + public IReadOnlyList OfType(string t) => _entries.Where(e => e.EventType == t).ToArray(); + + public Task LogEventAsync(string eventType, string severity, string? instanceId, + string source, string message, string? details = null) + { + _entries.Enqueue(new Entry(eventType, severity, source, message)); + return Task.CompletedTask; + } + + public long FailedWriteCount => 0; + } + + private readonly SqliteConnection _keepAlive; + private readonly StoreAndForwardStorage _storage; + private readonly StoreAndForwardOptions _options; + private readonly FakeSiteEventLogger _siteLog = new(); + private readonly StoreAndForwardService _service; + + public StoreAndForwardSiteEventTests() + { + var dbName = $"SiteEvt_{Guid.NewGuid():N}"; + var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared"; + _keepAlive = new SqliteConnection(connStr); + _keepAlive.Open(); + + _storage = new StoreAndForwardStorage(connStr, NullLogger.Instance); + _options = new StoreAndForwardOptions + { + DefaultRetryInterval = TimeSpan.Zero, + DefaultMaxRetries = 1, + RetryTimerInterval = TimeSpan.FromMinutes(10) + }; + + _service = new StoreAndForwardService( + _storage, _options, NullLogger.Instance, + replication: null, cachedCallObserver: null, siteId: "site-a", + siteEventLogger: _siteLog); + } + + public async Task InitializeAsync() => await _storage.InitializeAsync(); + public Task DisposeAsync() => Task.CompletedTask; + public void Dispose() => _keepAlive.Dispose(); + + [Fact] + public async Task 
BufferForRetry_ExternalSystem_EmitsStoreAndForwardSiteEvent() + { + _service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, + _ => throw new HttpRequestException("transient")); + + await _service.EnqueueAsync(StoreAndForwardCategory.ExternalSystem, "api.example.com", """{}""", "Pump1"); + + var rows = _siteLog.OfType("store_and_forward"); + Assert.Contains(rows, r => r.Severity == "Warning" && + r.Source == "StoreAndForwardService" && + r.Message.Contains("queued", StringComparison.OrdinalIgnoreCase)); + // The cached-call categories must NOT surface as notification events. + Assert.Empty(_siteLog.OfType("notification")); + } + + [Fact] + public async Task ForwardFailure_Notification_EmitsNotificationSiteEvent() + { + // The site's notification role is forward-to-central. When the immediate + // forward to central throws (central unreachable), the notification is + // buffered for retry — a forward FAILURE, which the spec says to log as a + // `notification` site event (filling the in-transit blind spot). + _service.RegisterDeliveryHandler(StoreAndForwardCategory.Notification, + _ => throw new HttpRequestException("central unreachable")); + + await _service.EnqueueAsync(StoreAndForwardCategory.Notification, "list-a", """{}""", "Pump1"); + + var rows = _siteLog.OfType("notification"); + Assert.Contains(rows, r => r.Severity == "Warning" && + r.Source == "StoreAndForwardService" && + r.Message.Contains("queued", StringComparison.OrdinalIgnoreCase)); + // A notification forward-failure is not a store_and_forward (cached-call) event. + Assert.Empty(_siteLog.OfType("store_and_forward")); + } + + [Fact] + public async Task RoutineEnqueue_Notification_DoesNotEmitSiteEvent() + { + // Spec: routine enqueue / forward-success on the notification path are + // deliberately NOT logged — central's Notifications table is the audit + // record of record. A successful immediate forward emits no site event. 
+ _service.RegisterDeliveryHandler(StoreAndForwardCategory.Notification, + _ => Task.FromResult(true)); + + await _service.EnqueueAsync(StoreAndForwardCategory.Notification, "list-a", """{}""", "Pump1"); + + Assert.Empty(_siteLog.OfType("notification")); + } + + [Fact] + public async Task Park_Notification_EmitsErrorNotificationSiteEvent() + { + // A long-buffered notification that exhausts retries is parked — the spec + // logs this as a `notification` event (Error severity). + _service.RegisterDeliveryHandler(StoreAndForwardCategory.Notification, + _ => throw new HttpRequestException("central unreachable")); + + await _service.EnqueueAsync( + StoreAndForwardCategory.Notification, "list-a", """{}""", "Pump1", + attemptImmediateDelivery: false, maxRetries: 1); + + await _service.RetryPendingMessagesAsync(); + + var rows = _siteLog.OfType("notification"); + Assert.Contains(rows, r => r.Severity == "Error" && + r.Message.Contains("parked", StringComparison.OrdinalIgnoreCase)); + } + + [Fact] + public async Task Park_ExternalSystem_EmitsErrorStoreAndForwardSiteEvent() + { + // MaxRetries = 1 → the first sweep retry parks the message. + _service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, + _ => throw new HttpRequestException("transient")); + + await _service.EnqueueAsync( + StoreAndForwardCategory.ExternalSystem, "api.example.com", """{}""", "Pump1", + attemptImmediateDelivery: false, maxRetries: 1); + + await _service.RetryPendingMessagesAsync(); + + var rows = _siteLog.OfType("store_and_forward"); + Assert.Contains(rows, r => r.Severity == "Error" && + r.Message.Contains("parked", StringComparison.OrdinalIgnoreCase)); + } + + [Fact] + public async Task DeliveredImmediately_DoesNotEmitSiteEvent() + { + // A successful immediate delivery is the normal hot path — it is not a + // store-and-forward buffering event, so no operational event is logged. 
+ _service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, + _ => Task.FromResult(true)); + + await _service.EnqueueAsync(StoreAndForwardCategory.ExternalSystem, "api", """{}""", "Pump1"); + + Assert.Empty(_siteLog.OfType("store_and_forward")); + Assert.Empty(_siteLog.OfType("notification")); + } +}