refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj files, and namespaces updated, and the ScadaLinkDbContext class renamed to ScadaBridgeDbContext. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,236 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): translates per-attempt
/// notifications from the store-and-forward retry loop into one (or two)
/// <see cref="CachedCallTelemetry"/> packets and pushes them through
/// <see cref="ICachedCallTelemetryForwarder"/>.
/// </summary>
/// <remarks>
/// <para>
/// The S&amp;F loop's <see cref="ICachedCallLifecycleObserver"/> reports a
/// single coarse outcome per attempt; the audit pipeline however models the
/// lifecycle as TWO rows on terminal outcomes — a per-attempt row
/// (<see cref="AuditKind.ApiCallCached"/> / <see cref="AuditKind.DbWriteCached"/>)
/// capturing the per-attempt mechanics, plus a <see cref="AuditKind.CachedResolve"/>
/// row marking the terminal state for downstream consumers. The per-attempt
/// row always carries <see cref="AuditStatus.Attempted"/>; whether the attempt
/// succeeded or failed is conveyed by its HttpStatus / ErrorMessage fields,
/// and only the CachedResolve row carries the terminal status. The bridge
/// fans out per outcome:
/// </para>
/// <list type="bullet">
/// <item><description><c>TransientFailure</c> -> one per-attempt row (Attempted).</description></item>
/// <item><description><c>Delivered</c> -> per-attempt row (Attempted) + CachedResolve(Delivered).</description></item>
/// <item><description><c>PermanentFailure</c> -> per-attempt row (Attempted) + CachedResolve(Parked).</description></item>
/// <item><description><c>ParkedMaxRetries</c> -> per-attempt row (Attempted) + CachedResolve(Parked).</description></item>
/// </list>
/// <para>
/// <b>Best-effort emission (alog.md §7):</b> emission failures are logged and
/// swallowed by the bridge (the underlying forwarder also swallows + logs its
/// own failures). The two rows are emitted independently, so a failure while
/// emitting the per-attempt row does not suppress the CachedResolve row. The
/// only exception surfaced to the caller is the
/// <see cref="ArgumentNullException"/> raised for a null context.
/// </para>
/// </remarks>
public sealed class CachedCallLifecycleBridge : ICachedCallLifecycleObserver
{
    private readonly ICachedCallTelemetryForwarder _forwarder;
    private readonly ILogger<CachedCallLifecycleBridge> _logger;

    /// <summary>
    /// SourceNode-stamping (Task 14): the local node identity provider used to
    /// stamp <c>SiteCallOperational.SourceNode</c> on every cached-call
    /// lifecycle row this bridge emits. Optional — when null (legacy hosts /
    /// tests that don't register the provider) SourceNode stays null and
    /// central persists the <c>SiteCalls</c> row with SourceNode NULL.
    /// </summary>
    private readonly INodeIdentityProvider? _nodeIdentity;

    /// <summary>Initializes a new <see cref="CachedCallLifecycleBridge"/> with the given telemetry forwarder, logger, and optional node identity provider.</summary>
    /// <param name="forwarder">The telemetry forwarder used to ship cached-call lifecycle events to central.</param>
    /// <param name="logger">Logger for bridge diagnostics.</param>
    /// <param name="nodeIdentity">Optional node identity provider used to stamp <c>SourceNode</c> on emitted telemetry rows.</param>
    /// <exception cref="ArgumentNullException"><paramref name="forwarder"/> or <paramref name="logger"/> is null.</exception>
    public CachedCallLifecycleBridge(
        ICachedCallTelemetryForwarder forwarder,
        ILogger<CachedCallLifecycleBridge> logger,
        INodeIdentityProvider? nodeIdentity = null)
    {
        _forwarder = forwarder ?? throw new ArgumentNullException(nameof(forwarder));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _nodeIdentity = nodeIdentity;
    }

    /// <inheritdoc/>
    public async Task OnAttemptCompletedAsync(
        CachedCallAttemptContext context, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(context);

        // Each row is emitted under its own guard: the CachedResolve row is the
        // one that marks the terminal state, so it must still be attempted when
        // the per-attempt emission fails unexpectedly.
        await TryEmitAsync(EmitAttemptedAsync, context, ct).ConfigureAwait(false);

        if (IsTerminal(context.Outcome))
        {
            await TryEmitAsync(EmitResolveAsync, context, ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Runs one emission step and logs + swallows anything it throws.
    /// </summary>
    /// <param name="emit">The emission step to run (per-attempt or resolve).</param>
    /// <param name="context">The attempt context passed to <paramref name="emit"/> and used for log correlation.</param>
    /// <param name="ct">Cancellation token passed to <paramref name="emit"/>.</param>
    private async Task TryEmitAsync(
        Func<CachedCallAttemptContext, CancellationToken, Task> emit,
        CachedCallAttemptContext context,
        CancellationToken ct)
    {
        try
        {
            await emit(context, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Defensive — both EmitX paths call the forwarder which is itself
            // best-effort. A throw here is unexpected, but the alog.md §7
            // contract requires we never propagate.
            _logger.LogWarning(ex,
                "CachedCallLifecycleBridge: unexpected throw for {TrackedOperationId} (Outcome {Outcome})",
                context.TrackedOperationId, context.Outcome);
        }
    }

    /// <summary>
    /// Builds and forwards the per-attempt row for <paramref name="context"/>.
    /// </summary>
    private async Task EmitAttemptedAsync(CachedCallAttemptContext context, CancellationToken ct)
    {
        // Per-attempt row: kind discriminates channel; status is always
        // Attempted regardless of outcome (success vs. failure is captured
        // by the companion HttpStatus / ErrorMessage fields, NOT by flipping
        // the status — CachedResolve carries the terminal Status). Per the
        // M3 brief and alog.md §4.
        var packet = BuildPacket(
            context,
            kind: ChannelToAttemptKind(context.Channel),
            status: AuditStatus.Attempted,
            // Operational status mirror — for the per-attempt row the
            // operational state is the running status; the bridge always
            // writes "Attempted" so reconciliation can't roll back.
            operationalStatus: "Attempted",
            terminalAtUtc: null,
            lastError: context.LastError,
            httpStatus: context.HttpStatus);

        await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Builds and forwards the <see cref="AuditKind.CachedResolve"/> row for a
    /// terminal <paramref name="context"/>; the terminal timestamp is the
    /// attempt's <c>OccurredAtUtc</c>.
    /// </summary>
    private async Task EmitResolveAsync(CachedCallAttemptContext context, CancellationToken ct)
    {
        var (auditStatus, operationalStatus) = TerminalOutcomeToStatuses(context.Outcome);

        var packet = BuildPacket(
            context,
            kind: AuditKind.CachedResolve,
            status: auditStatus,
            operationalStatus: operationalStatus,
            terminalAtUtc: context.OccurredAtUtc,
            lastError: context.LastError,
            httpStatus: context.HttpStatus);

        await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Assembles the combined audit + operational packet for one lifecycle row.
    /// </summary>
    /// <param name="context">The attempt notification the row is derived from.</param>
    /// <param name="kind">Audit kind stamped on the audit row.</param>
    /// <param name="status">Audit status stamped on the audit row.</param>
    /// <param name="operationalStatus">Status string stamped on the operational half.</param>
    /// <param name="terminalAtUtc">Terminal timestamp for the operational half; null for non-terminal rows.</param>
    /// <param name="lastError">Error text copied to both halves.</param>
    /// <param name="httpStatus">HTTP status copied to both halves.</param>
    /// <returns>A new packet with a fresh <c>EventId</c> and all timestamps marked as UTC.</returns>
    private CachedCallTelemetry BuildPacket(
        CachedCallAttemptContext context,
        AuditKind kind,
        AuditStatus status,
        string operationalStatus,
        DateTime? terminalAtUtc,
        string? lastError,
        int? httpStatus)
    {
        var channel = ChannelStringToEnum(context.Channel);

        return new CachedCallTelemetry(
            Audit: new AuditEvent
            {
                EventId = Guid.NewGuid(),
                OccurredAtUtc = DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
                Channel = channel,
                Kind = kind,
                CorrelationId = context.TrackedOperationId.Value,
                // Audit Log #23 (ExecutionId Task 4): the originating script
                // execution's per-run correlation id, threaded through the S&F
                // buffer; null on rows buffered before Task 4 (back-compat).
                ExecutionId = context.ExecutionId,
                // Audit Log #23 (ParentExecutionId Task 6): the spawning
                // inbound-API request's ExecutionId, threaded through the S&F
                // buffer alongside ExecutionId so the retry-loop cached rows
                // correlate back to the cross-execution chain. Null for a
                // non-routed run and on rows buffered before Task 6.
                ParentExecutionId = context.ParentExecutionId,
                // An empty site string is normalised to null on the audit row.
                SourceSiteId = string.IsNullOrEmpty(context.SourceSite) ? null : context.SourceSite,
                SourceInstanceId = context.SourceInstanceId,
                // Audit Log #23 (ExecutionId Task 4): SourceScript is now
                // threaded through the S&F buffer alongside ExecutionId — the
                // retry-loop cached rows carry the same provenance the
                // script-side cached rows do. Null on pre-Task-4 buffered rows.
                SourceScript = context.SourceScript,
                Target = context.Target,
                Status = status,
                HttpStatus = httpStatus,
                DurationMs = context.DurationMs,
                ErrorMessage = lastError,
                ForwardState = AuditForwardState.Pending,
            },
            Operational: new SiteCallOperational(
                TrackedOperationId: context.TrackedOperationId,
                Channel: context.Channel,
                Target: context.Target,
                SourceSite: context.SourceSite,
                // SourceNode-stamping (Task 14): the local cluster node name
                // (node-a/node-b on a site). Stamped from the injected
                // INodeIdentityProvider; null when no provider was wired so
                // central persists SiteCalls.SourceNode as NULL.
                SourceNode: _nodeIdentity?.NodeName,
                Status: operationalStatus,
                RetryCount: context.RetryCount,
                LastError: lastError,
                HttpStatus: httpStatus,
                CreatedAtUtc: DateTime.SpecifyKind(context.CreatedAtUtc, DateTimeKind.Utc),
                UpdatedAtUtc: DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
                TerminalAtUtc: terminalAtUtc is null
                    ? null
                    : DateTime.SpecifyKind(terminalAtUtc.Value, DateTimeKind.Utc)));
    }

    /// <summary>Maps the channel string to the audit kind used for the per-attempt row.</summary>
    private static AuditKind ChannelToAttemptKind(string channel) => channel switch
    {
        "ApiOutbound" => AuditKind.ApiCallCached,
        "DbOutbound" => AuditKind.DbWriteCached,
        // Defensive default — the S&F observer is filtered to cached-call
        // categories so this branch shouldn't fire in practice.
        _ => AuditKind.ApiCallCached,
    };

    /// <summary>Maps the channel string to its <see cref="AuditChannel"/>; unknown values fall back to <see cref="AuditChannel.ApiOutbound"/>.</summary>
    private static AuditChannel ChannelStringToEnum(string channel) => channel switch
    {
        "ApiOutbound" => AuditChannel.ApiOutbound,
        "DbOutbound" => AuditChannel.DbOutbound,
        _ => AuditChannel.ApiOutbound,
    };

    /// <summary>Maps a terminal outcome to the audit status and operational status string written on the CachedResolve row.</summary>
    private static (AuditStatus auditStatus, string operationalStatus) TerminalOutcomeToStatuses(
        CachedCallAttemptOutcome outcome) => outcome switch
    {
        CachedCallAttemptOutcome.Delivered =>
            (AuditStatus.Delivered, "Delivered"),
        CachedCallAttemptOutcome.PermanentFailure =>
            (AuditStatus.Parked, "Parked"),
        CachedCallAttemptOutcome.ParkedMaxRetries =>
            (AuditStatus.Parked, "Parked"),
        // TransientFailure isn't terminal — see IsTerminal — but the switch
        // is exhaustive so we route it through Failed for safety.
        CachedCallAttemptOutcome.TransientFailure =>
            (AuditStatus.Failed, "Failed"),
        _ => (AuditStatus.Failed, "Failed"),
    };

    /// <summary>Returns true for outcomes that end the lifecycle (Delivered, PermanentFailure, ParkedMaxRetries); false otherwise.</summary>
    private static bool IsTerminal(CachedCallAttemptOutcome outcome) => outcome switch
    {
        CachedCallAttemptOutcome.Delivered => true,
        CachedCallAttemptOutcome.PermanentFailure => true,
        CachedCallAttemptOutcome.ParkedMaxRetries => true,
        CachedCallAttemptOutcome.TransientFailure => false,
        _ => false,
    };
}
|
||||
@@ -0,0 +1,194 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Site-side dual emitter for cached-call lifecycle telemetry (Audit Log #23 /
|
||||
/// M3). Sister to <see cref="SiteAuditTelemetryActor"/>: where the M2 actor
|
||||
/// drains audit-only events, this forwarder takes a combined
|
||||
/// <see cref="CachedCallTelemetry"/> packet and fans it out to the two
|
||||
/// site-local stores in a single call:
|
||||
/// <list type="bullet">
|
||||
/// <item><description>The <see cref="AuditEvent"/> row is written via
|
||||
/// <see cref="IAuditWriter"/> (the site <c>FallbackAuditWriter</c> +
|
||||
/// <c>SqliteAuditWriter</c> chain established in M2).</description></item>
|
||||
/// <item><description>The operational <see cref="SiteCallOperational"/> half
|
||||
/// updates the site-local <c>OperationTracking</c> SQLite store via
|
||||
/// <see cref="IOperationTrackingStore"/>, with the per-lifecycle method
|
||||
/// (<c>Enqueue</c> / <c>Attempt</c> / <c>Terminal</c>) selected from the
|
||||
/// audit row's <see cref="AuditKind"/>.</description></item>
|
||||
/// </list>
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Best-effort contract (alog.md §7):</b> a thrown writer OR a thrown
|
||||
/// tracking store must never propagate to the calling script. Both emission
|
||||
/// halves are wrapped in independent try/catch blocks so a SQLite outage on
|
||||
/// one side cannot starve the other — the failure is logged and the call
|
||||
/// returns normally.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Local-write only — the wire push is the drain actor's job.</b> This
|
||||
/// forwarder is deliberately synchronous against the two site-local SQLite
|
||||
/// stores and never pushes to central itself. The site→central transport is
|
||||
/// now live: <c>ClusterClientSiteAuditClient</c> is the production binding of
|
||||
/// <see cref="ISiteStreamAuditClient"/> on site roles (with
|
||||
/// <c>NoOpSiteStreamAuditClient</c> retained only for central/test composition
|
||||
/// roots). The push happens out-of-band: <see cref="SiteAuditTelemetryActor"/>
|
||||
/// sweeps the <c>AuditEvent</c> rows this forwarder wrote — they live in SQLite
|
||||
/// tagged <see cref="AuditForwardState.Pending"/> — and drains them to central
|
||||
/// via that client. A single drain loop therefore covers both the audit-only
|
||||
/// emissions and the cached-call emissions this forwarder produces.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
{
    /// <summary>Writer that persists the audit half of every packet.</summary>
    private readonly IAuditWriter _auditWriter;

    /// <summary>Site-local tracking store; null when none was wired, in which case the tracking half is skipped.</summary>
    private readonly IOperationTrackingStore? _trackingStore;

    private readonly ILogger<CachedCallTelemetryForwarder> _logger;

    /// <summary>
    /// SourceNode-stamping (Task 14): provider of the local node name that is
    /// written to the tracking row's <c>SourceNode</c> column by
    /// <c>RecordEnqueueAsync</c>. Optional — with no provider (legacy / test
    /// hosts) the column is left NULL.
    /// </summary>
    private readonly INodeIdentityProvider? _nodeIdentity;

    /// <summary>
    /// Creates the forwarder. <paramref name="trackingStore"/> may be null, in
    /// which case only the audit half of each packet is emitted. That matches
    /// the M3 Bundle F composition-root contract on Central nodes, where the
    /// forwarder is registered unconditionally (mirroring the IAuditWriter
    /// chain) but the site-only tracking store has no registration. Production
    /// site nodes wire both.
    /// </summary>
    /// <param name="auditWriter">Writer used to persist audit events from the telemetry packet.</param>
    /// <param name="trackingStore">Optional store for updating operation tracking state; null on central nodes.</param>
    /// <param name="logger">Logger for this forwarder.</param>
    /// <param name="nodeIdentity">Optional provider of the current node name stamped on emitted rows.</param>
    public CachedCallTelemetryForwarder(
        IAuditWriter auditWriter,
        IOperationTrackingStore? trackingStore,
        ILogger<CachedCallTelemetryForwarder> logger,
        INodeIdentityProvider? nodeIdentity = null)
    {
        ArgumentNullException.ThrowIfNull(auditWriter);
        ArgumentNullException.ThrowIfNull(logger);

        _auditWriter = auditWriter;
        _trackingStore = trackingStore;
        _logger = logger;
        _nodeIdentity = nodeIdentity;
    }

    /// <inheritdoc />
    public async Task ForwardAsync(CachedCallTelemetry telemetry, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(telemetry);

        // Each half runs under its own guard so that a failing audit writer
        // cannot stop the tracking-store update, and the other way round.
        // Neither half is allowed to fail the call.
        await WriteAuditHalfAsync(telemetry, ct).ConfigureAwait(false);
        await UpdateTrackingHalfAsync(telemetry, ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Writes the packet's audit row through the audit writer, logging and
    /// swallowing any exception.
    /// </summary>
    private async Task WriteAuditHalfAsync(CachedCallTelemetry telemetry, CancellationToken ct)
    {
        try
        {
            await _auditWriter.WriteAsync(telemetry.Audit, ct).ConfigureAwait(false);
        }
        catch (Exception failure)
        {
            // Best-effort per alog.md §7: log, then carry on. Transient writer
            // failures are expected to be absorbed upstream (RingBufferFallback
            // in the FallbackAuditWriter); reaching this handler means that
            // swallow contract itself was breached, and we still don't rethrow.
            _logger.LogWarning(failure,
                "CachedCallTelemetryForwarder: audit emission threw for EventId {EventId} (Kind {Kind}, Status {Status})",
                telemetry.Audit.EventId, telemetry.Audit.Kind, telemetry.Audit.Status);
        }
    }

    /// <summary>
    /// Applies the packet's operational half to the tracking store, choosing
    /// the store method from the audit row's kind. Does nothing when no store
    /// is wired; logs and swallows any exception.
    /// </summary>
    private async Task UpdateTrackingHalfAsync(CachedCallTelemetry telemetry, CancellationToken ct)
    {
        // Without a site-local tracking store (Central composition root, or an
        // integration-test host that skipped AddSiteRuntime) the audit half is
        // still worth emitting; the tracking half simply has nothing to do.
        if (_trackingStore is not { } store)
        {
            return;
        }

        try
        {
            var audit = telemetry.Audit;
            var operational = telemetry.Operational;

            switch (audit.Kind)
            {
                case AuditKind.CachedSubmit:
                    // Enqueue: insert-if-not-exists, with the operational
                    // channel acting as the kind discriminator. The store's
                    // INSERT contract pins RetryCount at 0.
                    // SourceNode-stamping (Task 14): the local node name
                    // (node-a/node-b) comes from the injected
                    // INodeIdentityProvider, or is null when none was wired,
                    // leaving the tracking row's SourceNode column NULL.
                    await store.RecordEnqueueAsync(
                        operational.TrackedOperationId,
                        operational.Channel,
                        operational.Target,
                        audit.SourceInstanceId,
                        audit.SourceScript,
                        sourceNode: _nodeIdentity?.NodeName,
                        ct).ConfigureAwait(false);
                    break;

                case AuditKind.ApiCallCached:
                case AuditKind.DbWriteCached:
                    // Attempt: moves the retry counter forward and records the
                    // last error / HTTP status. The store's WHERE clause
                    // protects rows that are already terminal.
                    await store.RecordAttemptAsync(
                        operational.TrackedOperationId,
                        operational.Status,
                        operational.RetryCount,
                        operational.LastError,
                        operational.HttpStatus,
                        ct).ConfigureAwait(false);
                    break;

                case AuditKind.CachedResolve:
                    // Terminal: the resolve flip is first-write-wins.
                    await store.RecordTerminalAsync(
                        operational.TrackedOperationId,
                        operational.Status,
                        operational.LastError,
                        operational.HttpStatus,
                        ct).ConfigureAwait(false);
                    break;

                default:
                    // Only the four cached-lifecycle kinds belong on this
                    // path. A mis-routed packet is made visible in the log
                    // instead of being allowed to crash the forwarder.
                    _logger.LogWarning(
                        "CachedCallTelemetryForwarder: unexpected audit kind {Kind} on tracking emission for EventId {EventId}",
                        audit.Kind, audit.EventId);
                    break;
            }
        }
        catch (Exception failure)
        {
            _logger.LogWarning(failure,
                "CachedCallTelemetryForwarder: tracking-store emission threw for TrackedOperationId {Id} (Status {Status})",
                telemetry.Operational.TrackedOperationId, telemetry.Operational.Status);
        }
    }
}
|
||||
@@ -0,0 +1,117 @@
|
||||
using Akka.Actor;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Production <see cref="ISiteStreamAuditClient"/> binding for site composition
|
||||
/// roots: pushes audit telemetry to central over Akka <c>ClusterClient</c> via
|
||||
/// the site's <c>SiteCommunicationActor</c>. The actor forwards the command to
|
||||
/// <c>/user/central-communication</c> and the central
|
||||
/// <c>CentralCommunicationActor</c> Asks the <c>AuditLogIngestActor</c> proxy —
|
||||
/// the same command/control transport notifications already use. Wired by the
|
||||
/// Host for site roles; central and test composition roots keep the
|
||||
/// <see cref="NoOpSiteStreamAuditClient"/> DI default (they have no
|
||||
/// <c>SiteCommunicationActor</c>).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Throw-on-failure contract.</b> An Ask timeout or a faulted reply
|
||||
/// (<see cref="Status.Failure"/>) propagates as a thrown exception out of the
|
||||
/// <c>Ingest*Async</c> methods — it is NOT caught and turned into an empty ack.
|
||||
/// The <see cref="SiteAuditTelemetryActor"/> drain loop treats a thrown
|
||||
/// exception as transient and leaves the rows <c>Pending</c> for the next tick.
|
||||
/// Swallowing the fault into an empty ack would be indistinguishable from "zero
|
||||
/// rows accepted" and would silently lose the retry signal. Task 1 confirmed
|
||||
/// the central receiving end does not collapse an ingest fault into an empty
|
||||
/// ack either, so a site-side Ask through the whole path faults cleanly on a
|
||||
/// central-side timeout.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// The batches arrive as proto DTOs (<see cref="AuditEventBatch"/> /
|
||||
/// <see cref="CachedTelemetryBatch"/>) because the
|
||||
/// <see cref="SiteAuditTelemetryActor"/> builds them with
|
||||
/// <see cref="AuditEventDtoMapper.ToDto"/>. This client converts them back into
|
||||
/// the <see cref="AuditEvent"/> / <see cref="SiteCall"/> entities the Akka
|
||||
/// command messages carry — the same DTO→entity translation the
|
||||
/// <c>SiteStreamGrpcServer</c> performs for the gRPC reconciliation path.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class ClusterClientSiteAuditClient : ISiteStreamAuditClient
{
    private readonly IActorRef _siteCommunicationActor;
    private readonly TimeSpan _askTimeout;

    /// <summary>Creates a client that sends ingest commands to the given actor using the given Ask timeout.</summary>
    /// <param name="siteCommunicationActor">
    /// The site's <c>SiteCommunicationActor</c>, which relays the ingest
    /// command over the registered central ClusterClient and routes the reply
    /// back to this client's Ask.
    /// </param>
    /// <param name="askTimeout">
    /// Ask timeout for the round-trip to central. When it elapses the Ask
    /// throws <see cref="Akka.Actor.AskTimeoutException"/>; the drain loop
    /// treats that as transient, so the rows remain <c>Pending</c>.
    /// </param>
    public ClusterClientSiteAuditClient(IActorRef siteCommunicationActor, TimeSpan askTimeout)
    {
        _siteCommunicationActor = siteCommunicationActor
            ?? throw new ArgumentNullException(nameof(siteCommunicationActor));
        _askTimeout = askTimeout;
    }

    /// <inheritdoc/>
    public async Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(batch);

        // DTO -> entity translation for every event in the batch.
        var command = new IngestAuditEventsCommand(
            batch.Events
                .Select(dto => AuditEventDtoMapper.FromDto(dto))
                .ToList());

        // No catch here, on purpose: Ask<T> throws AskTimeoutException when the
        // timeout elapses and rethrows the inner cause of a Status.Failure.
        // Either way the drain loop sees an exception and keeps the rows Pending.
        var reply = await _siteCommunicationActor
            .Ask<IngestAuditEventsReply>(command, _askTimeout, ct)
            .ConfigureAwait(false);

        return ToAck(reply.AcceptedEventIds);
    }

    /// <inheritdoc/>
    public async Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(batch);

        // Each packet yields one entry pairing the audit entity with the
        // site-call entity mapped from its operational half.
        var command = new IngestCachedTelemetryCommand(
            batch.Packets
                .Select(packet => new CachedTelemetryEntry(
                    AuditEventDtoMapper.FromDto(packet.AuditEvent),
                    SiteCallDtoMapper.FromDto(packet.Operational)))
                .ToList());

        // Throw-on-failure, exactly as in IngestAuditEventsAsync. Note the reply
        // type: IngestCachedTelemetryReply (the central dual-write reply), which
        // is not the same type as IngestAuditEventsReply.
        var reply = await _siteCommunicationActor
            .Ask<IngestCachedTelemetryReply>(command, _askTimeout, ct)
            .ConfigureAwait(false);

        return ToAck(reply.AcceptedEventIds);
    }

    /// <summary>
    /// Builds an <see cref="IngestAck"/> whose accepted ids are the string
    /// forms of <paramref name="acceptedEventIds"/>, in the same order.
    /// </summary>
    private static IngestAck ToAck(IReadOnlyList<Guid> acceptedEventIds)
    {
        var result = new IngestAck();
        for (var index = 0; index < acceptedEventIds.Count; index++)
        {
            result.AcceptedEventIds.Add(acceptedEventIds[index].ToString());
        }

        return result;
    }
}
|
||||
@@ -0,0 +1,46 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Mockable abstraction over the central site-audit push surface that
|
||||
/// <see cref="SiteAuditTelemetryActor"/> uses to forward <see cref="AuditEventBatch"/>
|
||||
/// payloads. The production implementation is
|
||||
/// <see cref="ClusterClientSiteAuditClient"/> — a ClusterClient-based client,
|
||||
/// wired in the Host for site roles, that forwards batches to central via the
|
||||
/// site's <c>SiteCommunicationActor</c>. Unit tests substitute via NSubstitute
|
||||
/// against this interface so the actor never needs a live transport.
|
||||
/// </summary>
|
||||
public interface ISiteStreamAuditClient
{
    /// <summary>
    /// Sends <paramref name="batch"/> down the central audit-ingest path. The
    /// <see cref="IngestAck"/> that comes back lists the
    /// <c>accepted_event_ids</c>; the actor marks those rows
    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
    /// in the site SQLite queue.
    /// </summary>
    /// <param name="batch">The batch of audit events to forward.</param>
    /// <param name="ct">Cancellation token for the operation.</param>
    /// <returns>The acknowledgement listing the event ids central accepted.</returns>
    Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct);

    /// <summary>
    /// Sends the combined <see cref="CachedTelemetryBatch"/> (Audit Log #23)
    /// down the central cached-telemetry ingest path. Every packet bundles the
    /// audit row with the operational <c>SiteCalls</c> upsert, and central
    /// commits the pair in one MS SQL transaction. The result has the same
    /// <see cref="IngestAck"/> shape as <see cref="IngestAuditEventsAsync"/>,
    /// which lets the site-side forwarder mark the underlying audit rows
    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Forwarded"/>
    /// after central acknowledges them.
    /// </summary>
    /// <remarks>
    /// In production <see cref="ClusterClientSiteAuditClient"/> carries the
    /// batch over the ClusterClient transport. The
    /// <see cref="NoOpSiteStreamAuditClient"/> DI default (central and test
    /// composition roots) answers with an empty ack, so no rows are flipped.
    /// </remarks>
    /// <param name="batch">The batch of cached-call telemetry packets to forward.</param>
    /// <param name="ct">Cancellation token for the operation.</param>
    /// <returns>The acknowledgement listing the event ids central accepted.</returns>
    Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct);
}
|
||||
@@ -0,0 +1,51 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="ISiteStreamAuditClient"/> registered by
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.ServiceCollectionExtensions.AddAuditLog"/>.
|
||||
/// It is a no-op binding for composition roots that have no
|
||||
/// <c>SiteCommunicationActor</c> — central and test roots. Site roles override
|
||||
/// it in the Host with the ClusterClient-based
|
||||
/// <see cref="ClusterClientSiteAuditClient"/>, which actually forwards audit
|
||||
/// telemetry to central.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Returns an empty <see cref="IngestAck"/> so the
|
||||
/// <see cref="SiteAuditTelemetryActor"/> doesn't flip any rows to
|
||||
/// <c>Forwarded</c> when this NoOp is in effect — rows stay <c>Pending</c>
|
||||
/// until a real client (or a test stub) takes over.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Audit-write paths are best-effort by contract: a NoOp client keeps the
|
||||
/// host running cleanly and is consistent with "audit-write failures never
|
||||
/// abort the user-facing action".
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class NoOpSiteStreamAuditClient : ISiteStreamAuditClient
{
    // NOTE: no cached/static ack. IngestAck exposes a mutable AcceptedEventIds
    // collection, so one shared instance handed to every caller could be
    // mutated by any of them and would then stop being "empty" for everyone
    // else. A fresh instance per call keeps each returned ack independent.

    /// <summary>
    /// Accepts the batch without forwarding it anywhere.
    /// </summary>
    /// <param name="batch">The batch of audit events; only checked for null.</param>
    /// <param name="ct">Unused.</param>
    /// <returns>A completed task holding a new, empty <see cref="IngestAck"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="batch"/> is null.</exception>
    public Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(batch);

        // Empty ack — no EventIds will be flipped to Forwarded, so rows stay
        // Pending until the real ClusterClientSiteAuditClient (or a test stub)
        // takes over.
        return Task.FromResult(new IngestAck());
    }

    /// <summary>
    /// Accepts the cached-telemetry batch without forwarding it anywhere.
    /// </summary>
    /// <param name="batch">The batch of cached-call telemetry packets; only checked for null.</param>
    /// <param name="ct">Unused.</param>
    /// <returns>A completed task holding a new, empty <see cref="IngestAck"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="batch"/> is null.</exception>
    public Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(batch);

        // Empty ack — same rationale as IngestAuditEventsAsync. The site still
        // writes the audit + tracking rows to its SQLite stores authoritatively;
        // central-side state only materialises once the real
        // ClusterClientSiteAuditClient (or a test stub) is wired in.
        return Task.FromResult(new IngestAck());
    }
}
|
||||
@@ -0,0 +1,464 @@
|
||||
using Akka.Actor;
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Site-side actor that drains the local SQLite audit queue and pushes Pending
|
||||
/// rows to central via two parallel transports:
|
||||
/// <list type="bullet">
|
||||
/// <item><description><c>IngestAuditEvents</c> for the audit-only path —
|
||||
/// sync ApiCall/DbWrite, NotifySend, InboundRequest and similar single-row
|
||||
/// lifecycle events.</description></item>
|
||||
/// <item><description><c>IngestCachedTelemetry</c> for the combined-telemetry
|
||||
/// path — cached-call lifecycle rows (<c>CachedSubmit</c>,
|
||||
/// <c>ApiCallCached</c>/<c>DbWriteCached</c>, <c>CachedResolve</c>) joined
|
||||
/// with the matching <c>OperationTracking</c> row, written at central as a
|
||||
/// single dual-write transaction (AuditLog + SiteCalls).</description></item>
|
||||
/// </list>
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The drain self-ticks via two private messages — <c>Drain</c> for the
|
||||
/// audit-only path and <c>CachedDrain</c> for the combined path — each
|
||||
/// scheduled independently. Cadence is options-driven:
|
||||
/// <c>BusyIntervalSeconds</c> when the previous drain found rows (or faulted —
|
||||
/// we want quick recovery), <c>IdleIntervalSeconds</c> when the queue was empty.
|
||||
/// The two drains share the same cadence configuration but advance their own
|
||||
/// timers so a stall on one path does not block the other.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Collaborators are injected as interfaces (<see cref="ISiteAuditQueue"/>,
|
||||
/// <see cref="ISiteStreamAuditClient"/>, optional
|
||||
/// <see cref="IOperationTrackingStore"/>) so unit tests substitute with
|
||||
/// NSubstitute and never touch real SQLite or gRPC. The
|
||||
/// <see cref="IOperationTrackingStore"/> is optional — central composition
|
||||
/// roots and tests that don't exercise the cached path can leave it null, in
|
||||
/// which case the cached-drain scheduler is never armed.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per Bundle D's brief, audit-write paths must be fail-safe — a thrown
|
||||
/// exception inside the actor MUST NOT crash it. Both Drain handlers wrap
|
||||
/// their pipelines in a top-level try/catch that logs and re-schedules; the
|
||||
/// actor's <see cref="SupervisorStrategy"/> defaults to
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultStrategy"/>'s Restart for
|
||||
/// child actors — but this actor has no children, so the catch is what
|
||||
/// matters.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// AuditLog-001: wires the previously-unreachable combined-telemetry transport.
|
||||
/// Prior to this the cached audit rows flowed through the audit-only drain via
|
||||
/// <c>IngestAuditEventsAsync</c> and the central <c>OnCachedTelemetryAsync</c>
|
||||
/// dual-write handler was dead production code; the operational <c>SiteCalls</c>
|
||||
/// half was never sent to central.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class SiteAuditTelemetryActor : ReceiveActor
|
||||
{
|
||||
private readonly ISiteAuditQueue _queue;
|
||||
private readonly ISiteStreamAuditClient _client;
|
||||
private readonly IOperationTrackingStore? _trackingStore;
|
||||
private readonly SiteAuditTelemetryOptions _options;
|
||||
private readonly ILogger<SiteAuditTelemetryActor> _logger;
|
||||
private ICancelable? _pendingTick;
|
||||
private ICancelable? _pendingCachedTick;
|
||||
// AuditLog-010: per-actor lifecycle CTS so an in-flight drain (queue read,
|
||||
// gRPC push, mark-forwarded write) is actually cancelled when the actor is
|
||||
// stopped — without it, a stuck IngestAuditEventsAsync would hold the
|
||||
// continuation through CoordinatedShutdown's actor-system terminate window.
|
||||
// Cancelled in PostStop; never reset (the actor is single-lifetime).
|
||||
// The same CTS gates the cached-drain pipeline (queue read + tracking
|
||||
// lookup + gRPC push) so both paths observe shutdown cooperatively.
|
||||
private readonly CancellationTokenSource _lifecycleCts = new();
|
||||
|
||||
/// <summary>
/// Stores the actor's collaborators and registers the asynchronous handlers
/// for the two private self-tick messages.
/// </summary>
/// <param name="queue">Site-local audit queue that the drains read from and mark forwarded.</param>
/// <param name="client">Client used to push batches to central.</param>
/// <param name="options">Options wrapper; its <c>Value</c> supplies the drain intervals and batch size.</param>
/// <param name="logger">Logger used for drain warnings and errors.</param>
/// <param name="trackingStore">
/// Optional operation tracking store. When it is null the cached-telemetry
/// drain is never scheduled (see <c>PreStart</c>) and only the audit-only
/// drain runs.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="queue"/>, <paramref name="client"/>,
/// <paramref name="options"/> or <paramref name="logger"/> is null.
/// </exception>
public SiteAuditTelemetryActor(
    ISiteAuditQueue queue,
    ISiteStreamAuditClient client,
    IOptions<SiteAuditTelemetryOptions> options,
    ILogger<SiteAuditTelemetryActor> logger,
    IOperationTrackingStore? trackingStore = null)
{
    // Required collaborators are validated in declaration order before
    // options.Value is touched.
    _queue = queue ?? throw new ArgumentNullException(nameof(queue));
    _client = client ?? throw new ArgumentNullException(nameof(client));
    if (options is null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    _options = options.Value;
    _trackingStore = trackingStore;

    ReceiveAsync<Drain>(_ => OnDrainAsync());
    ReceiveAsync<CachedDrain>(_ => OnCachedDrainAsync());
}
|
||||
|
||||
/// <inheritdoc />
protected override void PreStart()
{
    base.PreStart();

    // Both drains get their first tick one busy interval after start-up; a
    // drain that then finds nothing reschedules itself on the idle interval.
    var firstTick = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
    ScheduleNext(firstTick);

    // Without a tracking store the cached-telemetry drain is never armed.
    if (_trackingStore is null)
    {
        return;
    }

    ScheduleNextCached(firstTick);
}
|
||||
|
||||
/// <inheritdoc />
protected override void PostStop()
{
    // Drop any ticks still waiting in the scheduler.
    if (_pendingTick is not null)
    {
        _pendingTick.Cancel();
    }

    if (_pendingCachedTick is not null)
    {
        _pendingCachedTick.Cancel();
    }

    // AuditLog-010: signal the lifecycle token so the drains' awaited calls,
    // which all receive it, can observe the stop and unwind.
    try
    {
        _lifecycleCts.Cancel();
    }
    catch (ObjectDisposedException)
    {
        // Already disposed by an earlier stop path; nothing left to cancel.
    }

    _lifecycleCts.Dispose();
    base.PostStop();
}
|
||||
|
||||
/// <summary>
/// Audit-only drain: reads up to <c>BatchSize</c> pending rows from the
/// queue, pushes them via <c>IngestAuditEventsAsync</c>, and marks the ids
/// returned in the ack as forwarded. Always re-arms the next tick unless the
/// actor is shutting down.
/// </summary>
/// <remarks>
/// The awaits deliberately do NOT use <c>ConfigureAwait(false)</c>. The
/// <c>finally</c> block calls <c>ScheduleNext</c>, which reads
/// <c>Context</c> and <c>Self</c>; Akka.NET only exposes those while the
/// continuation runs on the actor's own task scheduler. Resuming on a
/// thread-pool thread makes them throw <see cref="NotSupportedException"/>
/// out of the handler, which defeats the "never crash the actor" contract.
/// </remarks>
private async Task OnDrainAsync()
{
    var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);

    // AuditLog-010: every dependency call gets the per-actor lifecycle token
    // so PostStop cancellation reaches the queue read, the push, and the
    // mark-forwarded write.
    var ct = _lifecycleCts.Token;
    try
    {
        var pending = await _queue.ReadPendingAsync(_options.BatchSize, ct);
        if (pending.Count == 0)
        {
            // Nothing to send: fall back to the idle cadence.
            nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
            return;
        }

        var batch = BuildBatch(pending);

        IngestAck ack;
        try
        {
            ack = await _client.IngestAuditEventsAsync(batch, ct);
        }
        catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
        {
            // Push fault: rows are not marked forwarded, so the next drain
            // (scheduled on the busy interval) retries them. Shutdown
            // cancellation is excluded by the filter and handled quietly
            // below.
            _logger.LogWarning(ex,
                "IngestAuditEvents push failed for {Count} pending events; will retry next drain.",
                pending.Count);
            return;
        }

        var acceptedIds = ParseAcceptedIds(ack);
        if (acceptedIds.Count > 0)
        {
            await _queue.MarkForwardedAsync(acceptedIds, ct);
        }
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // The actor is stopping; an interrupted drain is expected and is not
        // an error. Unforwarded rows were never marked, so nothing is lost.
    }
    catch (Exception ex)
    {
        // Catch-all so a queue or mapping failure never escapes the handler.
        // The next tick is still scheduled in the finally block.
        _logger.LogError(ex, "Unexpected error during audit-log telemetry drain.");
    }
    finally
    {
        // AuditLog-010: once shutdown has been requested, do not arm another
        // tick.
        if (!_lifecycleCts.IsCancellationRequested)
        {
            ScheduleNext(nextDelay);
        }
    }
}
|
||||
|
||||
/// <summary>
/// AuditLog-001: combined-telemetry drain. Reads pending cached-lifecycle
/// audit rows, looks up the <see cref="IOperationTrackingStore"/> snapshot
/// for each row's <c>CorrelationId</c>, builds a
/// <see cref="CachedTelemetryBatch"/>, and pushes it via
/// <see cref="ISiteStreamAuditClient.IngestCachedTelemetryAsync"/>. Ids
/// returned in the ack are marked forwarded.
/// </summary>
/// <remarks>
/// <para>
/// A row is logged and skipped when it has no <c>CorrelationId</c>, when the
/// tracking lookup throws, or when no snapshot exists. Skipped rows are never
/// added to the batch, so they cannot be acknowledged and are not passed to
/// <c>MarkForwardedAsync</c> by this drain.
/// </para>
/// <para>
/// The awaits deliberately do NOT use <c>ConfigureAwait(false)</c>. The
/// <c>finally</c> block calls <c>ScheduleNextCached</c>, which reads
/// <c>Context</c> and <c>Self</c>; Akka.NET only exposes those while the
/// continuation runs on the actor's own task scheduler.
/// </para>
/// </remarks>
private async Task OnCachedDrainAsync()
{
    var nextDelay = TimeSpan.FromSeconds(_options.BusyIntervalSeconds);
    var ct = _lifecycleCts.Token;
    try
    {
        // The cached scheduler is only armed when a tracking store was
        // supplied (see PreStart); the check also satisfies null-flow
        // analysis.
        if (_trackingStore is null)
        {
            return;
        }

        var pending = await _queue.ReadPendingCachedTelemetryAsync(_options.BatchSize, ct);
        if (pending.Count == 0)
        {
            nextDelay = TimeSpan.FromSeconds(_options.IdleIntervalSeconds);
            return;
        }

        var batch = new CachedTelemetryBatch();

        foreach (var auditRow in pending)
        {
            if (auditRow.CorrelationId is null)
            {
                // The CorrelationId is the key for the tracking lookup;
                // without it the row cannot be joined. Skip it so one bad
                // row does not block the rest of the batch.
                _logger.LogWarning(
                    "Cached-telemetry drain: audit row {EventId} ({Kind}) has no CorrelationId; skipping.",
                    auditRow.EventId, auditRow.Kind);
                continue;
            }

            TrackingStatusSnapshot? snapshot;
            try
            {
                snapshot = await _trackingStore
                    .GetStatusAsync(new TrackedOperationId(auditRow.CorrelationId.Value), ct);
            }
            catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
            {
                // A tracking-store fault must not abort the remaining rows.
                // Shutdown cancellation is excluded by the filter so the
                // loop stops immediately instead of logging one warning per
                // remaining row.
                _logger.LogWarning(ex,
                    "Cached-telemetry drain: tracking lookup threw for {EventId} (TrackedOperationId {Tid}); skipping.",
                    auditRow.EventId, auditRow.CorrelationId);
                continue;
            }

            if (snapshot is null)
            {
                // No tracking row to pair with this audit row; skip the
                // combined push for it.
                _logger.LogWarning(
                    "Cached-telemetry drain: no tracking snapshot for {EventId} (TrackedOperationId {Tid}); skipping.",
                    auditRow.EventId, auditRow.CorrelationId);
                continue;
            }

            batch.Packets.Add(BuildCachedPacket(auditRow, snapshot));
        }

        if (batch.Packets.Count == 0)
        {
            // Every row was skipped; nothing to push. The next tick stays on
            // the busy interval.
            return;
        }

        IngestAck ack;
        try
        {
            ack = await _client.IngestCachedTelemetryAsync(batch, ct);
        }
        catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
        {
            _logger.LogWarning(ex,
                "IngestCachedTelemetry push failed for {Count} cached events; will retry next drain.",
                batch.Packets.Count);
            return;
        }

        var acceptedIds = ParseAcceptedIds(ack);
        if (acceptedIds.Count > 0)
        {
            await _queue.MarkForwardedAsync(acceptedIds, ct);
        }
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // The actor is stopping; an interrupted drain is expected and is not
        // an error.
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Unexpected error during cached-telemetry drain.");
    }
    finally
    {
        if (!_lifecycleCts.IsCancellationRequested && _trackingStore is not null)
        {
            ScheduleNextCached(nextDelay);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Maps each audit row to its wire DTO and collects the DTOs, in input order,
/// into a new <see cref="AuditEventBatch"/>.
/// </summary>
/// <param name="events">Rows to include in the batch.</param>
/// <returns>A batch containing one DTO per input row.</returns>
private static AuditEventBatch BuildBatch(IReadOnlyList<AuditEvent> events)
{
    var wireBatch = new AuditEventBatch();
    for (var index = 0; index < events.Count; index++)
    {
        var dto = AuditEventDtoMapper.ToDto(events[index]);
        wireBatch.Events.Add(dto);
    }

    return wireBatch;
}
|
||||
|
||||
/// <summary>
/// AuditLog-001: builds one combined wire packet from a cached audit row and
/// its tracking snapshot. The operational half is populated from the
/// snapshot (status, retry count, last error, timestamps); channel and
/// source site come from the audit row; the target prefers the audit row and
/// falls back to the snapshot's target summary.
/// </summary>
/// <param name="auditRow">The cached-lifecycle audit row being forwarded.</param>
/// <param name="snapshot">The tracking snapshot looked up for that row.</param>
/// <returns>A packet carrying the audit DTO and the operational DTO.</returns>
private static CachedTelemetryPacket BuildCachedPacket(
    AuditEvent auditRow, TrackingStatusSnapshot snapshot)
{
    var operational = new SiteCallOperationalDto
    {
        TrackedOperationId = snapshot.Id.Value.ToString("D"),
        // The channel is sent in its ToString() form.
        Channel = auditRow.Channel.ToString(),
        Target = auditRow.Target ?? snapshot.TargetSummary ?? string.Empty,
        SourceSite = auditRow.SourceSiteId ?? string.Empty,
        SourceNode = snapshot.SourceNode ?? string.Empty,
        Status = snapshot.Status,
        RetryCount = snapshot.RetryCount,
        LastError = snapshot.LastError ?? string.Empty,
        CreatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.CreatedAtUtc)),
        UpdatedAtUtc = Timestamp.FromDateTime(EnsureUtc(snapshot.UpdatedAtUtc)),
    };

    // Optional fields are assigned only when the snapshot carries a value, so
    // an absent value is left unset on the DTO.
    if (snapshot.HttpStatus is { } httpStatus)
    {
        operational.HttpStatus = httpStatus;
    }

    if (snapshot.TerminalAtUtc is { } terminalAt)
    {
        operational.TerminalAtUtc = Timestamp.FromDateTime(EnsureUtc(terminalAt));
    }

    var packet = new CachedTelemetryPacket
    {
        AuditEvent = AuditEventDtoMapper.ToDto(auditRow),
        Operational = operational,
    };
    return packet;
}
|
||||
|
||||
/// <summary>
/// Returns <paramref name="value"/> as a <see cref="DateTimeKind.Utc"/>
/// <see cref="DateTime"/>.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item><description><see cref="DateTimeKind.Utc"/>: returned unchanged.</description></item>
/// <item><description><see cref="DateTimeKind.Local"/>: converted with <see cref="DateTime.ToUniversalTime"/>.</description></item>
/// <item><description><see cref="DateTimeKind.Unspecified"/>: the wall-clock
/// value is kept and only the kind is stamped as UTC. Calling
/// <see cref="DateTime.ToUniversalTime"/> here would treat the value as local
/// time and shift it by the host's UTC offset. Every caller in this class
/// passes a <c>*AtUtc</c> snapshot field — assumes those are stored as UTC;
/// confirm against the tracking store.</description></item>
/// </list>
/// </remarks>
/// <param name="value">Timestamp to normalise.</param>
/// <returns>The same instant with <see cref="DateTimeKind.Utc"/>.</returns>
private static DateTime EnsureUtc(DateTime value) =>
    value.Kind switch
    {
        DateTimeKind.Utc => value,
        DateTimeKind.Local => value.ToUniversalTime(),
        _ => DateTime.SpecifyKind(value, DateTimeKind.Utc),
    };
|
||||
|
||||
/// <summary>
/// Converts the string ids carried by an ack into <see cref="Guid"/> values.
/// Entries that do not parse as a GUID are dropped silently so a malformed
/// id never throws.
/// </summary>
/// <param name="ack">Ack whose <c>AcceptedEventIds</c> are parsed.</param>
/// <returns>
/// The successfully parsed ids in ack order; an empty array when the ack
/// carries no ids.
/// </returns>
private static IReadOnlyList<Guid> ParseAcceptedIds(IngestAck ack)
{
    var rawIds = ack.AcceptedEventIds;
    if (rawIds.Count == 0)
    {
        return Array.Empty<Guid>();
    }

    var parsed = new List<Guid>(rawIds.Count);
    foreach (var candidate in rawIds)
    {
        if (!Guid.TryParse(candidate, out var eventId))
        {
            // Not a GUID: ignore it rather than fail the drain.
            continue;
        }

        parsed.Add(eventId);
    }

    return parsed;
}
|
||||
|
||||
/// <summary>
/// Replaces any pending audit-only tick with a single <c>Drain</c> message
/// sent to this actor after <paramref name="delay"/>.
/// </summary>
/// <param name="delay">Time to wait before the next drain.</param>
private void ScheduleNext(TimeSpan delay)
{
    // Cancel the previously armed tick so at most one is outstanding.
    var previous = _pendingTick;
    if (previous is not null)
    {
        previous.Cancel();
    }

    var self = Self;
    var scheduler = Context.System.Scheduler;
    _pendingTick = scheduler.ScheduleTellOnceCancelable(delay, self, Drain.Instance, self);
}
|
||||
|
||||
/// <summary>
/// Replaces any pending cached-telemetry tick with a single
/// <c>CachedDrain</c> message sent to this actor after
/// <paramref name="delay"/>.
/// </summary>
/// <param name="delay">Time to wait before the next cached drain.</param>
private void ScheduleNextCached(TimeSpan delay)
{
    // Cancel the previously armed tick so at most one is outstanding.
    var previous = _pendingCachedTick;
    if (previous is not null)
    {
        previous.Cancel();
    }

    var self = Self;
    var scheduler = Context.System.Scheduler;
    _pendingCachedTick = scheduler.ScheduleTellOnceCancelable(delay, self, CachedDrain.Instance, self);
}
|
||||
|
||||
/// <summary>
/// Marker message the actor sends to itself to start one audit-only drain
/// cycle. It carries no data, so a single shared instance is reused.
/// </summary>
private sealed class Drain
{
    /// <summary>The only instance; the constructor is private.</summary>
    public static readonly Drain Instance = new Drain();

    private Drain()
    {
    }
}
|
||||
|
||||
/// <summary>
/// Marker message the actor sends to itself to start one combined-telemetry
/// drain cycle. AuditLog-001: a separate message type from the audit-only
/// tick so each drain is scheduled on its own timer. It carries no data, so
/// a single shared instance is reused.
/// </summary>
private sealed class CachedDrain
{
    /// <summary>The only instance; the constructor is private.</summary>
    public static readonly CachedDrain Instance = new CachedDrain();

    private CachedDrain()
    {
    }
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
/// Settings for the <see cref="SiteAuditTelemetryActor"/> drain loop: how
/// many rows go into one push and how long to wait between drains. By
/// default a drain runs every 5 s while rows are flowing (busy) and every
/// 30 s while the queue is empty (idle).
/// </summary>
public sealed class SiteAuditTelemetryOptions
{
    private const int DefaultBatchSize = 256;
    private const int DefaultBusyIntervalSeconds = 5;
    private const int DefaultIdleIntervalSeconds = 30;

    /// <summary>
    /// Upper bound on the number of
    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.AuditEvent"/>
    /// rows read from the site queue and sent in one batch. Defaults to 256.
    /// </summary>
    public int BatchSize { get; set; } = DefaultBatchSize;

    /// <summary>
    /// Seconds to wait before the next drain when the previous one found
    /// pending rows or the previous push faulted. Kept short so telemetry
    /// keeps flowing and transient push errors are retried quickly. Defaults
    /// to 5.
    /// </summary>
    public int BusyIntervalSeconds { get; set; } = DefaultBusyIntervalSeconds;

    /// <summary>
    /// Seconds to wait before the next drain when the previous one found no
    /// pending rows. Longer than the busy interval to avoid polling an idle
    /// queue and channel. Defaults to 30.
    /// </summary>
    public int IdleIntervalSeconds { get; set; } = DefaultIdleIntervalSeconds;
}
|
||||
Reference in New Issue
Block a user