refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,543 @@
+using System.Collections.Concurrent;
+using System.Threading.Channels;
+using Akka.Actor;
+using Grpc.Core;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
+using GrpcStatus = Grpc.Core.Status;
+
+namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
+
+/// <summary>
+/// gRPC service that accepts instance stream subscriptions from central nodes.
+/// Creates a StreamRelayActor per subscription to bridge Akka domain events
+/// through a Channel&lt;T&gt; to the gRPC response stream.
+/// </summary>
+public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
+{
+    private readonly ISiteStreamSubscriber _streamSubscriber;
+    private ActorSystem? _actorSystem;
+    private readonly ILogger<SiteStreamGrpcServer> _logger;
+    private readonly ConcurrentDictionary<string, StreamEntry> _activeStreams = new();
+    private readonly int _maxConcurrentStreams;
+    private readonly TimeSpan _maxStreamLifetime;
+    private volatile bool _ready;
+    // Host-017 / REQ-HOST-7: flipped by CancelAllStreams() when the host enters
+    // CoordinatedShutdown so SubscribeInstance refuses new streams with
+    // Unavailable before the actor system tears down. Strictly monotonic — once
+    // true, never reset (the server is single-lifetime per host).
+    private volatile bool _shuttingDown;
+    private long _actorCounter;
+    // Audit Log (#23 M2): central-side ingest actor proxy. Set by the host
+    // after the cluster singleton starts (see Bundle E wiring). When null the
+    // IngestAuditEvents RPC replies with an empty IngestAck so sites can
+    // safely retry — wiring-incomplete is treated as transient, never fatal.
+    private IActorRef? _auditIngestActor;
+    // Per Bundle D's brief — Ask timeout is 30 s. The ingest actor's repo
+    // calls are sub-100 ms in steady state; a generous timeout absorbs a slow
+    // MSSQL connection without surfacing as a gRPC failure on a healthy site.
+    private static readonly TimeSpan AuditIngestAskTimeout = TimeSpan.FromSeconds(30);
+    // Audit Log (#23 M6): site-local queue handed in by AkkaHostedService on
+    // site roles so the central reconciliation puller's PullAuditEvents RPC
+    // can read Pending/Forwarded rows. Null when not wired (e.g. central-only
+    // host or test composing the server in isolation) — the handler treats
+    // the missing queue as "nothing to ship" and returns an empty response so
+    // central retries on its next reconciliation cycle.
+    private ISiteAuditQueue? _siteAuditQueue;
+
+    /// <summary>
+    /// Test-only constructor — kept <c>internal</c> so the DI container sees a
+    /// single public constructor and is not faced with an ambiguous choice.
+    /// </summary>
+    /// <param name="streamSubscriber">The stream subscriber for managing subscriptions.</param>
+    /// <param name="logger">The logger instance.</param>
+    /// <param name="maxConcurrentStreams">The maximum concurrent streams (default 100).</param>
+    internal SiteStreamGrpcServer(
+        ISiteStreamSubscriber streamSubscriber,
+        ILogger<SiteStreamGrpcServer> logger,
+        int maxConcurrentStreams = 100)
+        : this(streamSubscriber, logger, maxConcurrentStreams, TimeSpan.FromHours(4))
+    {
+    }
+
+    /// <summary>
+    /// DI constructor — binds <see cref="CommunicationOptions.GrpcMaxConcurrentStreams"/>
+    /// and <see cref="CommunicationOptions.GrpcMaxStreamLifetime"/> so the documented
+    /// concurrency limit and the 4-hour zombie-stream session timeout are honoured
+    /// rather than hard-coded.
+    /// </summary>
+    /// <param name="streamSubscriber">The stream subscriber for managing subscriptions.</param>
+    /// <param name="logger">The logger instance.</param>
+    /// <param name="options">Communication options containing stream limits and timeouts.</param>
+    public SiteStreamGrpcServer(
+        ISiteStreamSubscriber streamSubscriber,
+        ILogger<SiteStreamGrpcServer> logger,
+        IOptions<CommunicationOptions> options)
+        : this(streamSubscriber, logger,
+            options.Value.GrpcMaxConcurrentStreams,
+            options.Value.GrpcMaxStreamLifetime)
+    {
+    }
+
+    private SiteStreamGrpcServer(
+        ISiteStreamSubscriber streamSubscriber,
+        ILogger<SiteStreamGrpcServer> logger,
+        int maxConcurrentStreams,
+        TimeSpan maxStreamLifetime)
+    {
+        _streamSubscriber = streamSubscriber;
+        _logger = logger;
+        _maxConcurrentStreams = maxConcurrentStreams;
+        _maxStreamLifetime = maxStreamLifetime;
+    }
+
+    /// <summary>
+    /// Marks the server as ready to accept subscriptions and injects the ActorSystem.
+    /// Called after the site runtime actor system is fully initialized.
+    /// The ActorSystem is set here rather than via the constructor so that
+    /// the gRPC server can be created by DI before the actor system exists.
+    /// </summary>
+    /// <param name="actorSystem">The initialized Akka actor system.</param>
+    public void SetReady(ActorSystem actorSystem)
+    {
+        _actorSystem = actorSystem;
+        _ready = true;
+    }
+
+    /// <summary>
+    /// Hands the central-side <c>AuditLogIngestActor</c> proxy to the gRPC
+    /// server so the <see cref="IngestAuditEvents"/> RPC can route incoming
+    /// site batches. Audit Log (#23) M2 wiring point — mirrors the way
+    /// <c>CommunicationService.SetNotificationOutbox</c> takes the Notification
+    /// Outbox singleton proxy. Bundle E supplies the actor after the cluster
+    /// singleton starts.
+    /// </summary>
+    /// <param name="proxy">The audit log ingest actor proxy.</param>
+    public void SetAuditIngestActor(IActorRef proxy)
+    {
+        _auditIngestActor = proxy;
+    }
+
+    /// <summary>
+    /// Hands the site-local <see cref="ISiteAuditQueue"/> (the same
+    /// <c>SqliteAuditWriter</c> singleton that backs <see cref="IAuditWriter"/>
+    /// on the script thread) to the gRPC server so the M6
+    /// <see cref="PullAuditEvents"/> RPC can serve central's reconciliation
+    /// pulls. Mirrors <see cref="SetAuditIngestActor"/>: wired post-construction
+    /// because the queue and the gRPC server are both DI singletons brought up
+    /// in independent orders on site startup.
+    /// </summary>
+    /// <param name="queue">The site audit queue for serving reconciliation pulls.</param>
+    public void SetSiteAuditQueue(ISiteAuditQueue queue)
+    {
+        _siteAuditQueue = queue;
+    }
+
+    /// <summary>
+    /// Host-017 / REQ-HOST-7: signals the gRPC server to begin its part of the
+    /// site shutdown sequence — refuse new <see cref="SubscribeInstance"/>
+    /// streams with <see cref="StatusCode.Unavailable"/> and cancel every
+    /// active stream so its <c>await foreach</c> observes
+    /// <see cref="OperationCanceledException"/> and the response stream
+    /// completes with <c>Cancelled</c> on the client. Idempotent — safe to call
+    /// more than once. Invoked from the site host's
+    /// <c>IHostApplicationLifetime.ApplicationStopping</c> callback BEFORE
+    /// Akka's <c>CoordinatedShutdown</c> runs, so in-flight clients get a
+    /// clean cancellation they can reconnect on rather than a silent stream
+    /// that only times out via gRPC keepalive.
+    /// </summary>
+    public void CancelAllStreams()
+    {
+        _shuttingDown = true;
+        foreach (var entry in _activeStreams.Values)
+        {
+            try
+            {
+                entry.Cts.Cancel();
+            }
+            catch (ObjectDisposedException)
+            {
+                // Already cleaned up by its own finally — nothing to do.
+            }
+        }
+    }
+
+    /// <summary>
+    /// Host-017: exposed for test assertions on the shutdown state.
+    /// </summary>
+    internal bool IsShuttingDown => _shuttingDown;
+
+    /// <summary>
+    /// Number of currently active streaming subscriptions. Exposed for diagnostics.
+    /// </summary>
+    public int ActiveStreamCount => _activeStreams.Count;
+
+    /// <summary>Effective max concurrent stream limit. Exposed for tests.</summary>
+    internal int MaxConcurrentStreams => _maxConcurrentStreams;
+
+    /// <summary>Effective per-stream session lifetime. Exposed for tests.</summary>
+    internal TimeSpan MaxStreamLifetime => _maxStreamLifetime;
+
+    /// <inheritdoc />
+    public override async Task SubscribeInstance(
+        InstanceStreamRequest request,
+        IServerStreamWriter<SiteStreamEvent> responseStream,
+        ServerCallContext context)
+    {
+        if (!_ready)
+            throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server not ready"));
+
+        // Host-017 / REQ-HOST-7: refuse new subscriptions during shutdown so
+        // CoordinatedShutdown can quiesce without racing fresh streams.
+        if (_shuttingDown)
+            throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server shutting down"));
+
+        // Communication-014: correlation_id arrives off the wire on a public gRPC
+        // endpoint and is used (below) to compose an Akka actor name. Akka actor names
+        // have a restricted character set — a id containing '/', whitespace, or other
+        // disallowed characters would make ActorOf throw InvalidActorNameException,
+        // escaping as an unhandled RPC fault. Reject unsafe ids cleanly up front.
+        if (string.IsNullOrEmpty(request.CorrelationId) ||
+            !ActorPath.IsValidPathElement(request.CorrelationId))
+        {
+            throw new RpcException(new GrpcStatus(
+                StatusCode.InvalidArgument, "correlation_id is missing or not a valid identifier"));
+        }
+
+        // Duplicate prevention -- cancel existing stream for this correlationId
+        if (_activeStreams.TryRemove(request.CorrelationId, out var existingEntry))
+        {
+            existingEntry.Cts.Cancel();
+            existingEntry.Cts.Dispose();
+        }
+
+        // Check max concurrent streams after duplicate removal
+        if (_activeStreams.Count >= _maxConcurrentStreams)
+            throw new RpcException(new GrpcStatus(StatusCode.ResourceExhausted, "Max concurrent streams reached"));
+
+        using var streamCts = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken);
+        // Session timeout (design doc "gRPC Connection Keepalive": 4-hour third layer
+        // of dead-client detection) — forces a long-lived zombie stream to terminate
+        // even if keepalive PINGs never detect the loss.
+        if (_maxStreamLifetime > TimeSpan.Zero && _maxStreamLifetime != Timeout.InfiniteTimeSpan)
+            streamCts.CancelAfter(_maxStreamLifetime);
+        var entry = new StreamEntry(streamCts);
+        _activeStreams[request.CorrelationId] = entry;
+
+        var channel = Channel.CreateBounded<SiteStreamEvent>(
+            new BoundedChannelOptions(1000) { FullMode = BoundedChannelFullMode.DropOldest });
+
+        var actorSeq = Interlocked.Increment(ref _actorCounter);
+        var relayActor = _actorSystem!.ActorOf(
+            Props.Create(typeof(Actors.StreamRelayActor), request.CorrelationId, channel.Writer),
+            $"stream-relay-{request.CorrelationId}-{actorSeq}");
+
+        // Communication-021: the previous code called _streamSubscriber.Subscribe
+        // OUTSIDE the try block that owns relay-actor cleanup. If Subscribe threw
+        // (stale instance name, index lookup fault, site runtime shutting down),
+        // the freshly-created relay actor, the _activeStreams entry, the
+        // StreamEntry.Cts, and the Channel<SiteStreamEvent> all leaked because the
+        // finally never ran. Wrap Subscribe in its own try so any throw deterministically
+        // stops the relay actor, removes the activeStreams entry, and completes the
+        // channel before the RpcException escapes to the caller.
+        string subscriptionId;
+        try
+        {
+            subscriptionId = _streamSubscriber.Subscribe(request.InstanceUniqueName, relayActor);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex,
+                "Subscribe failed for {Instance} (correlation {CorrelationId}); cleaning up relay actor.",
+                request.InstanceUniqueName, request.CorrelationId);
+            _actorSystem!.Stop(relayActor);
+            channel.Writer.TryComplete();
+            _activeStreams.TryRemove(
+                new KeyValuePair<string, StreamEntry>(request.CorrelationId, entry));
+            throw;
+        }
+
+        _logger.LogInformation(
+            "Stream {CorrelationId} started for {Instance} (subscription {SubscriptionId})",
+            request.CorrelationId, request.InstanceUniqueName, subscriptionId);
+
+        try
+        {
+            await foreach (var evt in channel.Reader.ReadAllAsync(streamCts.Token))
+            {
+                await responseStream.WriteAsync(evt, streamCts.Token);
+            }
+        }
+        catch (OperationCanceledException)
+        {
+            // Normal cancellation (client disconnect or duplicate replacement)
+        }
+        finally
+        {
+            _streamSubscriber.RemoveSubscriber(relayActor);
+            _actorSystem!.Stop(relayActor);
+            channel.Writer.TryComplete();
+
+            // Only remove our own entry -- a replacement stream may have already taken the slot
+            _activeStreams.TryRemove(
+                new KeyValuePair<string, StreamEntry>(request.CorrelationId, entry));
+
+            _logger.LogInformation(
+                "Stream {CorrelationId} for {Instance} ended",
+                request.CorrelationId, request.InstanceUniqueName);
+        }
+    }
+
+    /// <summary>
+    /// Audit Log (#23) M2 site→central push RPC. Decodes a site batch into
+    /// <see cref="AuditEvent"/> rows, Asks the central <c>AuditLogIngestActor</c>
+    /// proxy to persist them, and echoes the accepted EventIds back so the site
+    /// can flip its local rows to <c>Forwarded</c>.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// The DTO→entity conversion uses the shared <see cref="AuditEventDtoMapper"/>
+    /// (hosted in <c>ZB.MOM.WW.ScadaBridge.Communication</c> so both this server and
+    /// <c>ZB.MOM.WW.ScadaBridge.AuditLog</c> share one implementation without a
+    /// project-reference cycle).
+    /// </para>
+    /// <para>
+    /// When <see cref="_auditIngestActor"/> is not yet wired (host startup
+    /// race window), the RPC returns an empty <see cref="IngestAck"/> rather
+    /// than failing — the site treats the missing ack as a transient outcome
+    /// and retries on the next drain, which is the desired idempotent
+    /// behaviour.
+    /// </para>
+    /// </remarks>
+    /// <inheritdoc />
+    /// <param name="request">The audit event batch to ingest.</param>
+    /// <param name="context">The server call context.</param>
+    public override async Task<IngestAck> IngestAuditEvents(
+        AuditEventBatch request,
+        ServerCallContext context)
+    {
+        // Empty batch is a no-op; reply immediately so the client moves on.
+        if (request.Events.Count == 0)
+        {
+            return new IngestAck();
+        }
+
+        var actor = _auditIngestActor;
+        if (actor is null)
+        {
+            // Wiring incomplete (host startup race). Sites treat an empty
+            // ack as "nothing was acked, leave rows Pending, retry next
+            // drain" — exactly the right behaviour during host bring-up.
+            _logger.LogWarning(
+                "IngestAuditEvents received {Count} events before SetAuditIngestActor was called; returning empty ack.",
+                request.Events.Count);
+            return new IngestAck();
+        }
+
+        var entities = new List<AuditEvent>(request.Events.Count);
+        foreach (var dto in request.Events)
+        {
+            entities.Add(AuditEventDtoMapper.FromDto(dto));
+        }
+
+        var cmd = new IngestAuditEventsCommand(entities);
+        IngestAuditEventsReply reply;
+        try
+        {
+            reply = await actor.Ask<IngestAuditEventsReply>(
+                cmd, AuditIngestAskTimeout, context.CancellationToken);
+        }
+        catch (Exception ex)
+        {
+            // Audit ingest is best-effort; failing this RPC at the gRPC layer
+            // would surface as a transport error and force the site to retry
+            // (which it would do anyway). Logging + an empty ack keeps the
+            // semantics consistent with the "wiring incomplete" path above.
+            _logger.LogError(ex,
+                "AuditLogIngestActor Ask failed for batch of {Count} events; returning empty ack.",
+                request.Events.Count);
+            return new IngestAck();
+        }
+
+        var ack = new IngestAck();
+        foreach (var id in reply.AcceptedEventIds)
+        {
+            ack.AcceptedEventIds.Add(id.ToString());
+        }
+        return ack;
+    }
+
+    /// <summary>
+    /// Audit Log (#23) M3 site→central combined-telemetry push RPC. Decodes a
+    /// batch of <see cref="CachedTelemetryPacket"/> entries into matched
+    /// (AuditEvent, SiteCall) pairs, Asks the central <c>AuditLogIngestActor</c>
+    /// proxy to persist them in dual-write transactions, and echoes the
+    /// AuditEvent EventIds that committed back so the site can flip its local
+    /// rows to <c>Forwarded</c>.
+    /// </summary>
+    /// <remarks>
+    /// Same wiring-incomplete fallback as <see cref="IngestAuditEvents"/>: when
+    /// the actor proxy has not been set the RPC replies with an empty ack so
+    /// sites treat the outcome as transient and retry, never a hard fault.
+    /// </remarks>
+    /// <inheritdoc />
+    /// <param name="request">The cached telemetry batch to ingest.</param>
+    /// <param name="context">The server call context.</param>
+    public override async Task<IngestAck> IngestCachedTelemetry(
+        CachedTelemetryBatch request,
+        ServerCallContext context)
+    {
+        if (request.Packets.Count == 0)
+        {
+            return new IngestAck();
+        }
+
+        var actor = _auditIngestActor;
+        if (actor is null)
+        {
+            _logger.LogWarning(
+                "IngestCachedTelemetry received {Count} packets before SetAuditIngestActor was called; returning empty ack.",
+                request.Packets.Count);
+            return new IngestAck();
+        }
+
+        var entries = new List<CachedTelemetryEntry>(request.Packets.Count);
+        foreach (var packet in request.Packets)
+        {
+            var auditEvent = AuditEventDtoMapper.FromDto(packet.AuditEvent);
+            var siteCall = SiteCallDtoMapper.FromDto(packet.Operational);
+            entries.Add(new CachedTelemetryEntry(auditEvent, siteCall));
+        }
+
+        var cmd = new IngestCachedTelemetryCommand(entries);
+        IngestCachedTelemetryReply reply;
+        try
+        {
+            reply = await actor.Ask<IngestCachedTelemetryReply>(
+                cmd, AuditIngestAskTimeout, context.CancellationToken);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex,
+                "AuditLogIngestActor Ask failed for combined telemetry batch of {Count} packets; returning empty ack.",
+                request.Packets.Count);
+            return new IngestAck();
+        }
+
+        var ack = new IngestAck();
+        foreach (var id in reply.AcceptedEventIds)
+        {
+            ack.AcceptedEventIds.Add(id.ToString());
+        }
+        return ack;
+    }
+
+    /// <summary>
+    /// Audit Log (#23) M6 reconciliation pull RPC. Central asks the site for any
+    /// AuditLog rows whose <c>OccurredAtUtc &gt;= since_utc</c> and whose
+    /// <c>ForwardState</c> is still <c>Pending</c> or <c>Forwarded</c> (i.e. not
+    /// yet confirmed reconciled), bounded by <c>batch_size</c>. The site responds
+    /// with the rows AND flips them to
+    /// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Reconciled"/>
+    /// AFTER serializing the response. The flip is best-effort — if it fails
+    /// (e.g. SQLite disposed mid-call), rows stay Pending/Forwarded and central
+    /// pulls them again on the next reconciliation cycle. Idempotent.
+    /// </summary>
+    /// <remarks>
+    /// When <see cref="_siteAuditQueue"/> is not wired (central-only host or a
+    /// composition-root test exercising the server in isolation) the RPC returns
+    /// an empty response — central treats that as "nothing to ship" and retries
+    /// on its next cycle, which is the same self-healing semantics as the
+    /// SetAuditIngestActor wiring race window.
+    /// </remarks>
+    /// <inheritdoc />
+    /// <param name="request">The pull request with time bounds and batch size.</param>
+    /// <param name="context">The server call context.</param>
+    public override async Task<PullAuditEventsResponse> PullAuditEvents(
+        PullAuditEventsRequest request,
+        ServerCallContext context)
+    {
+        var queue = _siteAuditQueue;
+        if (queue is null)
+        {
+            _logger.LogWarning(
+                "PullAuditEvents invoked before SetSiteAuditQueue was called; returning empty response.");
+            return new PullAuditEventsResponse();
+        }
+
+        if (request.BatchSize <= 0)
+        {
+            // Mirrors the SubscribeInstance guard: reject malformed requests
+            // cleanly with InvalidArgument so the caller doesn't see a generic
+            // RpcException from the underlying SQLite parameter validation.
+            throw new RpcException(new GrpcStatus(
+                StatusCode.InvalidArgument, "batch_size must be > 0"));
+        }
+
+        // sinceUtc defaults to DateTime.MinValue when the wrapper is absent —
+        // i.e. "pull from the beginning of recorded history", which is the
+        // intended behaviour for the very first reconciliation cycle.
+        var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue;
+
+        IReadOnlyList<AuditEvent> events;
+        try
+        {
+            events = await queue.ReadPendingSinceAsync(
+                since, request.BatchSize, context.CancellationToken);
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex,
+                "ReadPendingSinceAsync failed for since={Since} batch={Batch}; returning empty response.",
+                since, request.BatchSize);
+            return new PullAuditEventsResponse();
+        }
+
+        var response = new PullAuditEventsResponse
+        {
+            // batch_size saturated → tell central to issue a follow-up pull
+            // with an advanced cursor. The site doesn't compute the cursor —
+            // central walks it forward from the last returned OccurredAtUtc.
+            MoreAvailable = events.Count >= request.BatchSize,
+        };
+        foreach (var evt in events)
+        {
+            response.Events.Add(AuditEventDtoMapper.ToDto(evt));
+        }
+
+        // Flip to Reconciled AFTER projecting the response so a fault below the
+        // try/catch (mid-response, mid-flip) leaves the rows in Pending/Forwarded
+        // and central pulls them again next cycle. The flip itself is
+        // best-effort — its failure is a warning, not a fault, because central
+        // will dedup on EventId on the next pull.
+        var ids = new List<Guid>(events.Count);
+        foreach (var evt in events)
+        {
+            ids.Add(evt.EventId);
+        }
+
+        if (ids.Count > 0)
+        {
+            try
+            {
+                await queue.MarkReconciledAsync(ids, context.CancellationToken);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogWarning(ex,
+                    "MarkReconciledAsync failed after PullAuditEvents response of {Count} rows; rows stay Pending for retry.",
+                    ids.Count);
+            }
+        }
+
+        return response;
+    }
+
+    /// <summary>
+    /// Tracks a single active stream so cleanup only removes its own entry.
+    /// </summary>
+    private sealed record StreamEntry(CancellationTokenSource Cts);
+}