refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,128 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
using Timestamp = Google.Protobuf.WellKnownTypes.Timestamp;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// Canonical bridge for Audit Log (#23) rows between the in-process
|
||||
/// <see cref="AuditEvent"/> record and the wire-format <see cref="AuditEventDto"/>
|
||||
/// exchanged over the <c>IngestAuditEvents</c>, <c>IngestCachedTelemetry</c> and
|
||||
/// <c>PullAuditEvents</c> RPCs.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// This mapper lives in <c>ZB.MOM.WW.ScadaBridge.Communication</c> (which owns the generated
|
||||
/// <see cref="AuditEventDto"/> and references <c>Commons</c> for
|
||||
/// <see cref="AuditEvent"/>) so both <c>SiteStreamGrpcServer</c> and
|
||||
/// <c>ZB.MOM.WW.ScadaBridge.AuditLog</c> can share one implementation without the
|
||||
/// project-reference cycle that would result from hosting it in
|
||||
/// <c>ZB.MOM.WW.ScadaBridge.AuditLog</c> (AuditLog → Communication, never the reverse).
|
||||
/// </para>
|
||||
/// <para><b>Lossy by design:</b> the proto contract intentionally omits two fields.</para>
|
||||
/// <list type="bullet">
|
||||
/// <item><see cref="AuditEvent.ForwardState"/> — site-local SQLite state, never travels.</item>
|
||||
/// <item><see cref="AuditEvent.IngestedAtUtc"/> — central-set at ingest time, not at the site.</item>
|
||||
/// </list>
|
||||
/// <para>
|
||||
/// String nullability convention: proto3 scalar strings cannot be absent, so nullable
|
||||
/// .NET strings round-trip as empty strings on the wire. Nullable integers use the
|
||||
/// <c>Int32Value</c> wrapper so they preserve true null semantics.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public static class AuditEventDtoMapper
|
||||
{
|
||||
/// <summary>
|
||||
/// Projects an <see cref="AuditEvent"/> into its wire-format DTO. Null reference
|
||||
/// fields collapse to empty strings; null integer fields leave the wrapper unset.
|
||||
/// </summary>
|
||||
/// <param name="evt">The audit event to project to wire format.</param>
|
||||
public static AuditEventDto ToDto(AuditEvent evt)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(evt);
|
||||
|
||||
var dto = new AuditEventDto
|
||||
{
|
||||
EventId = evt.EventId.ToString(),
|
||||
OccurredAtUtc = Timestamp.FromDateTime(EnsureUtc(evt.OccurredAtUtc)),
|
||||
Channel = evt.Channel.ToString(),
|
||||
Kind = evt.Kind.ToString(),
|
||||
CorrelationId = evt.CorrelationId?.ToString() ?? string.Empty,
|
||||
ExecutionId = evt.ExecutionId?.ToString() ?? string.Empty,
|
||||
ParentExecutionId = evt.ParentExecutionId?.ToString() ?? string.Empty,
|
||||
SourceSiteId = evt.SourceSiteId ?? string.Empty,
|
||||
SourceNode = evt.SourceNode ?? string.Empty,
|
||||
SourceInstanceId = evt.SourceInstanceId ?? string.Empty,
|
||||
SourceScript = evt.SourceScript ?? string.Empty,
|
||||
Actor = evt.Actor ?? string.Empty,
|
||||
Target = evt.Target ?? string.Empty,
|
||||
Status = evt.Status.ToString(),
|
||||
ErrorMessage = evt.ErrorMessage ?? string.Empty,
|
||||
ErrorDetail = evt.ErrorDetail ?? string.Empty,
|
||||
RequestSummary = evt.RequestSummary ?? string.Empty,
|
||||
ResponseSummary = evt.ResponseSummary ?? string.Empty,
|
||||
PayloadTruncated = evt.PayloadTruncated,
|
||||
Extra = evt.Extra ?? string.Empty
|
||||
};
|
||||
|
||||
if (evt.HttpStatus.HasValue)
|
||||
{
|
||||
dto.HttpStatus = evt.HttpStatus.Value;
|
||||
}
|
||||
|
||||
if (evt.DurationMs.HasValue)
|
||||
{
|
||||
dto.DurationMs = evt.DurationMs.Value;
|
||||
}
|
||||
|
||||
return dto;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reconstructs an <see cref="AuditEvent"/> from its wire-format DTO. Empty strings
|
||||
/// rehydrate as null reference values; absent integer wrappers stay null.
|
||||
/// <see cref="AuditEvent.ForwardState"/> and <see cref="AuditEvent.IngestedAtUtc"/>
|
||||
/// are intentionally left null — the central ingest actor sets the latter.
|
||||
/// </summary>
|
||||
/// <param name="dto">The wire-format DTO to reconstruct into an <see cref="AuditEvent"/>.</param>
|
||||
public static AuditEvent FromDto(AuditEventDto dto)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(dto);
|
||||
|
||||
return new AuditEvent
|
||||
{
|
||||
EventId = Guid.Parse(dto.EventId),
|
||||
OccurredAtUtc = DateTime.SpecifyKind(dto.OccurredAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
IngestedAtUtc = null,
|
||||
Channel = Enum.Parse<AuditChannel>(dto.Channel),
|
||||
Kind = Enum.Parse<AuditKind>(dto.Kind),
|
||||
CorrelationId = NullIfEmpty(dto.CorrelationId) is { } cid ? Guid.Parse(cid) : null,
|
||||
ExecutionId = NullIfEmpty(dto.ExecutionId) is { } eid ? Guid.Parse(eid) : null,
|
||||
ParentExecutionId = NullIfEmpty(dto.ParentExecutionId) is { } pid ? Guid.Parse(pid) : null,
|
||||
SourceSiteId = NullIfEmpty(dto.SourceSiteId),
|
||||
SourceNode = NullIfEmpty(dto.SourceNode),
|
||||
SourceInstanceId = NullIfEmpty(dto.SourceInstanceId),
|
||||
SourceScript = NullIfEmpty(dto.SourceScript),
|
||||
Actor = NullIfEmpty(dto.Actor),
|
||||
Target = NullIfEmpty(dto.Target),
|
||||
Status = Enum.Parse<AuditStatus>(dto.Status),
|
||||
HttpStatus = dto.HttpStatus,
|
||||
DurationMs = dto.DurationMs,
|
||||
ErrorMessage = NullIfEmpty(dto.ErrorMessage),
|
||||
ErrorDetail = NullIfEmpty(dto.ErrorDetail),
|
||||
RequestSummary = NullIfEmpty(dto.RequestSummary),
|
||||
ResponseSummary = NullIfEmpty(dto.ResponseSummary),
|
||||
PayloadTruncated = dto.PayloadTruncated,
|
||||
Extra = NullIfEmpty(dto.Extra),
|
||||
ForwardState = null
|
||||
};
|
||||
}
|
||||
|
||||
private static string? NullIfEmpty(string? value) =>
|
||||
string.IsNullOrEmpty(value) ? null : value;
|
||||
|
||||
private static DateTime EnsureUtc(DateTime value) =>
|
||||
value.Kind == DateTimeKind.Utc
|
||||
? value
|
||||
: DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc);
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
using Akka.Actor;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// Abstraction over the site-side stream subscription mechanism.
|
||||
/// SiteStreamManager in the SiteRuntime project implements this interface;
|
||||
/// the gRPC server depends on it without referencing SiteRuntime directly.
|
||||
/// </summary>
|
||||
public interface ISiteStreamSubscriber
|
||||
{
|
||||
/// <summary>
|
||||
/// Subscribes an actor to receive filtered stream events for a specific instance.
|
||||
/// </summary>
|
||||
/// <param name="instanceName">The unique name of the instance whose events to subscribe to.</param>
|
||||
/// <param name="subscriber">The actor reference that will receive stream event messages.</param>
|
||||
/// <returns>A subscription ID that can be used for unsubscription.</returns>
|
||||
string Subscribe(string instanceName, IActorRef subscriber);
|
||||
|
||||
/// <summary>
|
||||
/// Removes all subscriptions for the given actor.
|
||||
/// </summary>
|
||||
/// <param name="subscriber">The actor reference whose subscriptions should be removed.</param>
|
||||
void RemoveSubscriber(IActorRef subscriber);
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// Canonical bridge for Site Call Audit (#22) operational rows between the
|
||||
/// wire-format <see cref="SiteCallOperationalDto"/> exchanged on the
|
||||
/// <c>CachedCallTelemetry</c> packet and the in-process <see cref="SiteCall"/>
|
||||
/// persistence entity central writes into the <c>SiteCalls</c> table.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// This mapper lives in <c>ZB.MOM.WW.ScadaBridge.Communication</c> (which owns the generated
|
||||
/// <see cref="SiteCallOperationalDto"/> and references <c>Commons</c> for
|
||||
/// <see cref="SiteCall"/>) so both <c>SiteStreamGrpcServer</c> and
|
||||
/// <c>ZB.MOM.WW.ScadaBridge.AuditLog</c> can share one implementation without the
|
||||
/// project-reference cycle that would result from hosting it in
|
||||
/// <c>ZB.MOM.WW.ScadaBridge.AuditLog</c> (AuditLog → Communication, never the reverse).
|
||||
/// Mirrors the sibling <see cref="AuditEventDtoMapper"/>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Only the DTO→entity direction is provided: nothing in the system maps a
|
||||
/// <see cref="SiteCall"/> back onto the wire (sites emit the operational state
|
||||
/// from <c>SiteCallOperational</c>, never from the central <see cref="SiteCall"/>
|
||||
/// entity), so an entity→DTO method would be dead code.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// String nullability convention: proto3 scalar strings cannot be absent, so the
|
||||
/// optional <see cref="SiteCall.LastError"/> rehydrates from an empty string back
|
||||
/// to null. The optional <c>HttpStatus</c> and <c>TerminalAtUtc</c> use proto
|
||||
/// wrappers so they preserve true null semantics.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public static class SiteCallDtoMapper
|
||||
{
|
||||
/// <summary>
|
||||
/// Reconstructs a <see cref="SiteCall"/> persistence entity from its
|
||||
/// wire-format DTO. An empty <c>LastError</c> rehydrates as null; absent
|
||||
/// <c>HttpStatus</c>/<c>TerminalAtUtc</c> wrappers stay null.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <see cref="SiteCall.IngestedAtUtc"/> is stamped here as a placeholder
|
||||
/// (<see cref="DateTime.UtcNow"/>); the central ingest actor overwrites it
|
||||
/// inside the dual-write transaction so the AuditLog and SiteCalls rows
|
||||
/// share one instant. The value sent on the wire is informational only.
|
||||
/// </remarks>
|
||||
/// <param name="dto">The wire-format site call DTO to map.</param>
|
||||
public static SiteCall FromDto(SiteCallOperationalDto dto)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(dto);
|
||||
|
||||
return new SiteCall
|
||||
{
|
||||
TrackedOperationId = TrackedOperationId.Parse(dto.TrackedOperationId),
|
||||
Channel = dto.Channel,
|
||||
Target = dto.Target,
|
||||
SourceSite = dto.SourceSite,
|
||||
SourceNode = string.IsNullOrEmpty(dto.SourceNode) ? null : dto.SourceNode,
|
||||
Status = dto.Status,
|
||||
RetryCount = dto.RetryCount,
|
||||
LastError = string.IsNullOrEmpty(dto.LastError) ? null : dto.LastError,
|
||||
HttpStatus = dto.HttpStatus,
|
||||
CreatedAtUtc = DateTime.SpecifyKind(dto.CreatedAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
UpdatedAtUtc = DateTime.SpecifyKind(dto.UpdatedAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
TerminalAtUtc = dto.TerminalAtUtc is null
|
||||
? null
|
||||
: DateTime.SpecifyKind(dto.TerminalAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
IngestedAtUtc = DateTime.UtcNow, // overwritten by AuditLogIngestActor
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,316 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Grpc.Core;
|
||||
using Grpc.Net.Client;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Streaming;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// Per-site gRPC client that manages streaming subscriptions to a site's
|
||||
/// SiteStreamGrpcServer. The central-side DebugStreamBridgeActor uses this
|
||||
/// to open server-streaming calls for individual instances.
|
||||
/// </summary>
|
||||
public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
|
||||
{
|
||||
private readonly GrpcChannel? _channel;
|
||||
private readonly SiteStreamService.SiteStreamServiceClient? _client;
|
||||
private readonly ILogger? _logger;
|
||||
private readonly ConcurrentDictionary<string, CancellationTokenSource> _subscriptions = new();
|
||||
|
||||
/// <summary>
|
||||
/// The gRPC endpoint (site node address) this client is bound to. The
|
||||
/// <see cref="SiteStreamGrpcClientFactory"/> compares this against the requested
|
||||
/// endpoint so a NodeA→NodeB failover flip (or a site address edit) is honoured
|
||||
/// rather than served stale from cache.
|
||||
/// </summary>
|
||||
public virtual string Endpoint { get; } = string.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// The HTTP/2 keepalive ping delay actually applied to this client's channel.
|
||||
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
|
||||
/// </summary>
|
||||
internal TimeSpan KeepAlivePingDelay { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The HTTP/2 keepalive ping timeout actually applied to this client's channel.
|
||||
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
|
||||
/// </summary>
|
||||
internal TimeSpan KeepAlivePingTimeout { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Creates a client with default communication options.
|
||||
/// </summary>
|
||||
/// <param name="endpoint">The gRPC endpoint address for the site.</param>
|
||||
/// <param name="logger">Logger for diagnostics and errors.</param>
|
||||
public SiteStreamGrpcClient(string endpoint, ILogger logger)
|
||||
: this(endpoint, logger, new CommunicationOptions())
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a client whose HTTP/2 keepalive is taken from <see cref="CommunicationOptions"/>
|
||||
/// rather than hard-coded, satisfying the design doc's "gRPC Connection Keepalive"
|
||||
/// section which states these values are configurable.
|
||||
/// </summary>
|
||||
/// <param name="endpoint">The gRPC endpoint address for the site.</param>
|
||||
/// <param name="logger">Logger for diagnostics and errors.</param>
|
||||
/// <param name="options">Communication options including keepalive settings.</param>
|
||||
public SiteStreamGrpcClient(string endpoint, ILogger logger, CommunicationOptions options)
|
||||
{
|
||||
Endpoint = endpoint;
|
||||
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay;
|
||||
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout;
|
||||
_channel = GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
|
||||
{
|
||||
HttpHandler = new SocketsHttpHandler
|
||||
{
|
||||
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay,
|
||||
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout,
|
||||
KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always
|
||||
}
|
||||
});
|
||||
_client = new SiteStreamService.SiteStreamServiceClient(_channel);
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Protected constructor for unit testing without a real gRPC channel.
|
||||
/// Allows subclassing for mock implementations.
|
||||
/// </summary>
|
||||
protected SiteStreamGrpcClient()
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Protected constructor for unit testing — records the endpoint without
|
||||
/// opening a real gRPC channel, so endpoint-aware factory behaviour can be
|
||||
/// exercised by test doubles.
|
||||
/// </summary>
|
||||
/// <param name="endpoint">The gRPC endpoint address for the site.</param>
|
||||
protected SiteStreamGrpcClient(string endpoint)
|
||||
{
|
||||
Endpoint = endpoint;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a test-only instance that has no gRPC channel. Used to test
|
||||
/// Unsubscribe and Dispose behavior without needing a real endpoint.
|
||||
/// </summary>
|
||||
internal static SiteStreamGrpcClient CreateForTesting() => new();
|
||||
|
||||
/// <summary>
|
||||
/// Registers a CancellationTokenSource for a correlation ID. Test-only.
|
||||
/// </summary>
|
||||
/// <param name="correlationId">Unique identifier for the subscription.</param>
|
||||
/// <param name="cts">CancellationTokenSource for managing the subscription lifecycle.</param>
|
||||
internal void AddSubscriptionForTesting(string correlationId, CancellationTokenSource cts)
|
||||
{
|
||||
_subscriptions[correlationId] = cts;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers a subscription's CancellationTokenSource for a correlation ID.
|
||||
/// If an entry already exists for that correlation ID (a reconnect race where two
|
||||
/// <see cref="SubscribeAsync"/> calls briefly share an ID), the prior CTS is
|
||||
/// cancelled and disposed so it cannot leak. Internal for testability.
|
||||
/// </summary>
|
||||
/// <param name="correlationId">Unique identifier for the subscription.</param>
|
||||
/// <param name="cts">CancellationTokenSource for managing the subscription lifecycle.</param>
|
||||
internal void RegisterSubscription(string correlationId, CancellationTokenSource cts)
|
||||
{
|
||||
if (_subscriptions.TryGetValue(correlationId, out var prior) && !ReferenceEquals(prior, cts))
|
||||
{
|
||||
prior.Cancel();
|
||||
prior.Dispose();
|
||||
}
|
||||
_subscriptions[correlationId] = cts;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes the subscription entry for a correlation ID only if the stored CTS is
|
||||
/// exactly the one supplied. A racing replacement stream may already own the slot,
|
||||
/// in which case this is a no-op. Internal for testability.
|
||||
/// </summary>
|
||||
/// <param name="correlationId">Unique identifier for the subscription.</param>
|
||||
/// <param name="cts">CancellationTokenSource to match before removing.</param>
|
||||
internal void RemoveSubscription(string correlationId, CancellationTokenSource cts)
|
||||
{
|
||||
_subscriptions.TryRemove(new KeyValuePair<string, CancellationTokenSource>(correlationId, cts));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Opens a server-streaming subscription for a specific instance.
|
||||
/// This is a long-running async method; the caller launches it as a background task.
|
||||
/// The <paramref name="onEvent"/> callback delivers domain events, and
|
||||
/// <paramref name="onError"/> lets the caller handle reconnection.
|
||||
/// </summary>
|
||||
/// <param name="correlationId">Unique identifier for this subscription.</param>
|
||||
/// <param name="instanceUniqueName">Unique name of the instance to subscribe to.</param>
|
||||
/// <param name="onEvent">Callback invoked for each domain event received from the stream.</param>
|
||||
/// <param name="onError">Callback invoked when the subscription encounters an error.</param>
|
||||
/// <param name="ct">Cancellation token to stop the subscription.</param>
|
||||
public virtual async Task SubscribeAsync(
|
||||
string correlationId,
|
||||
string instanceUniqueName,
|
||||
Action<object> onEvent,
|
||||
Action<Exception> onError,
|
||||
CancellationToken ct)
|
||||
{
|
||||
if (_client is null)
|
||||
throw new InvalidOperationException("Cannot subscribe on a test-only client.");
|
||||
|
||||
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
RegisterSubscription(correlationId, cts);
|
||||
|
||||
var request = new InstanceStreamRequest
|
||||
{
|
||||
CorrelationId = correlationId,
|
||||
InstanceUniqueName = instanceUniqueName
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
using var call = _client.SubscribeInstance(request, cancellationToken: cts.Token);
|
||||
|
||||
await foreach (var evt in call.ResponseStream.ReadAllAsync(cts.Token))
|
||||
{
|
||||
var domainEvent = ConvertToDomainEvent(evt);
|
||||
if (domainEvent != null)
|
||||
onEvent(domainEvent);
|
||||
}
|
||||
}
|
||||
catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled)
|
||||
{
|
||||
// Normal cancellation — not an error
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
onError(ex);
|
||||
}
|
||||
finally
|
||||
{
|
||||
// Remove only our own entry -- a racing reconnect may already own the slot.
|
||||
RemoveSubscription(correlationId, cts);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Cancels an active subscription by correlation ID.
|
||||
/// </summary>
|
||||
/// <param name="correlationId">Unique identifier of the subscription to cancel.</param>
|
||||
public virtual void Unsubscribe(string correlationId)
|
||||
{
|
||||
if (_subscriptions.TryRemove(correlationId, out var cts))
|
||||
{
|
||||
cts.Cancel();
|
||||
cts.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a proto SiteStreamEvent to the corresponding domain message.
|
||||
/// Internal for testability.
|
||||
/// </summary>
|
||||
/// <param name="evt">The protobuf site stream event to convert.</param>
|
||||
/// <returns>The converted domain event, or null if the event type is not recognized.</returns>
|
||||
internal static object? ConvertToDomainEvent(SiteStreamEvent evt) => evt.EventCase switch
|
||||
{
|
||||
SiteStreamEvent.EventOneofCase.AttributeChanged => new AttributeValueChanged(
|
||||
evt.AttributeChanged.InstanceUniqueName,
|
||||
evt.AttributeChanged.AttributePath,
|
||||
evt.AttributeChanged.AttributeName,
|
||||
evt.AttributeChanged.Value,
|
||||
MapQuality(evt.AttributeChanged.Quality),
|
||||
evt.AttributeChanged.Timestamp.ToDateTimeOffset()),
|
||||
SiteStreamEvent.EventOneofCase.AlarmChanged => new AlarmStateChanged(
|
||||
evt.AlarmChanged.InstanceUniqueName,
|
||||
evt.AlarmChanged.AlarmName,
|
||||
MapAlarmState(evt.AlarmChanged.State),
|
||||
evt.AlarmChanged.Priority,
|
||||
evt.AlarmChanged.Timestamp.ToDateTimeOffset())
|
||||
{
|
||||
Level = MapAlarmLevel(evt.AlarmChanged.Level),
|
||||
Message = evt.AlarmChanged.Message ?? string.Empty
|
||||
},
|
||||
_ => null
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Maps proto Quality enum to domain string. Internal for testability.
|
||||
/// </summary>
|
||||
/// <param name="quality">The protobuf quality value to map.</param>
|
||||
/// <returns>The mapped quality as a string ("Good", "Uncertain", "Bad", or "Unknown").</returns>
|
||||
internal static string MapQuality(Quality quality) => quality switch
|
||||
{
|
||||
Quality.Good => "Good",
|
||||
Quality.Uncertain => "Uncertain",
|
||||
Quality.Bad => "Bad",
|
||||
_ => "Unknown"
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Maps proto AlarmStateEnum to domain AlarmState. Internal for testability.
|
||||
/// </summary>
|
||||
/// <param name="state">The protobuf alarm state to map.</param>
|
||||
/// <returns>The mapped domain alarm state.</returns>
|
||||
internal static AlarmState MapAlarmState(AlarmStateEnum state) => state switch
|
||||
{
|
||||
AlarmStateEnum.AlarmStateNormal => AlarmState.Normal,
|
||||
AlarmStateEnum.AlarmStateActive => AlarmState.Active,
|
||||
_ => AlarmState.Normal
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Maps proto AlarmLevelEnum to domain AlarmLevel. Internal for testability.
|
||||
/// </summary>
|
||||
/// <param name="level">The protobuf alarm level to map.</param>
|
||||
/// <returns>The mapped domain alarm level.</returns>
|
||||
internal static AlarmLevel MapAlarmLevel(AlarmLevelEnum level) => level switch
|
||||
{
|
||||
AlarmLevelEnum.AlarmLevelLow => AlarmLevel.Low,
|
||||
AlarmLevelEnum.AlarmLevelLowLow => AlarmLevel.LowLow,
|
||||
AlarmLevelEnum.AlarmLevelHigh => AlarmLevel.High,
|
||||
AlarmLevelEnum.AlarmLevelHighHigh => AlarmLevel.HighHigh,
|
||||
_ => AlarmLevel.None
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Releases all subscription CancellationTokenSources and the underlying
|
||||
/// gRPC channel. All teardown here is synchronous (CTS disposal and
|
||||
/// <see cref="GrpcChannel.Dispose"/>), so a synchronous <see cref="Dispose"/>
|
||||
/// can release everything without sync-over-async blocking.
|
||||
/// </summary>
|
||||
private void ReleaseResources()
|
||||
{
|
||||
foreach (var cts in _subscriptions.Values)
|
||||
{
|
||||
cts.Cancel();
|
||||
cts.Dispose();
|
||||
}
|
||||
_subscriptions.Clear();
|
||||
|
||||
_channel?.Dispose();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Asynchronously disposes of the gRPC client and all subscriptions.
|
||||
/// </summary>
|
||||
public virtual ValueTask DisposeAsync()
|
||||
{
|
||||
ReleaseResources();
|
||||
return ValueTask.CompletedTask;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Synchronous disposal. All resources held by this client are released
|
||||
/// synchronously, so callers (e.g. <see cref="SiteStreamGrpcClientFactory.Dispose"/>)
|
||||
/// need not block on the async disposal path.
|
||||
/// </summary>
|
||||
public virtual void Dispose()
|
||||
{
|
||||
ReleaseResources();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,134 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// Caches one <see cref="SiteStreamGrpcClient"/> per site identifier.
|
||||
/// The DebugStreamBridgeActor uses this factory to obtain (or create) a
|
||||
/// gRPC client for a given site before opening a streaming subscription.
|
||||
/// </summary>
|
||||
public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, SiteStreamGrpcClient> _clients = new();
|
||||
private readonly ILoggerFactory _loggerFactory;
|
||||
private readonly CommunicationOptions _options;
|
||||
|
||||
/// <summary>
|
||||
/// Test/default constructor — uses default <see cref="CommunicationOptions"/>.
|
||||
/// </summary>
|
||||
/// <param name="loggerFactory">Logger factory passed to created clients.</param>
|
||||
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory)
|
||||
: this(loggerFactory, Options.Create(new CommunicationOptions()))
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// DI constructor — flows <see cref="CommunicationOptions"/> into every created
|
||||
/// <see cref="SiteStreamGrpcClient"/> so the configured gRPC keepalive settings
|
||||
/// are applied rather than hard-coded defaults.
|
||||
/// </summary>
|
||||
/// <param name="loggerFactory">Logger factory passed to created clients.</param>
|
||||
/// <param name="options">Communication options applied to each created client.</param>
|
||||
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory, IOptions<CommunicationOptions> options)
|
||||
{
|
||||
_loggerFactory = loggerFactory;
|
||||
_options = options.Value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the cached client for the site, or creates a new one. If a client is
|
||||
/// already cached but bound to a *different* <paramref name="grpcEndpoint"/> — the
|
||||
/// NodeA→NodeB failover flip, or a site whose gRPC address was edited — the stale
|
||||
/// client is disposed and replaced with one bound to the requested endpoint.
|
||||
/// Communication-012/013: keying purely by site identifier and ignoring the
|
||||
/// endpoint on a cache hit defeated debug-stream node failover and meant a
|
||||
/// corrected gRPC address never took effect without a central restart.
|
||||
/// </summary>
|
||||
/// <param name="siteIdentifier">Unique site identifier used as the cache key.</param>
|
||||
/// <param name="grpcEndpoint">gRPC endpoint the returned client must be bound to.</param>
|
||||
public virtual SiteStreamGrpcClient GetOrCreate(string siteIdentifier, string grpcEndpoint)
|
||||
{
|
||||
// Fast path: a client is cached and already bound to the requested endpoint.
|
||||
if (_clients.TryGetValue(siteIdentifier, out var existing) &&
|
||||
string.Equals(existing.Endpoint, grpcEndpoint, StringComparison.Ordinal))
|
||||
{
|
||||
return existing;
|
||||
}
|
||||
|
||||
// Either no client is cached, or the cached one is bound to a different
|
||||
// endpoint. AddOrUpdate atomically installs a client for the requested
|
||||
// endpoint; the prior (stale) client, if any, is disposed afterwards.
|
||||
SiteStreamGrpcClient? stale = null;
|
||||
var client = _clients.AddOrUpdate(
|
||||
siteIdentifier,
|
||||
_ => CreateClient(grpcEndpoint),
|
||||
(_, current) =>
|
||||
{
|
||||
if (string.Equals(current.Endpoint, grpcEndpoint, StringComparison.Ordinal))
|
||||
return current;
|
||||
stale = current;
|
||||
return CreateClient(grpcEndpoint);
|
||||
});
|
||||
|
||||
stale?.Dispose();
|
||||
return client;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a single <see cref="SiteStreamGrpcClient"/>. Overridable so tests
|
||||
/// can substitute a tracking client while still exercising the factory's real
|
||||
/// caching and disposal machinery.
|
||||
/// </summary>
|
||||
/// <param name="grpcEndpoint">gRPC endpoint the new client will connect to.</param>
|
||||
protected virtual SiteStreamGrpcClient CreateClient(string grpcEndpoint)
|
||||
{
|
||||
var logger = _loggerFactory.CreateLogger<SiteStreamGrpcClient>();
|
||||
return new SiteStreamGrpcClient(grpcEndpoint, logger, _options);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes and disposes the client for the given site. Site *address changes* are
|
||||
/// now handled transparently by <see cref="GetOrCreate"/> (it disposes and recreates
|
||||
/// a client whose endpoint no longer matches). This method remains the disposal
|
||||
/// path for full site *removal* — call it when a site record is deleted so its
|
||||
/// cached gRPC client does not linger for the life of the process.
|
||||
/// </summary>
|
||||
/// <param name="siteIdentifier">Unique site identifier whose client should be removed.</param>
|
||||
public async Task RemoveSiteAsync(string siteIdentifier)
|
||||
{
|
||||
if (_clients.TryRemove(siteIdentifier, out var client))
|
||||
{
|
||||
await client.DisposeAsync();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Asynchronously disposes all cached clients and clears the cache.
|
||||
/// </summary>
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
foreach (var client in _clients.Values)
|
||||
{
|
||||
await client.DisposeAsync();
|
||||
}
|
||||
_clients.Clear();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Synchronous disposal. Communication-007: this used to block on
|
||||
/// <c>DisposeAsync().AsTask().GetAwaiter().GetResult()</c> (sync-over-async,
|
||||
/// a stall/deadlock risk during host shutdown). Each
|
||||
/// <see cref="SiteStreamGrpcClient"/> releases all of its resources
|
||||
/// synchronously, so we dispose them directly with no async path.
|
||||
/// </summary>
|
||||
public void Dispose()
|
||||
{
|
||||
foreach (var client in _clients.Values)
|
||||
{
|
||||
client.Dispose();
|
||||
}
|
||||
_clients.Clear();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,543 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Threading.Channels;
|
||||
using Akka.Actor;
|
||||
using Grpc.Core;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using GrpcStatus = Grpc.Core.Status;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// gRPC service that accepts instance stream subscriptions from central nodes.
|
||||
/// Creates a StreamRelayActor per subscription to bridge Akka domain events
|
||||
/// through a Channel<T> to the gRPC response stream.
|
||||
/// </summary>
|
||||
public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
{
|
||||
private readonly ISiteStreamSubscriber _streamSubscriber;
|
||||
private ActorSystem? _actorSystem;
|
||||
private readonly ILogger<SiteStreamGrpcServer> _logger;
|
||||
private readonly ConcurrentDictionary<string, StreamEntry> _activeStreams = new();
|
||||
private readonly int _maxConcurrentStreams;
|
||||
private readonly TimeSpan _maxStreamLifetime;
|
||||
private volatile bool _ready;
|
||||
// Host-017 / REQ-HOST-7: flipped by CancelAllStreams() when the host enters
|
||||
// CoordinatedShutdown so SubscribeInstance refuses new streams with
|
||||
// Unavailable before the actor system tears down. Strictly monotonic — once
|
||||
// true, never reset (the server is single-lifetime per host).
|
||||
private volatile bool _shuttingDown;
|
||||
private long _actorCounter;
|
||||
// Audit Log (#23 M2): central-side ingest actor proxy. Set by the host
|
||||
// after the cluster singleton starts (see Bundle E wiring). When null the
|
||||
// IngestAuditEvents RPC replies with an empty IngestAck so sites can
|
||||
// safely retry — wiring-incomplete is treated as transient, never fatal.
|
||||
private IActorRef? _auditIngestActor;
|
||||
// Per Bundle D's brief — Ask timeout is 30 s. The ingest actor's repo
|
||||
// calls are sub-100 ms in steady state; a generous timeout absorbs a slow
|
||||
// MSSQL connection without surfacing as a gRPC failure on a healthy site.
|
||||
private static readonly TimeSpan AuditIngestAskTimeout = TimeSpan.FromSeconds(30);
|
||||
// Audit Log (#23 M6): site-local queue handed in by AkkaHostedService on
|
||||
// site roles so the central reconciliation puller's PullAuditEvents RPC
|
||||
// can read Pending/Forwarded rows. Null when not wired (e.g. central-only
|
||||
// host or test composing the server in isolation) — the handler treats
|
||||
// the missing queue as "nothing to ship" and returns an empty response so
|
||||
// central retries on its next reconciliation cycle.
|
||||
private ISiteAuditQueue? _siteAuditQueue;
|
||||
|
||||
/// <summary>
|
||||
/// Test-only constructor — kept <c>internal</c> so the DI container sees a
|
||||
/// single public constructor and is not faced with an ambiguous choice.
|
||||
/// </summary>
|
||||
/// <param name="streamSubscriber">The stream subscriber for managing subscriptions.</param>
|
||||
/// <param name="logger">The logger instance.</param>
|
||||
/// <param name="maxConcurrentStreams">The maximum concurrent streams (default 100).</param>
|
||||
internal SiteStreamGrpcServer(
|
||||
ISiteStreamSubscriber streamSubscriber,
|
||||
ILogger<SiteStreamGrpcServer> logger,
|
||||
int maxConcurrentStreams = 100)
|
||||
: this(streamSubscriber, logger, maxConcurrentStreams, TimeSpan.FromHours(4))
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// DI constructor — binds <see cref="CommunicationOptions.GrpcMaxConcurrentStreams"/>
|
||||
/// and <see cref="CommunicationOptions.GrpcMaxStreamLifetime"/> so the documented
|
||||
/// concurrency limit and the 4-hour zombie-stream session timeout are honoured
|
||||
/// rather than hard-coded.
|
||||
/// </summary>
|
||||
/// <param name="streamSubscriber">The stream subscriber for managing subscriptions.</param>
|
||||
/// <param name="logger">The logger instance.</param>
|
||||
/// <param name="options">Communication options containing stream limits and timeouts.</param>
|
||||
public SiteStreamGrpcServer(
|
||||
ISiteStreamSubscriber streamSubscriber,
|
||||
ILogger<SiteStreamGrpcServer> logger,
|
||||
IOptions<CommunicationOptions> options)
|
||||
: this(streamSubscriber, logger,
|
||||
options.Value.GrpcMaxConcurrentStreams,
|
||||
options.Value.GrpcMaxStreamLifetime)
|
||||
{
|
||||
}
|
||||
|
||||
private SiteStreamGrpcServer(
|
||||
ISiteStreamSubscriber streamSubscriber,
|
||||
ILogger<SiteStreamGrpcServer> logger,
|
||||
int maxConcurrentStreams,
|
||||
TimeSpan maxStreamLifetime)
|
||||
{
|
||||
_streamSubscriber = streamSubscriber;
|
||||
_logger = logger;
|
||||
_maxConcurrentStreams = maxConcurrentStreams;
|
||||
_maxStreamLifetime = maxStreamLifetime;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Marks the server as ready to accept subscriptions and injects the ActorSystem.
|
||||
/// Called after the site runtime actor system is fully initialized.
|
||||
/// The ActorSystem is set here rather than via the constructor so that
|
||||
/// the gRPC server can be created by DI before the actor system exists.
|
||||
/// </summary>
|
||||
/// <param name="actorSystem">The initialized Akka actor system.</param>
|
||||
public void SetReady(ActorSystem actorSystem)
|
||||
{
|
||||
_actorSystem = actorSystem;
|
||||
_ready = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Hands the central-side <c>AuditLogIngestActor</c> proxy to the gRPC
|
||||
/// server so the <see cref="IngestAuditEvents"/> RPC can route incoming
|
||||
/// site batches. Audit Log (#23) M2 wiring point — mirrors the way
|
||||
/// <c>CommunicationService.SetNotificationOutbox</c> takes the Notification
|
||||
/// Outbox singleton proxy. Bundle E supplies the actor after the cluster
|
||||
/// singleton starts.
|
||||
/// </summary>
|
||||
/// <param name="proxy">The audit log ingest actor proxy.</param>
|
||||
public void SetAuditIngestActor(IActorRef proxy)
|
||||
{
|
||||
_auditIngestActor = proxy;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Hands the site-local <see cref="ISiteAuditQueue"/> (the same
|
||||
/// <c>SqliteAuditWriter</c> singleton that backs <see cref="IAuditWriter"/>
|
||||
/// on the script thread) to the gRPC server so the M6
|
||||
/// <see cref="PullAuditEvents"/> RPC can serve central's reconciliation
|
||||
/// pulls. Mirrors <see cref="SetAuditIngestActor"/>: wired post-construction
|
||||
/// because the queue and the gRPC server are both DI singletons brought up
|
||||
/// in independent orders on site startup.
|
||||
/// </summary>
|
||||
/// <param name="queue">The site audit queue for serving reconciliation pulls.</param>
|
||||
public void SetSiteAuditQueue(ISiteAuditQueue queue)
|
||||
{
|
||||
_siteAuditQueue = queue;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Host-017 / REQ-HOST-7: signals the gRPC server to begin its part of the
|
||||
/// site shutdown sequence — refuse new <see cref="SubscribeInstance"/>
|
||||
/// streams with <see cref="StatusCode.Unavailable"/> and cancel every
|
||||
/// active stream so its <c>await foreach</c> observes
|
||||
/// <see cref="OperationCanceledException"/> and the response stream
|
||||
/// completes with <c>Cancelled</c> on the client. Idempotent — safe to call
|
||||
/// more than once. Invoked from the site host's
|
||||
/// <c>IHostApplicationLifetime.ApplicationStopping</c> callback BEFORE
|
||||
/// Akka's <c>CoordinatedShutdown</c> runs, so in-flight clients get a
|
||||
/// clean cancellation they can reconnect on rather than a silent stream
|
||||
/// that only times out via gRPC keepalive.
|
||||
/// </summary>
|
||||
public void CancelAllStreams()
|
||||
{
|
||||
_shuttingDown = true;
|
||||
foreach (var entry in _activeStreams.Values)
|
||||
{
|
||||
try
|
||||
{
|
||||
entry.Cts.Cancel();
|
||||
}
|
||||
catch (ObjectDisposedException)
|
||||
{
|
||||
// Already cleaned up by its own finally — nothing to do.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Host-017: exposed for test assertions on the shutdown state.
|
||||
/// </summary>
|
||||
internal bool IsShuttingDown => _shuttingDown;
|
||||
|
||||
/// <summary>
|
||||
/// Number of currently active streaming subscriptions. Exposed for diagnostics.
|
||||
/// </summary>
|
||||
public int ActiveStreamCount => _activeStreams.Count;
|
||||
|
||||
/// <summary>Effective max concurrent stream limit. Exposed for tests.</summary>
|
||||
internal int MaxConcurrentStreams => _maxConcurrentStreams;
|
||||
|
||||
/// <summary>Effective per-stream session lifetime. Exposed for tests.</summary>
|
||||
internal TimeSpan MaxStreamLifetime => _maxStreamLifetime;
|
||||
|
||||
/// <inheritdoc />
|
||||
public override async Task SubscribeInstance(
|
||||
InstanceStreamRequest request,
|
||||
IServerStreamWriter<SiteStreamEvent> responseStream,
|
||||
ServerCallContext context)
|
||||
{
|
||||
if (!_ready)
|
||||
throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server not ready"));
|
||||
|
||||
// Host-017 / REQ-HOST-7: refuse new subscriptions during shutdown so
|
||||
// CoordinatedShutdown can quiesce without racing fresh streams.
|
||||
if (_shuttingDown)
|
||||
throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server shutting down"));
|
||||
|
||||
// Communication-014: correlation_id arrives off the wire on a public gRPC
|
||||
// endpoint and is used (below) to compose an Akka actor name. Akka actor names
|
||||
// have a restricted character set — a id containing '/', whitespace, or other
|
||||
// disallowed characters would make ActorOf throw InvalidActorNameException,
|
||||
// escaping as an unhandled RPC fault. Reject unsafe ids cleanly up front.
|
||||
if (string.IsNullOrEmpty(request.CorrelationId) ||
|
||||
!ActorPath.IsValidPathElement(request.CorrelationId))
|
||||
{
|
||||
throw new RpcException(new GrpcStatus(
|
||||
StatusCode.InvalidArgument, "correlation_id is missing or not a valid identifier"));
|
||||
}
|
||||
|
||||
// Duplicate prevention -- cancel existing stream for this correlationId
|
||||
if (_activeStreams.TryRemove(request.CorrelationId, out var existingEntry))
|
||||
{
|
||||
existingEntry.Cts.Cancel();
|
||||
existingEntry.Cts.Dispose();
|
||||
}
|
||||
|
||||
// Check max concurrent streams after duplicate removal
|
||||
if (_activeStreams.Count >= _maxConcurrentStreams)
|
||||
throw new RpcException(new GrpcStatus(StatusCode.ResourceExhausted, "Max concurrent streams reached"));
|
||||
|
||||
using var streamCts = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken);
|
||||
// Session timeout (design doc "gRPC Connection Keepalive": 4-hour third layer
|
||||
// of dead-client detection) — forces a long-lived zombie stream to terminate
|
||||
// even if keepalive PINGs never detect the loss.
|
||||
if (_maxStreamLifetime > TimeSpan.Zero && _maxStreamLifetime != Timeout.InfiniteTimeSpan)
|
||||
streamCts.CancelAfter(_maxStreamLifetime);
|
||||
var entry = new StreamEntry(streamCts);
|
||||
_activeStreams[request.CorrelationId] = entry;
|
||||
|
||||
var channel = Channel.CreateBounded<SiteStreamEvent>(
|
||||
new BoundedChannelOptions(1000) { FullMode = BoundedChannelFullMode.DropOldest });
|
||||
|
||||
var actorSeq = Interlocked.Increment(ref _actorCounter);
|
||||
var relayActor = _actorSystem!.ActorOf(
|
||||
Props.Create(typeof(Actors.StreamRelayActor), request.CorrelationId, channel.Writer),
|
||||
$"stream-relay-{request.CorrelationId}-{actorSeq}");
|
||||
|
||||
// Communication-021: the previous code called _streamSubscriber.Subscribe
|
||||
// OUTSIDE the try block that owns relay-actor cleanup. If Subscribe threw
|
||||
// (stale instance name, index lookup fault, site runtime shutting down),
|
||||
// the freshly-created relay actor, the _activeStreams entry, the
|
||||
// StreamEntry.Cts, and the Channel<SiteStreamEvent> all leaked because the
|
||||
// finally never ran. Wrap Subscribe in its own try so any throw deterministically
|
||||
// stops the relay actor, removes the activeStreams entry, and completes the
|
||||
// channel before the RpcException escapes to the caller.
|
||||
string subscriptionId;
|
||||
try
|
||||
{
|
||||
subscriptionId = _streamSubscriber.Subscribe(request.InstanceUniqueName, relayActor);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"Subscribe failed for {Instance} (correlation {CorrelationId}); cleaning up relay actor.",
|
||||
request.InstanceUniqueName, request.CorrelationId);
|
||||
_actorSystem!.Stop(relayActor);
|
||||
channel.Writer.TryComplete();
|
||||
_activeStreams.TryRemove(
|
||||
new KeyValuePair<string, StreamEntry>(request.CorrelationId, entry));
|
||||
throw;
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Stream {CorrelationId} started for {Instance} (subscription {SubscriptionId})",
|
||||
request.CorrelationId, request.InstanceUniqueName, subscriptionId);
|
||||
|
||||
try
|
||||
{
|
||||
await foreach (var evt in channel.Reader.ReadAllAsync(streamCts.Token))
|
||||
{
|
||||
await responseStream.WriteAsync(evt, streamCts.Token);
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Normal cancellation (client disconnect or duplicate replacement)
|
||||
}
|
||||
finally
|
||||
{
|
||||
_streamSubscriber.RemoveSubscriber(relayActor);
|
||||
_actorSystem!.Stop(relayActor);
|
||||
channel.Writer.TryComplete();
|
||||
|
||||
// Only remove our own entry -- a replacement stream may have already taken the slot
|
||||
_activeStreams.TryRemove(
|
||||
new KeyValuePair<string, StreamEntry>(request.CorrelationId, entry));
|
||||
|
||||
_logger.LogInformation(
|
||||
"Stream {CorrelationId} for {Instance} ended",
|
||||
request.CorrelationId, request.InstanceUniqueName);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M2 site→central push RPC. Decodes a site batch into
|
||||
/// <see cref="AuditEvent"/> rows, Asks the central <c>AuditLogIngestActor</c>
|
||||
/// proxy to persist them, and echoes the accepted EventIds back so the site
|
||||
/// can flip its local rows to <c>Forwarded</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The DTO→entity conversion uses the shared <see cref="AuditEventDtoMapper"/>
|
||||
/// (hosted in <c>ZB.MOM.WW.ScadaBridge.Communication</c> so both this server and
|
||||
/// <c>ZB.MOM.WW.ScadaBridge.AuditLog</c> share one implementation without a
|
||||
/// project-reference cycle).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// When <see cref="_auditIngestActor"/> is not yet wired (host startup
|
||||
/// race window), the RPC returns an empty <see cref="IngestAck"/> rather
|
||||
/// than failing — the site treats the missing ack as a transient outcome
|
||||
/// and retries on the next drain, which is the desired idempotent
|
||||
/// behaviour.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <inheritdoc />
|
||||
/// <param name="request">The audit event batch to ingest.</param>
|
||||
/// <param name="context">The server call context.</param>
|
||||
public override async Task<IngestAck> IngestAuditEvents(
|
||||
AuditEventBatch request,
|
||||
ServerCallContext context)
|
||||
{
|
||||
// Empty batch is a no-op; reply immediately so the client moves on.
|
||||
if (request.Events.Count == 0)
|
||||
{
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var actor = _auditIngestActor;
|
||||
if (actor is null)
|
||||
{
|
||||
// Wiring incomplete (host startup race). Sites treat an empty
|
||||
// ack as "nothing was acked, leave rows Pending, retry next
|
||||
// drain" — exactly the right behaviour during host bring-up.
|
||||
_logger.LogWarning(
|
||||
"IngestAuditEvents received {Count} events before SetAuditIngestActor was called; returning empty ack.",
|
||||
request.Events.Count);
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var entities = new List<AuditEvent>(request.Events.Count);
|
||||
foreach (var dto in request.Events)
|
||||
{
|
||||
entities.Add(AuditEventDtoMapper.FromDto(dto));
|
||||
}
|
||||
|
||||
var cmd = new IngestAuditEventsCommand(entities);
|
||||
IngestAuditEventsReply reply;
|
||||
try
|
||||
{
|
||||
reply = await actor.Ask<IngestAuditEventsReply>(
|
||||
cmd, AuditIngestAskTimeout, context.CancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Audit ingest is best-effort; failing this RPC at the gRPC layer
|
||||
// would surface as a transport error and force the site to retry
|
||||
// (which it would do anyway). Logging + an empty ack keeps the
|
||||
// semantics consistent with the "wiring incomplete" path above.
|
||||
_logger.LogError(ex,
|
||||
"AuditLogIngestActor Ask failed for batch of {Count} events; returning empty ack.",
|
||||
request.Events.Count);
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var ack = new IngestAck();
|
||||
foreach (var id in reply.AcceptedEventIds)
|
||||
{
|
||||
ack.AcceptedEventIds.Add(id.ToString());
|
||||
}
|
||||
return ack;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M3 site→central combined-telemetry push RPC. Decodes a
|
||||
/// batch of <see cref="CachedTelemetryPacket"/> entries into matched
|
||||
/// (AuditEvent, SiteCall) pairs, Asks the central <c>AuditLogIngestActor</c>
|
||||
/// proxy to persist them in dual-write transactions, and echoes the
|
||||
/// AuditEvent EventIds that committed back so the site can flip its local
|
||||
/// rows to <c>Forwarded</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Same wiring-incomplete fallback as <see cref="IngestAuditEvents"/>: when
|
||||
/// the actor proxy has not been set the RPC replies with an empty ack so
|
||||
/// sites treat the outcome as transient and retry, never a hard fault.
|
||||
/// </remarks>
|
||||
/// <inheritdoc />
|
||||
/// <param name="request">The cached telemetry batch to ingest.</param>
|
||||
/// <param name="context">The server call context.</param>
|
||||
public override async Task<IngestAck> IngestCachedTelemetry(
|
||||
CachedTelemetryBatch request,
|
||||
ServerCallContext context)
|
||||
{
|
||||
if (request.Packets.Count == 0)
|
||||
{
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var actor = _auditIngestActor;
|
||||
if (actor is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"IngestCachedTelemetry received {Count} packets before SetAuditIngestActor was called; returning empty ack.",
|
||||
request.Packets.Count);
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var entries = new List<CachedTelemetryEntry>(request.Packets.Count);
|
||||
foreach (var packet in request.Packets)
|
||||
{
|
||||
var auditEvent = AuditEventDtoMapper.FromDto(packet.AuditEvent);
|
||||
var siteCall = SiteCallDtoMapper.FromDto(packet.Operational);
|
||||
entries.Add(new CachedTelemetryEntry(auditEvent, siteCall));
|
||||
}
|
||||
|
||||
var cmd = new IngestCachedTelemetryCommand(entries);
|
||||
IngestCachedTelemetryReply reply;
|
||||
try
|
||||
{
|
||||
reply = await actor.Ask<IngestCachedTelemetryReply>(
|
||||
cmd, AuditIngestAskTimeout, context.CancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"AuditLogIngestActor Ask failed for combined telemetry batch of {Count} packets; returning empty ack.",
|
||||
request.Packets.Count);
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var ack = new IngestAck();
|
||||
foreach (var id in reply.AcceptedEventIds)
|
||||
{
|
||||
ack.AcceptedEventIds.Add(id.ToString());
|
||||
}
|
||||
return ack;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 reconciliation pull RPC. Central asks the site for any
|
||||
/// AuditLog rows whose <c>OccurredAtUtc >= since_utc</c> and whose
|
||||
/// <c>ForwardState</c> is still <c>Pending</c> or <c>Forwarded</c> (i.e. not
|
||||
/// yet confirmed reconciled), bounded by <c>batch_size</c>. The site responds
|
||||
/// with the rows AND flips them to
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AuditForwardState.Reconciled"/>
|
||||
/// AFTER serializing the response. The flip is best-effort — if it fails
|
||||
/// (e.g. SQLite disposed mid-call), rows stay Pending/Forwarded and central
|
||||
/// pulls them again on the next reconciliation cycle. Idempotent.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// When <see cref="_siteAuditQueue"/> is not wired (central-only host or a
|
||||
/// composition-root test exercising the server in isolation) the RPC returns
|
||||
/// an empty response — central treats that as "nothing to ship" and retries
|
||||
/// on its next cycle, which is the same self-healing semantics as the
|
||||
/// SetAuditIngestActor wiring race window.
|
||||
/// </remarks>
|
||||
/// <inheritdoc />
|
||||
/// <param name="request">The pull request with time bounds and batch size.</param>
|
||||
/// <param name="context">The server call context.</param>
|
||||
public override async Task<PullAuditEventsResponse> PullAuditEvents(
|
||||
PullAuditEventsRequest request,
|
||||
ServerCallContext context)
|
||||
{
|
||||
var queue = _siteAuditQueue;
|
||||
if (queue is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"PullAuditEvents invoked before SetSiteAuditQueue was called; returning empty response.");
|
||||
return new PullAuditEventsResponse();
|
||||
}
|
||||
|
||||
if (request.BatchSize <= 0)
|
||||
{
|
||||
// Mirrors the SubscribeInstance guard: reject malformed requests
|
||||
// cleanly with InvalidArgument so the caller doesn't see a generic
|
||||
// RpcException from the underlying SQLite parameter validation.
|
||||
throw new RpcException(new GrpcStatus(
|
||||
StatusCode.InvalidArgument, "batch_size must be > 0"));
|
||||
}
|
||||
|
||||
// sinceUtc defaults to DateTime.MinValue when the wrapper is absent —
|
||||
// i.e. "pull from the beginning of recorded history", which is the
|
||||
// intended behaviour for the very first reconciliation cycle.
|
||||
var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue;
|
||||
|
||||
IReadOnlyList<AuditEvent> events;
|
||||
try
|
||||
{
|
||||
events = await queue.ReadPendingSinceAsync(
|
||||
since, request.BatchSize, context.CancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"ReadPendingSinceAsync failed for since={Since} batch={Batch}; returning empty response.",
|
||||
since, request.BatchSize);
|
||||
return new PullAuditEventsResponse();
|
||||
}
|
||||
|
||||
var response = new PullAuditEventsResponse
|
||||
{
|
||||
// batch_size saturated → tell central to issue a follow-up pull
|
||||
// with an advanced cursor. The site doesn't compute the cursor —
|
||||
// central walks it forward from the last returned OccurredAtUtc.
|
||||
MoreAvailable = events.Count >= request.BatchSize,
|
||||
};
|
||||
foreach (var evt in events)
|
||||
{
|
||||
response.Events.Add(AuditEventDtoMapper.ToDto(evt));
|
||||
}
|
||||
|
||||
// Flip to Reconciled AFTER projecting the response so a fault below the
|
||||
// try/catch (mid-response, mid-flip) leaves the rows in Pending/Forwarded
|
||||
// and central pulls them again next cycle. The flip itself is
|
||||
// best-effort — its failure is a warning, not a fault, because central
|
||||
// will dedup on EventId on the next pull.
|
||||
var ids = new List<Guid>(events.Count);
|
||||
foreach (var evt in events)
|
||||
{
|
||||
ids.Add(evt.EventId);
|
||||
}
|
||||
|
||||
if (ids.Count > 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
await queue.MarkReconciledAsync(ids, context.CancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"MarkReconciledAsync failed after PullAuditEvents response of {Count} rows; rows stay Pending for retry.",
|
||||
ids.Count);
|
||||
}
|
||||
}
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Tracks a single active stream so cleanup only removes its own entry.
|
||||
/// </summary>
|
||||
private sealed record StreamEntry(CancellationTokenSource Cts);
|
||||
}
|
||||
Reference in New Issue
Block a user