fix(high-severity): close 8 of 10 open High findings across 8 modules
Comm-016: delete dead HandleConnectionStateChanged + _debugSubscriptions / _inProgressDeployments tracking + ConnectionStateChanged message record. Disconnect detection is owned by the transport layers (gRPC keepalive PING ~25s; Ask-timeout at CommunicationService). Updates the Component-Communication.md design doc to make that explicit. SnF-018: NotificationForwarder.DeliverAsync now discards a corrupt buffered payload (Warning log + return true) instead of returning false and parking the row — honoring the design's "notifications do not park" invariant. DM-018: reconciliation no longer force-sets Enabled, preserving an intentional Disabled state after central failover. ESG-018: DeliverBufferedAsync (both ExternalSystemClient + DatabaseGateway) catches JsonException and returns false, turning a corrupt buffered row into a parked operation instead of a retry-forever poison message. InboundAPI-022: register ActiveNodeGate as IActiveNodeGate in the Central DI branch so standby-node gating is actually wired up in production. NS-019: remove orphaned NotificationDeliveryService / INotificationDeliveryService / NotificationResult; central notification delivery now lives entirely in NotificationOutbox. SEL-016: normalise From/To filters to UTC before ISO-string compare so non-UTC DateTimeOffset clients no longer get spuriously excluded events. TE-017: include Description on attributes/alarms and a HashableConnections projection (protocol, endpoint JSON, failover count) in the revision hash and DiffService; staleness detection now catches description-only and connection-endpoint edits. Transport-001 and Transport-002 (also High) remain Open — they're being handled in a follow-up batch because both touch BundleImporter.cs and must serialise.
This commit is contained in:
@@ -1,32 +0,0 @@
|
||||
namespace ScadaLink.Commons.Interfaces.Services;
|
||||
|
||||
/// <summary>
/// Contract for sending a notification to a named notification list.
/// Implemented by NotificationService, consumed by ScriptRuntimeContext.
/// </summary>
public interface INotificationDeliveryService
{
    /// <summary>
    /// Sends a notification to the named list. A transient failure is handed to
    /// store-and-forward; a permanent failure is reported back to the caller.
    /// </summary>
    /// <param name="listName">Name of the notification list to deliver to.</param>
    /// <param name="subject">Subject line of the notification.</param>
    /// <param name="message">Plain-text body of the notification.</param>
    /// <param name="originInstanceName">Optional name of the instance that triggered the send.</param>
    /// <param name="cancellationToken">Cancellation token for the async operation.</param>
    /// <returns>The <see cref="NotificationResult"/> describing the outcome of the send attempt.</returns>
    Task<NotificationResult> SendAsync(
        string listName, string subject, string message,
        string? originInstanceName = null, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Outcome of a single notification send attempt.
/// </summary>
/// <param name="Success">Whether the attempt is reported as successful.</param>
/// <param name="ErrorMessage">Failure description, or <c>null</c> when there is none.</param>
/// <param name="WasBuffered">Whether the notification was buffered rather than sent; defaults to <c>false</c>.</param>
public record NotificationResult(bool Success, string? ErrorMessage, bool WasBuffered = false);
|
||||
@@ -1,6 +0,0 @@
|
||||
namespace ScadaLink.Commons.Messages.Communication;
|
||||
|
||||
/// <summary>
/// Message carrying a site's connection state together with the time it was observed.
/// </summary>
/// <param name="SiteId">Identifier of the site the state applies to.</param>
/// <param name="IsConnected"><c>true</c> when the site is connected; <c>false</c> when it is disconnected.</param>
/// <param name="Timestamp">Timestamp associated with the state change.</param>
public record ConnectionStateChanged(string SiteId, bool IsConnected, DateTimeOffset Timestamp);
|
||||
@@ -60,17 +60,18 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, (IActorRef Client, ImmutableHashSet<string> ContactAddresses)> _siteClients = new();
|
||||
|
||||
/// <summary>
|
||||
/// Tracks active debug view subscriptions: correlationId → (siteId, subscriber).
|
||||
/// Used to kill debug streams on site disconnection (WP-5).
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, (string SiteId, IActorRef Subscriber)> _debugSubscriptions = new();
|
||||
|
||||
/// <summary>
|
||||
/// Tracks in-progress deployments: deploymentId → siteId.
|
||||
/// On central failover, in-progress deployments are treated as failed (WP-5).
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, string> _inProgressDeployments = new();
|
||||
// Communication-016: the previous _debugSubscriptions / _inProgressDeployments
|
||||
// dictionaries existed solely to support a documented "synchronous kill streams +
|
||||
// mark deployments failed on site disconnect" workflow triggered by
|
||||
// ConnectionStateChanged. No production code ever emitted that message — only
|
||||
// the unit test did — so the workflow was dead from end to end. Disconnect
|
||||
// detection is owned by the underlying transports: the gRPC keepalive PING
|
||||
// signals stream interruption in ~25s (handled by DebugStreamBridgeActor's own
|
||||
// reconnection logic), and an Ask round-trip for a deploy times out at the
|
||||
// CommunicationService layer (caller sees failure). The tracking dicts +
|
||||
// ConnectionStateChanged record + HandleConnectionStateChanged handler are
|
||||
// removed; see docs/requirements/Component-Communication.md "Connection
|
||||
// Failure Behavior" for the keepalive-based contract that survives.
|
||||
|
||||
private ICancelable? _refreshSchedule;
|
||||
|
||||
@@ -165,9 +166,6 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
Receive<SiteHealthReportReplica>(r => ProcessLocally(r.Report));
|
||||
Receive<SubscribeAck>(_ => { /* DistributedPubSub subscribe confirmation */ });
|
||||
|
||||
// Connection state changes
|
||||
Receive<ConnectionStateChanged>(HandleConnectionStateChanged);
|
||||
|
||||
// Route enveloped messages to sites
|
||||
Receive<SiteEnvelope>(HandleSiteEnvelope);
|
||||
|
||||
@@ -335,44 +333,10 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Handles a site connection-state message. A connect is only logged. A disconnect
/// notifies every debug-stream subscriber registered for that site, forgets those
/// subscriptions, and drops the site's tracked in-progress deployments.
/// </summary>
/// <param name="msg">The connection-state message for a single site.</param>
private void HandleConnectionStateChanged(ConnectionStateChanged msg)
{
    if (msg.IsConnected)
    {
        _log.Info("Site {0} connected at {1}", msg.SiteId, msg.Timestamp);
        return;
    }

    _log.Warning("Site {0} disconnected at {1}", msg.SiteId, msg.Timestamp);

    // WP-5: terminate the debug streams that belong to the disconnected site.
    // Snapshot the matches first so the dictionary is not mutated while enumerated.
    var orphanedStreams = _debugSubscriptions
        .Where(entry => entry.Value.SiteId == msg.SiteId)
        .ToArray();

    foreach (var (correlationId, subscription) in orphanedStreams)
    {
        _log.Info("Killing debug stream {0} for disconnected site {1}", correlationId, msg.SiteId);
        subscription.Subscriber.Tell(new DebugStreamTerminated(msg.SiteId, correlationId));
        _debugSubscriptions.Remove(correlationId);
    }

    // WP-5: deployments still in flight to the site are treated as failed.
    var abandonedDeployments =
        (from pair in _inProgressDeployments
         where pair.Value == msg.SiteId
         select pair.Key).ToList();

    foreach (var abandonedId in abandonedDeployments)
    {
        _log.Warning("Deployment {0} to site {1} treated as failed due to disconnection",
            abandonedId, msg.SiteId);
        _inProgressDeployments.Remove(abandonedId);
    }

    // Note: the ClusterClient is deliberately NOT stopped — it handles reconnection internally.
}
|
||||
// Communication-016: HandleConnectionStateChanged removed — no production
|
||||
// caller emitted ConnectionStateChanged, so the workflow ran only in tests.
|
||||
// Disconnect detection is owned by the transport layers (gRPC keepalive +
|
||||
// ClusterClient/Ask timeout).
|
||||
|
||||
private void HandleSiteEnvelope(SiteEnvelope envelope)
|
||||
{
|
||||
@@ -385,9 +349,6 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
return;
|
||||
}
|
||||
|
||||
// Track debug subscriptions for cleanup on disconnect
|
||||
TrackMessageForCleanup(envelope);
|
||||
|
||||
// Route via ClusterClient — Sender is preserved for Ask response routing
|
||||
entry.Client.Tell(
|
||||
new ClusterClient.Send("/user/site-communication", envelope.Message),
|
||||
@@ -485,23 +446,8 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
_log.Info("Site ClusterClient cache refreshed with {0} site(s)", _siteClients.Count);
|
||||
}
|
||||
|
||||
/// <summary>
/// Records bookkeeping for an outgoing site envelope: a debug-view subscribe is
/// stored under its correlation id with the site id and current sender, a
/// debug-view unsubscribe removes that entry, and a deploy command is stored
/// under its deployment id with the target site id. Other messages are ignored.
/// </summary>
/// <param name="envelope">The envelope about to be routed to a site.</param>
private void TrackMessageForCleanup(SiteEnvelope envelope)
{
    var payload = envelope.Message;

    if (payload is Commons.Messages.DebugView.SubscribeDebugViewRequest subscribe)
    {
        _debugSubscriptions[subscribe.CorrelationId] = (envelope.SiteId, Sender);
    }
    else if (payload is Commons.Messages.DebugView.UnsubscribeDebugViewRequest unsubscribe)
    {
        _debugSubscriptions.Remove(unsubscribe.CorrelationId);
    }
    else if (payload is Commons.Messages.Deployment.DeployInstanceCommand command)
    {
        _inProgressDeployments[command.DeploymentId] = envelope.SiteId;
    }
}
|
||||
// Communication-016: TrackMessageForCleanup removed — the dicts it fed
|
||||
// existed solely to support the dead ConnectionStateChanged workflow.
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
@@ -547,11 +493,8 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
/// <inheritdoc />
|
||||
protected override void PostStop()
|
||||
{
|
||||
_log.Info("CentralCommunicationActor stopped. In-progress deployments treated as failed (WP-5).");
|
||||
_log.Info("CentralCommunicationActor stopped");
|
||||
_refreshSchedule?.Cancel();
|
||||
// On central failover, all in-progress deployments are failed
|
||||
_inProgressDeployments.Clear();
|
||||
_debugSubscriptions.Clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -228,7 +228,8 @@ public class DeploymentService
|
||||
// logged loudly for operator reconciliation but must not flip
|
||||
// the already-committed Success record back to Failed.
|
||||
await ApplyPostSuccessSideEffectsAsync(
|
||||
instance, deploymentId, revisionHash, configJson, cancellationToken);
|
||||
instance, deploymentId, revisionHash, configJson,
|
||||
forceEnabledState: true, cancellationToken);
|
||||
}
|
||||
|
||||
// Audit log
|
||||
@@ -677,8 +678,22 @@ public class DeploymentService
|
||||
// the instance State to Enabled and store/refresh the deployed
|
||||
// config snapshot — otherwise the central state machine and the
|
||||
// deployed-snapshot invariant diverge from what the site is running.
|
||||
//
|
||||
// DeploymentManager-018: the reconciliation path runs only when the
|
||||
// prior record is InProgress or timeout-Failed — exactly the cases
|
||||
// that survive a central failover. The in-memory operation lock is
|
||||
// lost on failover, so an operator may have legitimately invoked
|
||||
// Disable on the instance between the original timed-out deploy and
|
||||
// this redeploy. Disable does not change the deployed config, so the
|
||||
// site still reports the target revision hash. Reconciliation must
|
||||
// therefore PRESERVE an intentional Disabled state instead of
|
||||
// silently flipping it back to Enabled — pass forceEnabledState:
|
||||
// false so the helper only promotes NotDeployed → Enabled (the
|
||||
// first-deploy-timed-out case) and leaves an explicit Disabled
|
||||
// alone.
|
||||
await ApplyPostSuccessSideEffectsAsync(
|
||||
instance, prior.DeploymentId, targetRevisionHash, configJson, cancellationToken);
|
||||
instance, prior.DeploymentId, targetRevisionHash, configJson,
|
||||
forceEnabledState: false, cancellationToken);
|
||||
|
||||
await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
|
||||
instance.Id.ToString(), instance.UniqueName,
|
||||
@@ -713,6 +728,19 @@ public class DeploymentService
|
||||
/// deployed config snapshot (WP-8). Factored into one helper so the two
|
||||
/// paths cannot drift (DeploymentManager-015).
|
||||
///
|
||||
/// DeploymentManager-018: <paramref name="forceEnabledState"/> distinguishes
|
||||
/// the two callers. The normal deploy path passes <c>true</c> — a fresh
|
||||
/// successful apply legitimately puts the instance into <see cref="InstanceState.Enabled"/>
|
||||
/// (the documented "Deploy on a Disabled instance also enables it" semantics
|
||||
/// of <see cref="StateTransitionValidator"/>). The reconciliation path
|
||||
/// passes <c>false</c>: it is reconciling a *prior* deployment that may
|
||||
/// have completed before the current operator session (central failover
|
||||
/// loses the in-memory operation lock, so an operator may have legitimately
|
||||
/// Disabled the instance in between). On that path we only promote
|
||||
/// <see cref="InstanceState.NotDeployed"/> → <see cref="InstanceState.Enabled"/>
|
||||
/// (the first-deploy-timed-out case) and leave an explicit Disabled alone,
|
||||
/// so reconciliation never silently undoes a Disable.
|
||||
///
|
||||
/// Best-effort: the deployment record's terminal <see cref="DeploymentStatus.Success"/>
|
||||
/// status is already committed by the caller before this runs. A failure
|
||||
/// here is logged loudly for operator reconciliation but is NOT propagated —
|
||||
@@ -723,12 +751,20 @@ public class DeploymentService
|
||||
string deploymentId,
|
||||
string revisionHash,
|
||||
string configJson,
|
||||
bool forceEnabledState,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
// WP-4: Update instance state to Enabled on successful deployment
|
||||
instance.State = InstanceState.Enabled;
|
||||
// WP-4: Update instance state to Enabled on successful deployment.
|
||||
// DeploymentManager-018: on the reconciliation path
|
||||
// (forceEnabledState=false) only promote NotDeployed → Enabled,
|
||||
// preserving an intentional Disabled state set between the original
|
||||
// timed-out deploy and the redeploy.
|
||||
if (forceEnabledState || instance.State == InstanceState.NotDeployed)
|
||||
{
|
||||
instance.State = InstanceState.Enabled;
|
||||
}
|
||||
await _repository.UpdateInstanceAsync(instance, cancellationToken);
|
||||
|
||||
// WP-8: Store deployed config snapshot
|
||||
|
||||
@@ -148,7 +148,26 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
public async Task<bool> DeliverBufferedAsync(
|
||||
StoreAndForwardMessage message, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var payload = JsonSerializer.Deserialize<CachedWritePayload>(message.PayloadJson);
|
||||
// ExternalSystemGateway-018: a malformed (not just empty/null-fielded)
|
||||
// PayloadJson would otherwise throw `JsonException` here, which the S&F
|
||||
// engine treats as a transient failure and retries forever (poison
|
||||
// message). Re-running the same deserialization against the same payload
|
||||
// will throw deterministically, so JsonException is permanent — log,
|
||||
// and return false so the S&F engine parks the message instead.
|
||||
CachedWritePayload? payload;
|
||||
try
|
||||
{
|
||||
payload = JsonSerializer.Deserialize<CachedWritePayload>(message.PayloadJson);
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Buffered CachedDbWrite message {Id} has malformed JSON payload; parking.",
|
||||
message.Id);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (payload == null || string.IsNullOrEmpty(payload.ConnectionName) || string.IsNullOrEmpty(payload.Sql))
|
||||
{
|
||||
_logger.LogError("Buffered CachedDbWrite message {Id} has an unreadable payload; parking.", message.Id);
|
||||
|
||||
@@ -173,7 +173,26 @@ public class ExternalSystemClient : IExternalSystemClient
|
||||
public async Task<bool> DeliverBufferedAsync(
|
||||
StoreAndForwardMessage message, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var payload = JsonSerializer.Deserialize<CachedCallPayload>(message.PayloadJson);
|
||||
// ExternalSystemGateway-018: a malformed (not just empty/null-fielded)
|
||||
// PayloadJson would otherwise throw `JsonException` here, which the S&F
|
||||
// engine treats as a transient failure and retries forever (poison
|
||||
// message). Re-running the same deserialization against the same payload
|
||||
// will throw deterministically, so JsonException is permanent — log,
|
||||
// and return false so the S&F engine parks the message instead.
|
||||
CachedCallPayload? payload;
|
||||
try
|
||||
{
|
||||
payload = JsonSerializer.Deserialize<CachedCallPayload>(message.PayloadJson);
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Buffered ExternalSystem message {Id} has malformed JSON payload; parking.",
|
||||
message.Id);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (payload == null || string.IsNullOrEmpty(payload.SystemName) || string.IsNullOrEmpty(payload.MethodName))
|
||||
{
|
||||
_logger.LogError("Buffered ExternalSystem message {Id} has an unreadable payload; parking.", message.Id);
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
using Akka.Cluster;
|
||||
using ScadaLink.Host.Actors;
|
||||
using ScadaLink.InboundAPI;
|
||||
|
||||
namespace ScadaLink.Host.Health;
|
||||
|
||||
/// <summary>
/// InboundAPI-008 / InboundAPI-022: production <see cref="IActiveNodeGate"/> that
/// answers from the state of the running Akka.NET cluster.
///
/// The inbound API is "Central cluster only (active node)": a standby central node
/// must not execute method scripts or <c>Route.To()</c> calls. The check mirrors the
/// leadership test in <see cref="ActiveNodeHealthCheck"/> (this node is
/// <see cref="MemberStatus.Up"/> and is the cluster leader), which lets
/// <see cref="InboundApiEndpointFilter"/> answer HTTP 503 on a standby.
///
/// Registered only in the Central-role branch of <c>Program.cs</c> and resolved per
/// request from <c>HttpContext.RequestServices</c>. While <c>AkkaHostedService</c> is
/// still warming up (<c>ActorSystem == null</c>) or the node has not reached
/// <see cref="MemberStatus.Up"/>, <c>IsActiveNode</c> is <c>false</c> — the same
/// safe-by-default answer a standby gives.
/// </summary>
public sealed class ActiveNodeGate : IActiveNodeGate
{
    private readonly AkkaHostedService _akkaService;

    /// <summary>Creates a gate bound to the given Akka hosted service.</summary>
    /// <param name="akkaService">The Akka hosted service exposing the cluster's <see cref="Akka.Actor.ActorSystem"/>.</param>
    public ActiveNodeGate(AkkaHostedService akkaService) => _akkaService = akkaService;

    /// <summary>
    /// <c>true</c> only when this node is <see cref="MemberStatus.Up"/> AND is the
    /// current cluster leader. Every other state yields <c>false</c>: actor system
    /// not started yet, node still joining, or node is a standby.
    /// </summary>
    public bool IsActiveNode
    {
        get
        {
            var actorSystem = _akkaService.ActorSystem;
            if (actorSystem != null)
            {
                var cluster = Cluster.Get(actorSystem);
                var me = cluster.SelfMember;

                if (me.Status == MemberStatus.Up)
                {
                    // Active means: a leader is known and it is this node's address.
                    var currentLeader = cluster.State.Leader;
                    return currentLeader != null && currentLeader == me.Address;
                }
            }

            return false;
        }
    }
}
|
||||
@@ -120,6 +120,16 @@ try
|
||||
builder.Services.AddSingleton<AkkaHostedService>();
|
||||
builder.Services.AddHostedService(sp => sp.GetRequiredService<AkkaHostedService>());
|
||||
|
||||
// InboundAPI-022: register the production IActiveNodeGate implementation so
|
||||
// standby-node gating is actually enforced (the InboundApiEndpointFilter
|
||||
// consults IActiveNodeGate and defaults to "allow" when none is registered,
|
||||
// which leaves the design's "central cluster only (active node)" guarantee
|
||||
// unenforced in deployed binaries). The gate is backed by the same Akka
|
||||
// cluster-leadership check as ActiveNodeHealthCheck above, so the inbound
|
||||
// API and the /health/active endpoint Traefik routes against agree on
|
||||
// which node is active.
|
||||
builder.Services.AddSingleton<ScadaLink.InboundAPI.IActiveNodeGate, ActiveNodeGate>();
|
||||
|
||||
// Cluster node status provider scoped to the Central role — feeds the
|
||||
// CentralHealthReportLoop so the central cluster appears on /monitoring/health.
|
||||
builder.Services.AddSingleton<IClusterNodeProvider>(sp =>
|
||||
|
||||
@@ -34,8 +34,7 @@ public static class SiteServiceRegistration
|
||||
// Sites no longer deliver notifications over SMTP — a buffered notification is
|
||||
// forwarded to the central cluster (via NotificationForwarder / SiteCommunicationActor),
|
||||
// and central owns SMTP delivery through the Notification Outbox. The SMTP machinery
|
||||
// (OAuth2TokenService, ISmtpClientWrapper, INotificationDeliveryService) has no
|
||||
// consumer on a site node.
|
||||
// (OAuth2TokenService, ISmtpClientWrapper) has no consumer on a site node.
|
||||
|
||||
// Health report transport: sends SiteHealthReport to SiteCommunicationActor via Akka
|
||||
services.AddSingleton<ISiteIdentityProvider, SiteIdentityProvider>();
|
||||
|
||||
@@ -10,14 +10,14 @@ namespace ScadaLink.NotificationOutbox.Delivery;
|
||||
/// <summary>
|
||||
/// Task 12: Email channel delivery adapter for the central notification outbox.
|
||||
///
|
||||
/// Reuses the <see cref="ScadaLink.NotificationService"/> SMTP machinery —
|
||||
/// Reuses the <see cref="ScadaLink.NotificationService"/> SMTP primitives —
|
||||
/// <see cref="ISmtpClientWrapper"/>, <see cref="SmtpTlsModeParser"/>,
|
||||
/// <see cref="OAuth2TokenService"/> and the typed <see cref="SmtpPermanentException"/>.
|
||||
/// The connect/auth/send/disconnect sequence and error classification mirror
|
||||
/// <c>NotificationDeliveryService.DeliverAsync</c>; this adapter, however, maps the
|
||||
/// result to the outbox's three-way <see cref="DeliveryOutcome"/> (Success / Permanent
|
||||
/// / Transient) rather than the S&F-coupled <c>NotificationResult</c>, which cannot
|
||||
/// distinguish a permanent failure from a buffered transient one.
|
||||
/// This adapter owns the full connect/auth/send/disconnect sequence and maps the
|
||||
/// outcome to the outbox's three-way <see cref="DeliveryOutcome"/> (Success / Permanent /
|
||||
/// Transient) — the canonical central-side email delivery path. NS-019: the prior
|
||||
/// site-shaped <c>NotificationDeliveryService</c> was deleted with sites no longer
|
||||
/// delivering notifications.
|
||||
/// </summary>
|
||||
public sealed class EmailNotificationDeliveryAdapter : INotificationDeliveryAdapter
|
||||
{
|
||||
@@ -44,9 +44,8 @@ public sealed class EmailNotificationDeliveryAdapter : INotificationDeliveryAdap
|
||||
_smtpClientFactory = smtpClientFactory;
|
||||
_logger = logger;
|
||||
_tokenService = tokenService;
|
||||
// Mirrors NotificationDeliveryService: NotificationOptions supplies the
|
||||
// documented fallback values used when a deployed SmtpConfiguration row
|
||||
// leaves a field unset (non-positive).
|
||||
// NotificationOptions supplies the documented fallback values used when a
|
||||
// deployed SmtpConfiguration row leaves a field unset (non-positive).
|
||||
_options = options?.Value ?? new NotificationOptions();
|
||||
}
|
||||
|
||||
@@ -81,7 +80,7 @@ public sealed class EmailNotificationDeliveryAdapter : INotificationDeliveryAdap
|
||||
}
|
||||
|
||||
// An unknown TLS mode is a configuration error that retrying cannot fix —
|
||||
// surface it as a permanent failure (mirrors NS-005 in NotificationDeliveryService).
|
||||
// surface it as a permanent failure (NS-005 SMTP TLS validation policy).
|
||||
SmtpTlsMode tlsMode;
|
||||
try
|
||||
{
|
||||
@@ -154,11 +153,9 @@ public sealed class EmailNotificationDeliveryAdapter : INotificationDeliveryAdap
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Delivers the plain-text BCC email via SMTP. Mirrors the connect/auth/send/
|
||||
/// disconnect sequence of <c>NotificationDeliveryService.DeliverAsync</c>: a
|
||||
/// permanent failure surfaces as <see cref="SmtpPermanentException"/>; transient
|
||||
/// failures propagate for the caller's classifier; the connection is always torn
|
||||
/// down in the finally block.
|
||||
/// Delivers the plain-text BCC email via SMTP. A permanent failure surfaces as
|
||||
/// <see cref="SmtpPermanentException"/>; transient failures propagate for the
|
||||
/// caller's classifier; the connection is always torn down in the finally block.
|
||||
/// </summary>
|
||||
private async Task SendAsync(
|
||||
SmtpConfiguration config,
|
||||
|
||||
@@ -57,7 +57,7 @@ public class MailKitSmtpClientWrapper : ISmtpClientWrapper, IDisposable
|
||||
// worst, sending where authentication was required. Authentication being
|
||||
// skipped must never be silent: each of these is a permanent configuration
|
||||
// fault, surfaced as SmtpPermanentException so SendAsync returns a clean
|
||||
// failure and DeliverBufferedAsync parks the buffered message.
|
||||
// failure that the central Notification Outbox dispatcher classifies as permanent.
|
||||
if (string.IsNullOrEmpty(credentials))
|
||||
{
|
||||
throw new SmtpPermanentException(
|
||||
|
||||
@@ -1,448 +0,0 @@
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Entities.Notifications;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
using ScadaLink.StoreAndForward;
|
||||
|
||||
namespace ScadaLink.NotificationService;
|
||||
|
||||
/// <summary>
|
||||
/// WP-11: Notification delivery via SMTP.
|
||||
/// WP-12: Error classification and S&F integration.
|
||||
/// Transient: connection refused, timeout, SMTP 4xx → hand to S&F.
|
||||
/// Permanent: SMTP 5xx → returned to script.
|
||||
/// </summary>
|
||||
public class NotificationDeliveryService : INotificationDeliveryService, IDisposable
|
||||
{
|
||||
private readonly INotificationRepository _repository;
|
||||
private readonly Func<ISmtpClientWrapper> _smtpClientFactory;
|
||||
private readonly OAuth2TokenService? _tokenService;
|
||||
private readonly StoreAndForwardService? _storeAndForward;
|
||||
private readonly ILogger<NotificationDeliveryService> _logger;
|
||||
private readonly NotificationOptions _options;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the NotificationDeliveryService with the specified dependencies.
|
||||
/// </summary>
|
||||
/// <param name="repository">The notification repository for data access.</param>
|
||||
/// <param name="smtpClientFactory">Factory for creating SMTP client instances.</param>
|
||||
/// <param name="logger">Logger for diagnostic messages.</param>
|
||||
/// <param name="tokenService">Optional OAuth2 token service for authentication.</param>
|
||||
/// <param name="storeAndForward">Optional store-and-forward service for handling transient failures.</param>
|
||||
/// <param name="options">Optional notification options with fallback values.</param>
|
||||
public NotificationDeliveryService(
|
||||
INotificationRepository repository,
|
||||
Func<ISmtpClientWrapper> smtpClientFactory,
|
||||
ILogger<NotificationDeliveryService> logger,
|
||||
OAuth2TokenService? tokenService = null,
|
||||
StoreAndForwardService? storeAndForward = null,
|
||||
IOptions<NotificationOptions>? options = null)
|
||||
{
|
||||
_repository = repository;
|
||||
_smtpClientFactory = smtpClientFactory;
|
||||
_logger = logger;
|
||||
_tokenService = tokenService;
|
||||
_storeAndForward = storeAndForward;
|
||||
// NS-017: NotificationOptions supplies the documented fallback values used
|
||||
// when a deployed SmtpConfiguration row leaves a field unset (non-positive).
|
||||
_options = options?.Value ?? new NotificationOptions();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<NotificationResult> SendAsync(
|
||||
string listName,
|
||||
string subject,
|
||||
string message,
|
||||
string? originInstanceName = null,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
var list = await _repository.GetListByNameAsync(listName, cancellationToken);
|
||||
if (list == null)
|
||||
{
|
||||
return new NotificationResult(false, $"Notification list '{listName}' not found");
|
||||
}
|
||||
|
||||
var recipients = await _repository.GetRecipientsByListIdAsync(list.Id, cancellationToken);
|
||||
if (recipients.Count == 0)
|
||||
{
|
||||
return new NotificationResult(false, $"Notification list '{listName}' has no recipients");
|
||||
}
|
||||
|
||||
var smtpConfigs = await _repository.GetAllSmtpConfigurationsAsync(cancellationToken);
|
||||
var smtpConfig = smtpConfigs.FirstOrDefault();
|
||||
if (smtpConfig == null)
|
||||
{
|
||||
return new NotificationResult(false, "No SMTP configuration available");
|
||||
}
|
||||
|
||||
// NS-005: validate the configured TLS mode up front — an unknown value is a
|
||||
// configuration error and must surface as a clean result, not a silent
|
||||
// fallback to opportunistic TLS negotiation.
|
||||
try
|
||||
{
|
||||
SmtpTlsModeParser.Parse(smtpConfig.TlsMode);
|
||||
}
|
||||
catch (ArgumentException ex)
|
||||
{
|
||||
_logger.LogError("Invalid SMTP TLS mode for list {List}: {Reason}", listName, ex.Message);
|
||||
return new NotificationResult(false, ex.Message);
|
||||
}
|
||||
|
||||
// NS-008: validate every email address before attempting delivery. A single
|
||||
// malformed address previously caused MailboxAddress.Parse to throw a
|
||||
// ParseException that escaped SendAsync unhandled; it must instead produce a
|
||||
// clean NotificationResult the calling script can handle.
|
||||
var addressError = EmailAddressValidator.ValidateAddresses(smtpConfig.FromAddress, recipients);
|
||||
if (addressError != null)
|
||||
{
|
||||
_logger.LogWarning("Notification to list {List} has invalid addresses: {Reason}", listName, addressError);
|
||||
return new NotificationResult(false, addressError);
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await DeliverAsync(smtpConfig, recipients, subject, message, cancellationToken);
|
||||
return new NotificationResult(true, null);
|
||||
}
|
||||
catch (SmtpPermanentException ex)
|
||||
{
|
||||
// WP-12: Permanent SMTP failure — returned to script.
|
||||
// NS-009: scrub credential fragments out of the server-supplied message
|
||||
// before logging or returning it.
|
||||
var detail = CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials);
|
||||
_logger.LogError(
|
||||
"Permanent SMTP failure sending to list {List}: {Detail}", listName, detail);
|
||||
return new NotificationResult(false, $"Permanent SMTP error: {detail}");
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
// NS-002: a caller-requested cancellation propagates; it is not buffered.
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex) when (SmtpErrorClassifier.IsTransient(ex, cancellationToken))
|
||||
{
|
||||
// WP-12: Transient SMTP failure — hand to S&F.
|
||||
// NS-009: scrub credential fragments before logging.
|
||||
_logger.LogWarning(
|
||||
"Transient SMTP failure sending to list {List} ({ExceptionType}): {Detail}; buffering for retry",
|
||||
listName, ex.GetType().Name, CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials));
|
||||
|
||||
if (_storeAndForward == null)
|
||||
{
|
||||
return new NotificationResult(false, "Transient SMTP error and store-and-forward not available");
|
||||
}
|
||||
|
||||
var payload = JsonSerializer.Serialize(new
|
||||
{
|
||||
ListName = listName,
|
||||
Subject = subject,
|
||||
Message = message
|
||||
});
|
||||
|
||||
// attemptImmediateDelivery: false — DeliverAsync was already attempted
|
||||
// above; letting EnqueueAsync re-invoke the handler would send twice.
|
||||
await _storeAndForward.EnqueueAsync(
|
||||
StoreAndForwardCategory.Notification,
|
||||
listName,
|
||||
payload,
|
||||
originInstanceName,
|
||||
smtpConfig.MaxRetries > 0 ? smtpConfig.MaxRetries : null,
|
||||
smtpConfig.RetryDelay > TimeSpan.Zero ? smtpConfig.RetryDelay : null,
|
||||
attemptImmediateDelivery: false);
|
||||
|
||||
return new NotificationResult(true, null, WasBuffered: true);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// NS-015: a failure that SmtpErrorClassifier does not recognise (Unknown) —
|
||||
// most importantly an OAuth2 token-fetch failure (HttpRequestException
|
||||
// from EnsureSuccessStatusCode, or InvalidOperationException from a
|
||||
// malformed credential triple) — used to fall through all the catch
|
||||
// clauses above and escape SendAsync as a raw exception to the calling
|
||||
// script, which the INotificationDeliveryService contract never
|
||||
// advertises. Convert any otherwise-unhandled exception into a clean,
|
||||
// credential-scrubbed permanent NotificationResult: returning control to
|
||||
// the script is the safe default. (A caller-requested cancellation is
|
||||
// already re-thrown by the filter above and never reaches here.)
|
||||
var detail = CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials);
|
||||
_logger.LogError(
|
||||
"Unclassified failure sending to list {List} ({ExceptionType}): {Detail}",
|
||||
listName, ex.GetType().Name, detail);
|
||||
return new NotificationResult(false, $"Notification delivery failed: {detail}");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// WP-11/12: Delivers a buffered notification during a store-and-forward retry
/// sweep — re-resolves the list, recipients and SMTP config and re-attempts
/// delivery.
/// </summary>
/// <remarks>
/// Outcome contract, as consumed by the store-and-forward engine:
/// <list type="bullet">
/// <item><description><c>true</c> — the notification was delivered.</description></item>
/// <item><description><c>false</c> — a failure that retrying cannot fix (corrupt
/// payload, missing list/recipients/SMTP config, invalid TLS mode or address,
/// permanent SMTP error, non-retryable token failure); the message is parked.</description></item>
/// <item><description>an exception — a transient failure (or a caller-requested
/// cancellation); the engine retries on a later sweep.</description></item>
/// </list>
/// </remarks>
/// <param name="message">The buffered store-and-forward message to deliver.</param>
/// <param name="cancellationToken">Cancellation token for the delivery attempt.</param>
/// <returns><c>true</c> on success; <c>false</c> when the message must be parked.</returns>
public async Task<bool> DeliverBufferedAsync(
    StoreAndForwardMessage message, CancellationToken cancellationToken = default)
{
    // A payload that is not valid JSON (or is missing entirely) makes Deserialize
    // throw. Because every exception leaving this handler is treated as transient,
    // letting it escape would turn one corrupt row into a poison message that is
    // retried on every sweep. Retrying cannot repair the bytes, so park it instead.
    BufferedNotification? payload;
    try
    {
        payload = JsonSerializer.Deserialize<BufferedNotification>(message.PayloadJson);
    }
    catch (Exception ex) when (ex is JsonException or ArgumentNullException)
    {
        _logger.LogError(
            "Buffered notification message {Id} has a corrupt payload ({Reason}); parking.",
            message.Id, ex.Message);
        return false;
    }

    if (payload == null || string.IsNullOrEmpty(payload.ListName))
    {
        _logger.LogError("Buffered notification message {Id} has an unreadable payload; parking.", message.Id);
        return false;
    }

    var list = await _repository.GetListByNameAsync(payload.ListName, cancellationToken);
    if (list == null)
    {
        _logger.LogError(
            "Buffered notification to list '{List}' cannot be delivered — the list no longer exists; parking.",
            payload.ListName);
        return false;
    }

    var recipients = await _repository.GetRecipientsByListIdAsync(list.Id, cancellationToken);
    if (recipients.Count == 0)
    {
        _logger.LogError("Buffered notification to list '{List}' has no recipients; parking.", payload.ListName);
        return false;
    }

    var smtpConfig = (await _repository.GetAllSmtpConfigurationsAsync(cancellationToken)).FirstOrDefault();
    if (smtpConfig == null)
    {
        _logger.LogError("Buffered notification cannot be delivered — no SMTP configuration available; parking.");
        return false;
    }

    // NS-005: an unknown TLS mode is a configuration error that retrying cannot
    // fix — park the buffered message rather than throwing on every sweep.
    try
    {
        SmtpTlsModeParser.Parse(smtpConfig.TlsMode);
    }
    catch (ArgumentException ex)
    {
        _logger.LogError(
            "Buffered notification to list '{List}' cannot be delivered — {Reason}; parking.",
            payload.ListName, ex.Message);
        return false;
    }

    // NS-008: a malformed address cannot be fixed by retrying — park it.
    var addressError = EmailAddressValidator.ValidateAddresses(smtpConfig.FromAddress, recipients);
    if (addressError != null)
    {
        _logger.LogError(
            "Buffered notification to list '{List}' has invalid addresses ({Reason}); parking.",
            payload.ListName, addressError);
        return false;
    }

    try
    {
        await DeliverAsync(smtpConfig, recipients, payload.Subject, payload.Message, cancellationToken);
        return true;
    }
    catch (SmtpPermanentException ex)
    {
        // NS-009: scrub credential fragments out of the message before logging.
        _logger.LogError(
            "Buffered notification to list '{List}' failed permanently ({Detail}); parking.",
            payload.ListName, CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials));
        return false;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // A handler shutdown cancellation is neither a delivery success nor a
        // permanent failure — let it propagate so the engine does not park.
        throw;
    }
    catch (Exception ex) when (SmtpErrorClassifier.IsTransient(ex, cancellationToken))
    {
        // A typed transient SMTP error: re-throw so the S&F engine retries.
        throw;
    }
    catch (Exception ex)
    {
        // NS-014: an exception SmtpErrorClassifier does not recognise (Unknown) —
        // chiefly an OAuth2 token-fetch failure — used to escape this handler.
        // The S&F engine treats ANY thrown exception as transient, so a
        // permanently-broken config (bad client secret, malformed credential
        // triple) was retried on every sweep until MaxRetries, burning token
        // endpoint calls. Decide deliberately rather than letting it leak:
        //   - an HttpRequestException with a 5xx token-endpoint status is a
        //     transient outage → re-throw so the engine retries;
        //   - everything else (a 4xx/401 token rejection, a malformed credential
        //     InvalidOperationException, any other unclassified fault) is not
        //     fixable by retrying → return false so the message is parked.
        if (ex is HttpRequestException { StatusCode: { } status } && (int)status is >= 500 and < 600)
        {
            _logger.LogWarning(
                "Buffered notification to list '{List}' hit a transient OAuth2 token-endpoint error ({Status}); will retry.",
                payload.ListName, (int)status);
            throw;
        }

        _logger.LogError(
            "Buffered notification to list '{List}' failed with a non-retryable error ({ExceptionType}: {Detail}); parking.",
            payload.ListName, ex.GetType().Name,
            CredentialRedactor.Scrub(ex.Message, smtpConfig.Credentials));
        return false;
    }
}
|
||||
|
||||
/// <summary>
/// Shape of the JSON payload stored for a buffered notification: the target
/// list name plus the subject and body to send. <c>DeliverBufferedAsync</c>
/// deserialises the buffered <c>PayloadJson</c> into this record, so the member
/// names must match the property names written when the message is enqueued.
/// </summary>
private sealed record BufferedNotification(string ListName, string Subject, string Message);
|
||||
|
||||
/// <summary>
|
||||
/// NS-007: throttles concurrent SMTP deliveries to the configured
|
||||
/// <c>MaxConcurrentConnections</c>. One SMTP config is deployed per site, so the
|
||||
/// limit is a stable per-site invariant; it is captured lazily on first use.
|
||||
/// NS-018: a <see cref="Lazy{T}"/> replaces the hand-rolled double-checked
|
||||
/// init — its publication is correctly synchronised (no lock-free read of a
|
||||
/// non-volatile field) and it is disposed in <see cref="Dispose"/>.
|
||||
/// Null until the first call to <c>GetConcurrencyLimiter</c> creates it.
/// </summary>
|
||||
private Lazy<SemaphoreSlim>? _concurrencyLimiter;
|
||||
// Guards creation of the limiter (GetConcurrencyLimiter) and its teardown
// (Dispose); both take this lock before touching the two fields around it.
private readonly object _limiterLock = new();
|
||||
// Set to true by Dispose under _limiterLock; GetConcurrencyLimiter throws
// ObjectDisposedException once it is set.
private bool _disposed;
|
||||
|
||||
/// <summary>
/// Returns the shared semaphore that bounds concurrent SMTP deliveries, creating
/// it on the first call. The size is taken from the first configuration seen and
/// is not changed by later calls.
/// </summary>
/// <param name="config">The SMTP configuration whose connection limit sizes the limiter.</param>
/// <returns>The semaphore every delivery must acquire before connecting.</returns>
/// <exception cref="ObjectDisposedException">The service has already been disposed.</exception>
private SemaphoreSlim GetConcurrencyLimiter(SmtpConfiguration config)
{
    // NS-018: resolve the slot count up front so the Lazy factory captures a
    // fixed number rather than something that could change between calls.
    int slots;
    if (config.MaxConcurrentConnections > 0)
    {
        slots = config.MaxConcurrentConnections;
    }
    else if (_options.MaxConcurrentConnections > 0)
    {
        // NS-017: the deployed row leaves the limit unset — use the
        // NotificationOptions value.
        slots = _options.MaxConcurrentConnections;
    }
    else
    {
        // NS-017: neither source supplies a value — design-doc default.
        slots = 5;
    }

    lock (_limiterLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        if (_concurrencyLimiter is null)
        {
            _concurrencyLimiter = new Lazy<SemaphoreSlim>(() => new SemaphoreSlim(slots, slots));
        }

        return _concurrencyLimiter.Value;
    }
}
|
||||
|
||||
/// <summary>
/// NS-018: releases the concurrency limiter if one was ever created. The service
/// is registered as a scoped DI service, so without this every scope would leak
/// the <see cref="SemaphoreSlim"/> it created. Calling it more than once is a no-op.
/// </summary>
public void Dispose()
{
    bool firstCall;

    lock (_limiterLock)
    {
        firstCall = !_disposed;
        if (firstCall)
        {
            _disposed = true;

            // Only touch Value when the semaphore actually exists — reading it
            // otherwise would create a semaphore just to dispose it.
            if (_concurrencyLimiter is not null && _concurrencyLimiter.IsValueCreated)
            {
                _concurrencyLimiter.Value.Dispose();
            }
        }
    }

    // Repeat calls stop here, exactly as the early return used to.
    if (firstCall)
    {
        GC.SuppressFinalize(this);
    }
}
|
||||
|
||||
/// <summary>
/// Delivers an email via SMTP. Throws on failure (transient errors and
/// <see cref="SmtpPermanentException"/> propagate; the caller classifies them).
/// </summary>
/// <remarks>
/// The delivery holds one slot of the per-site concurrency limiter for its whole
/// duration. The slot is released on every exit path — including a failure to
/// create the SMTP client — so a faulting client factory cannot drain the limiter.
/// </remarks>
/// <param name="config">The SMTP configuration to use for the connection.</param>
/// <param name="recipients">The list of recipients to deliver to.</param>
/// <param name="subject">The email subject line.</param>
/// <param name="body">The plain-text email body.</param>
/// <param name="cancellationToken">Cancellation token for the delivery.</param>
/// <exception cref="SmtpPermanentException">The failure was classified as permanent.</exception>
/// <exception cref="OperationCanceledException">The caller cancelled the delivery.</exception>
internal async Task DeliverAsync(
    SmtpConfiguration config,
    IReadOnlyList<NotificationRecipient> recipients,
    string subject,
    string body,
    CancellationToken cancellationToken)
{
    var tlsMode = SmtpTlsModeParser.Parse(config.TlsMode);

    // NS-007: bound the number of concurrent SMTP connections per site.
    var limiter = GetConcurrencyLimiter(config);
    await limiter.WaitAsync(cancellationToken);

    try
    {
        // NS-004: create exactly one client and dispose the one actually used.
        // The factory runs inside the slot-releasing try: it used to run between
        // WaitAsync and the try block, so a throwing factory leaked the slot.
        var smtp = _smtpClientFactory();
        using var disposable = smtp as IDisposable;

        try
        {
            // NS-005/NS-007: explicit TLS mode and the configured connection timeout.
            // NS-017: when the deployed SmtpConfiguration row leaves the timeout
            // unset (non-positive), fall back to the NotificationOptions value.
            var timeoutSeconds = config.ConnectionTimeoutSeconds > 0
                ? config.ConnectionTimeoutSeconds
                : _options.ConnectionTimeoutSeconds;
            await smtp.ConnectAsync(
                config.Host, config.Port, tlsMode, timeoutSeconds, cancellationToken);

            // Resolve credentials (OAuth2 token fetched/cached by the token service).
            var credentials = config.Credentials;
            if (config.AuthType.Equals("oauth2", StringComparison.OrdinalIgnoreCase) && _tokenService != null && credentials != null)
            {
                var token = await _tokenService.GetTokenAsync(credentials, cancellationToken);
                credentials = token;
            }

            // NS-021: OAuth2 XOAUTH2 requires the user identity (FromAddress) to be
            // sent alongside the access token; an empty user is rejected by M365.
            await smtp.AuthenticateAsync(
                config.AuthType,
                credentials,
                oauth2UserName: config.FromAddress,
                cancellationToken: cancellationToken);

            var bccAddresses = recipients.Select(r => r.EmailAddress).ToList();
            await smtp.SendAsync(config.FromAddress, bccAddresses, subject, body, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // NS-002: A deliberately cancelled token must propagate as a cancellation,
            // not be misclassified as a transient SMTP failure and buffered for retry.
            throw;
        }
        catch (Exception ex) when (SmtpErrorClassifier.Classify(ex, cancellationToken) == SmtpErrorClass.Permanent
                                   && ex is not SmtpPermanentException)
        {
            // NS-003: Permanent SMTP failure (5xx) — surface a typed permanent exception.
            throw new SmtpPermanentException(ex.Message, ex);
        }
        // Transient and SmtpPermanentException both propagate unchanged: the callers'
        // catch filters (SmtpPermanentException / SmtpErrorClassifier.IsTransient) handle them.
        finally
        {
            // NS-010: always tear the connection down, regardless of outcome. The
            // SMTP QUIT used to run only on the success path inside the try block,
            // so a failed Connect/Authenticate/Send left an open, authenticated
            // connection until finalization reclaimed the socket — exhausting the
            // server's connection slots under sustained transient failures.
            // Disconnect is best-effort: a disconnect failure (e.g. the connection
            // is already dead) must not mask the original delivery exception.
            try
            {
                await smtp.DisconnectAsync(cancellationToken);
            }
            catch (Exception disconnectEx)
            {
                _logger.LogDebug(
                    "Ignoring SMTP disconnect failure during cleanup: {Reason}", disconnectEx.Message);
            }
        }
    }
    finally
    {
        // NS-007: always release the concurrency slot, even on failure. This runs
        // after the client has been disconnected and disposed.
        limiter.Release();
    }
}
|
||||
}
|
||||
@@ -5,10 +5,10 @@ namespace ScadaLink.NotificationService;
|
||||
/// <c>ScadaLink:Notification</c> configuration section.
|
||||
///
|
||||
/// SMTP settings are primarily carried by the deployed <c>SmtpConfiguration</c>
|
||||
/// entity. NS-017: these values are the fallback used by
|
||||
/// <see cref="NotificationDeliveryService"/> when the corresponding
|
||||
/// <c>SmtpConfiguration</c> field is left unset (non-positive) on a partially
|
||||
/// deployed row — a value present on the row always takes precedence.
|
||||
/// entity. NS-017: these values are the fallback used by the central
|
||||
/// Notification Outbox's <c>EmailNotificationDeliveryAdapter</c> when the
|
||||
/// corresponding <c>SmtpConfiguration</c> field is left unset (non-positive) on a
|
||||
/// partially deployed row — a value present on the row always takes precedence.
|
||||
/// </summary>
|
||||
public class NotificationOptions
|
||||
{
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
|
||||
namespace ScadaLink.NotificationService;
|
||||
|
||||
public static class ServiceCollectionExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Registers the notification delivery services (SMTP, OAuth2 token, delivery adapter).
|
||||
/// Registers the shared SMTP delivery primitives consumed by the central Notification
|
||||
/// Outbox's <c>EmailNotificationDeliveryAdapter</c>: <see cref="NotificationOptions"/>,
|
||||
/// <see cref="OAuth2TokenService"/>, and the <see cref="ISmtpClientWrapper"/> factory.
|
||||
/// Central-only — sites no longer deliver notifications (see
|
||||
/// <c>Component-NotificationService.md</c>), and the orphaned site-shaped
|
||||
/// <c>NotificationDeliveryService</c> + <c>INotificationDeliveryService</c> contract
|
||||
/// was removed (NS-019). Notification dispatch lives in <c>ScadaLink.NotificationOutbox</c>.
|
||||
/// </summary>
|
||||
/// <param name="services">The service collection to register into.</param>
|
||||
public static IServiceCollection AddNotificationService(this IServiceCollection services)
|
||||
@@ -17,8 +22,6 @@ public static class ServiceCollectionExtensions
|
||||
services.AddHttpClient();
|
||||
services.AddSingleton<OAuth2TokenService>();
|
||||
services.AddSingleton<Func<ISmtpClientWrapper>>(_ => () => new MailKitSmtpClientWrapper());
|
||||
services.AddScoped<NotificationDeliveryService>();
|
||||
services.AddScoped<INotificationDeliveryService>(sp => sp.GetRequiredService<NotificationDeliveryService>());
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
@@ -26,10 +26,10 @@ public enum SmtpErrorClass
|
||||
/// the numeric <see cref="SmtpStatusCode"/> rather than locale-dependent substring
|
||||
/// matching on the exception message.
|
||||
/// <para>
|
||||
/// Public and shared: both <see cref="NotificationDeliveryService"/> (store-and-forward
|
||||
/// delivery) and the central Notification Outbox's <c>EmailNotificationDeliveryAdapter</c>
|
||||
/// route every SMTP failure through this single policy, so a transient/permanent
|
||||
/// boundary change cannot diverge between the two delivery paths.
|
||||
/// Public and shared: the central Notification Outbox's <c>EmailNotificationDeliveryAdapter</c>
|
||||
/// routes every SMTP failure through this single policy. (NS-019: the orphaned site-side
|
||||
/// <c>NotificationDeliveryService</c> that previously co-used this classifier was removed
|
||||
/// when sites stopped delivering notifications.)
|
||||
/// </para>
|
||||
/// </summary>
|
||||
public static class SmtpErrorClassifier
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
using System.Text.Json;
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ScadaLink.Commons.Messages.Notification;
|
||||
|
||||
namespace ScadaLink.StoreAndForward;
|
||||
@@ -31,6 +33,7 @@ public sealed class NotificationForwarder
|
||||
private readonly IActorRef _siteCommunicationActor;
|
||||
private readonly string _sourceSiteId;
|
||||
private readonly TimeSpan _forwardTimeout;
|
||||
private readonly ILogger<NotificationForwarder> _logger;
|
||||
|
||||
/// <param name="siteCommunicationActor">
|
||||
/// The site communication actor. It forwards a <see cref="NotificationSubmit"/> to
|
||||
@@ -42,14 +45,21 @@ public sealed class NotificationForwarder
|
||||
/// How long to wait for central's ack before treating the forward as a transient
|
||||
/// failure. Sourced from host configuration.
|
||||
/// </param>
|
||||
/// <param name="logger">
|
||||
/// Optional logger. StoreAndForward-018: a corrupt buffered payload is logged at
|
||||
/// Warning before being discarded so an operator has a forensic trail of the row
|
||||
/// that vanished from the buffer.
|
||||
/// </param>
|
||||
public NotificationForwarder(
|
||||
IActorRef siteCommunicationActor,
|
||||
string sourceSiteId,
|
||||
TimeSpan forwardTimeout)
|
||||
TimeSpan forwardTimeout,
|
||||
ILogger<NotificationForwarder>? logger = null)
|
||||
{
|
||||
_siteCommunicationActor = siteCommunicationActor;
|
||||
_sourceSiteId = sourceSiteId;
|
||||
_forwardTimeout = forwardTimeout;
|
||||
_logger = logger ?? NullLogger<NotificationForwarder>.Instance;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -61,11 +71,26 @@ public sealed class NotificationForwarder
|
||||
/// <param name="message">The buffered store-and-forward message to deliver to central.</param>
|
||||
public async Task<bool> DeliverAsync(StoreAndForwardMessage message)
|
||||
{
|
||||
// An unreadable payload cannot be fixed by retrying — park it (return false),
|
||||
// mirroring how the former SMTP handler treated a corrupt buffered payload.
|
||||
// StoreAndForward-018: an unreadable payload cannot be fixed by retrying.
|
||||
// The design doc explicitly forbids parking notifications ("notifications do
|
||||
// not park — they are retried at the fixed forward interval until central
|
||||
// acks"; Component-StoreAndForward.md). The earlier behaviour returned false
|
||||
// here, which the S&F engine interprets as a permanent failure and parks
|
||||
// the row — contradicting the invariant and surfacing the row in the
|
||||
// central UI's parked-message list. The correct outcome for a corrupt-payload
|
||||
// notification is to DISCARD: log a Warning with the buffered row id +
|
||||
// payload preview for forensics, then return true so the engine clears the
|
||||
// buffer via its standard success-path cleanup. The buffered row is
|
||||
// unrecoverable; retrying or parking would both make the queue worse, not
|
||||
// better.
|
||||
if (!TryBuildSubmit(message, out var submit))
|
||||
{
|
||||
return false;
|
||||
_logger.LogWarning(
|
||||
"Discarding corrupt buffered notification {NotificationId} (payload is not deserialisable as NotificationSubmit). " +
|
||||
"Payload preview: {PayloadPreview}",
|
||||
message.Id,
|
||||
PreviewPayload(message.PayloadJson));
|
||||
return true;
|
||||
}
|
||||
|
||||
// The reply may legitimately be a non-accepted ack, so it is not requested as
|
||||
@@ -140,6 +165,25 @@ public sealed class NotificationForwarder
|
||||
};
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
/// Maximum number of UTF-16 code units of a corrupt payload that are copied into
/// the discard log line.
/// </summary>
private const int CorruptPayloadPreviewMaxLength = 200;

/// <summary>
/// Returns a length-capped preview of a corrupt buffered payload for the Warning
/// log line emitted on discard. The full payload may be megabytes and is not
/// suitable for the structured log; the preview retains the leading characters,
/// which is what an operator typically uses to identify the producing script.
/// </summary>
/// <param name="payloadJson">The raw buffered payload; may be null or empty.</param>
/// <returns>
/// <c>&lt;empty&gt;</c> for a null or empty payload; the payload itself when it is
/// at most <see cref="CorruptPayloadPreviewMaxLength"/> characters long; otherwise
/// its leading characters followed by an ellipsis.
/// </returns>
private static string PreviewPayload(string? payloadJson)
{
    if (string.IsNullOrEmpty(payloadJson))
    {
        return "<empty>";
    }

    if (payloadJson.Length <= CorruptPayloadPreviewMaxLength)
    {
        return payloadJson;
    }

    // Do not cut a surrogate pair in half: if the last kept code unit is a high
    // surrogate its partner lies beyond the cap, and keeping it alone would put
    // an invalid UTF-16 sequence into the log. Drop it and keep one unit fewer.
    var cut = char.IsHighSurrogate(payloadJson[CorruptPayloadPreviewMaxLength - 1])
        ? CorruptPayloadPreviewMaxLength - 1
        : CorruptPayloadPreviewMaxLength;

    return payloadJson.Substring(0, cut) + "…";
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -111,12 +111,14 @@ public class DiffService
|
||||
a.CanonicalName == b.CanonicalName &&
|
||||
a.Value == b.Value &&
|
||||
a.DataType == b.DataType &&
|
||||
a.Description == b.Description &&
|
||||
a.IsLocked == b.IsLocked &&
|
||||
a.DataSourceReference == b.DataSourceReference &&
|
||||
a.BoundDataConnectionId == b.BoundDataConnectionId;
|
||||
|
||||
private static bool AlarmsEqual(ResolvedAlarm a, ResolvedAlarm b) =>
|
||||
a.CanonicalName == b.CanonicalName &&
|
||||
a.Description == b.Description &&
|
||||
a.PriorityLevel == b.PriorityLevel &&
|
||||
a.IsLocked == b.IsLocked &&
|
||||
a.TriggerType == b.TriggerType &&
|
||||
@@ -132,4 +134,27 @@ public class DiffService
|
||||
a.ParameterDefinitions == b.ParameterDefinitions &&
|
||||
a.ReturnDefinition == b.ReturnDefinition &&
|
||||
a.MinTimeBetweenRuns == b.MinTimeBetweenRuns;
|
||||
|
||||
/// <summary>
/// Determines whether two <see cref="ConnectionConfig"/> instances match on every
/// field that travels in the deployment package: protocol, primary configuration
/// JSON, backup configuration JSON, and failover retry count. Intended for callers
/// that need to detect connection-endpoint drift; the public diff shape currently
/// surfaces only attribute / alarm / script changes (TemplateEngine-018 tracks the
/// diff-shape extension that exposes added / removed / changed connections in the UI).
/// </summary>
/// <param name="a">First connection configuration.</param>
/// <param name="b">Second connection configuration.</param>
/// <returns>True when both configurations are equal.</returns>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public static bool ConnectionsEqual(ConnectionConfig a, ConnectionConfig b)
{
    ArgumentNullException.ThrowIfNull(a);
    ArgumentNullException.ThrowIfNull(b);

    // Bail out on the first field that differs.
    if (a.Protocol != b.Protocol)
    {
        return false;
    }

    if (a.ConfigurationJson != b.ConfigurationJson)
    {
        return false;
    }

    if (a.BackupConfigurationJson != b.BackupConfigurationJson)
    {
        return false;
    }

    return a.FailoverRetryCount == b.FailoverRetryCount;
}
|
||||
}
|
||||
|
||||
@@ -52,6 +52,7 @@ public class RevisionHashService
|
||||
CanonicalName = a.CanonicalName,
|
||||
Value = a.Value,
|
||||
DataType = a.DataType,
|
||||
Description = a.Description,
|
||||
IsLocked = a.IsLocked,
|
||||
DataSourceReference = a.DataSourceReference,
|
||||
BoundDataConnectionId = a.BoundDataConnectionId
|
||||
@@ -62,6 +63,7 @@ public class RevisionHashService
|
||||
.Select(a => new HashableAlarm
|
||||
{
|
||||
CanonicalName = a.CanonicalName,
|
||||
Description = a.Description,
|
||||
PriorityLevel = a.PriorityLevel,
|
||||
IsLocked = a.IsLocked,
|
||||
TriggerType = a.TriggerType,
|
||||
@@ -82,7 +84,20 @@ public class RevisionHashService
|
||||
ReturnDefinition = s.ReturnDefinition,
|
||||
MinTimeBetweenRunsTicks = s.MinTimeBetweenRuns?.Ticks
|
||||
})
|
||||
.ToList()
|
||||
.ToList(),
|
||||
Connections = configuration.Connections is { Count: > 0 }
|
||||
? new SortedDictionary<string, HashableConnection>(
|
||||
configuration.Connections.ToDictionary(
|
||||
kvp => kvp.Key,
|
||||
kvp => new HashableConnection
|
||||
{
|
||||
BackupConfigurationJson = kvp.Value.BackupConfigurationJson,
|
||||
ConfigurationJson = kvp.Value.ConfigurationJson,
|
||||
FailoverRetryCount = kvp.Value.FailoverRetryCount,
|
||||
Protocol = kvp.Value.Protocol
|
||||
}),
|
||||
StringComparer.Ordinal)
|
||||
: null
|
||||
};
|
||||
|
||||
var json = JsonSerializer.Serialize(hashInput, CanonicalJsonOptions);
|
||||
@@ -108,6 +123,12 @@ public class RevisionHashService
|
||||
/// </summary>
|
||||
public List<HashableAttribute> Attributes { get; init; } = [];
|
||||
/// <summary>
|
||||
/// Data connection configurations keyed by connection name. Sorted by key
|
||||
/// (ordinal) to keep serialization deterministic. Null when the deployment
|
||||
/// package carries no connections.
|
||||
/// </summary>
|
||||
public SortedDictionary<string, HashableConnection>? Connections { get; init; }
|
||||
/// <summary>
|
||||
/// The unique instance name.
|
||||
/// </summary>
|
||||
public string InstanceUniqueName { get; init; } = string.Empty;
|
||||
@@ -144,6 +165,11 @@ public class RevisionHashService
|
||||
/// </summary>
|
||||
public string DataType { get; init; } = string.Empty;
|
||||
/// <summary>
|
||||
/// The attribute description (authoring-time documentation that still
|
||||
/// travels with the deployed payload).
|
||||
/// </summary>
|
||||
public string? Description { get; init; }
|
||||
/// <summary>
|
||||
/// Whether the attribute is locked.
|
||||
/// </summary>
|
||||
public bool IsLocked { get; init; }
|
||||
@@ -160,6 +186,11 @@ public class RevisionHashService
|
||||
/// </summary>
|
||||
public string CanonicalName { get; init; } = string.Empty;
|
||||
/// <summary>
|
||||
/// The alarm description (authoring-time documentation that still
|
||||
/// travels with the deployed payload).
|
||||
/// </summary>
|
||||
public string? Description { get; init; }
|
||||
/// <summary>
|
||||
/// Whether the alarm is locked.
|
||||
/// </summary>
|
||||
public bool IsLocked { get; init; }
|
||||
@@ -181,6 +212,26 @@ public class RevisionHashService
|
||||
public string TriggerType { get; init; } = string.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
/// Projection of one data connection that is folded into the revision hash
/// input: protocol, primary and backup configuration JSON, and failover retry
/// count.
/// </summary>
// NOTE(review): members are declared alphabetically; presumably their order
// feeds the canonical JSON that is hashed — confirm before reordering.
private sealed record HashableConnection
|
||||
{
|
||||
/// <summary>
|
||||
/// Backup connection configuration JSON, if any.
|
||||
/// </summary>
|
||||
public string? BackupConfigurationJson { get; init; }
|
||||
/// <summary>
|
||||
/// Primary connection configuration JSON.
|
||||
/// </summary>
|
||||
public string? ConfigurationJson { get; init; }
|
||||
/// <summary>
|
||||
/// Number of failover retries before giving up.
|
||||
/// </summary>
|
||||
public int FailoverRetryCount { get; init; }
|
||||
/// <summary>
|
||||
/// Protocol name (e.g. "OpcUa").
|
||||
/// </summary>
|
||||
public string Protocol { get; init; } = string.Empty;
|
||||
}
|
||||
|
||||
private sealed record HashableScript
|
||||
{
|
||||
/// <summary>
|
||||
|
||||
Reference in New Issue
Block a user