From 291274ae762f6b70521abbf2aca734f4692f5e8e Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Thu, 28 May 2026 03:54:43 -0400 Subject: [PATCH] fix(notifications): close OAuth2 SMTP + dispatcher resilience gaps (5 findings) NS-021/NO-001: thread FromAddress into XOAUTH2 so M365 stops rejecting sends with 535 5.7.3. Added an additive oauth2UserName parameter on ISmtpClientWrapper.AuthenticateAsync; both NotificationService and NotificationOutbox now pass config.FromAddress. NO-002: clamp non-positive SmtpConfiguration.MaxRetries/RetryDelay to the 1-min / 10-attempt fallback with a Warning so a misconfigured row no longer parks transient failures on the first attempt or burn-loops. NO-003: route a lifecycle-scoped CancellationToken from the NotificationOutboxActor through the dispatch sweep into the adapter so in-flight SMTP sends abort on PostStop instead of blocking CoordinatedShutdown for the full SMTP timeout per row. NO-004: await the central audit writer inside the existing try/catch instead of fire-and-forget so the audit task can't outlive the per-sweep DI scope and writer faults reach the operator log instead of being silently dropped. Two AuditLog integration tests seeded RetryDelay = TimeSpan.Zero to force immediate re-claim on the second tick; updated them to 1 ms so they keep the same intent without tripping the NO-002 clamp. --- code-reviews/NotificationOutbox/findings.md | 8 +- code-reviews/NotificationService/findings.md | 2 +- code-reviews/README.md | 19 +-- .../EmailNotificationDeliveryAdapter.cs | 8 +- .../NotificationOutboxActor.cs | 154 +++++++++++++++--- .../ISmtpClientWrapper.cs | 13 +- .../MailKitSmtpClientWrapper.cs | 23 ++- .../NotificationDeliveryService.cs | 8 +- .../AuditWriteFailureSafetyTests.cs | 4 +- .../NotifyDispatcherAuditTrailTests.cs | 11 +- .../NotificationOutboxActorDispatchTests.cs | 125 ++++++++++++++ .../MailKitSmtpClientWrapperTests.cs | 36 ++++ .../NotificationDeliveryServiceTests.cs | 20 ++- 13 files changed, 370 insertions(+), 61 deletions(-) diff --git a/code-reviews/NotificationOutbox/findings.md b/code-reviews/NotificationOutbox/findings.md index 5fb6c48d..75e2951a 100644 --- a/code-reviews/NotificationOutbox/findings.md +++ b/code-reviews/NotificationOutbox/findings.md @@ -62,7 +62,7 @@ from `Component-NotificationOutbox.md`. No Critical findings; two High, six Medi |--|--| | Severity | High | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.NotificationOutbox/Delivery/EmailNotificationDeliveryAdapter.cs:185-191` (calls `smtp.AuthenticateAsync("oauth2", token)`); root cause in `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:76-79` | **Description** @@ -105,7 +105,7 @@ _Unresolved._ |--|--| | Severity | High | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs:348-360` | **Description** @@ -145,7 +145,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs:334`, `src/ScadaLink.NotificationOutbox/Delivery/INotificationDeliveryAdapter.cs:22` | **Description** @@ -185,7 +185,7 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Akka.NET conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs:425-435`, `463-485` | **Description** diff --git a/code-reviews/NotificationService/findings.md b/code-reviews/NotificationService/findings.md index 407e7e30..3611ed93 100644 --- a/code-reviews/NotificationService/findings.md +++ b/code-reviews/NotificationService/findings.md @@ -702,7 +702,7 @@ Mark the NS-001 resolution note in this file as **superseded by NS-019** with a |--|--| | Severity | High | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs:76-79` | **Description** diff --git a/code-reviews/README.md b/code-reviews/README.md index 60db5aa5..b8a3989c 100644 --- a/code-reviews/README.md +++ b/code-reviews/README.md @@ -40,10 +40,10 @@ module file and counted in **Total**. | Severity | Open findings | |----------|---------------| | Critical | 0 | -| High | 16 | -| Medium | 58 | +| High | 13 | +| Medium | 56 | | Low | 90 | -| **Total** | **164** | +| **Total** | **159** | ## Module Status @@ -63,8 +63,8 @@ module file and counted in **Total**. | [Host](Host/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/5 | 7 | 22 | | [InboundAPI](InboundAPI/findings.md) | 2026-05-28 | `1eb6e97` | 0/1/3/4 | 8 | 25 | | [ManagementService](ManagementService/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/2 | 4 | 23 | -| [NotificationOutbox](NotificationOutbox/findings.md) | 2026-05-28 | `1eb6e97` | 0/2/5/3 | 10 | 10 | -| [NotificationService](NotificationService/findings.md) | 2026-05-28 | `1eb6e97` | 0/2/2/3 | 7 | 25 | +| [NotificationOutbox](NotificationOutbox/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/3/3 | 6 | 10 | +| [NotificationService](NotificationService/findings.md) | 2026-05-28 | `1eb6e97` | 0/1/2/3 | 6 | 25 | | [Security](Security/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/0/2 | 2 | 21 | | [SiteCallAudit](SiteCallAudit/findings.md) | 2026-05-28 | `1eb6e97` | 0/0/2/4 | 6 | 6 | | [SiteEventLogging](SiteEventLogging/findings.md) | 2026-05-28 | `1eb6e97` | 0/1/2/6 | 9 | 23 | @@ -84,7 +84,7 @@ description, location, recommendation — lives in the module's `findings.md`. _None open._ -### High (16) +### High (13) | ID | Module | Title | |----|--------|-------| @@ -94,10 +94,7 @@ _None open._ | DeploymentManager-018 | [DeploymentManager](DeploymentManager/findings.md) | Reconciliation force-sets `Enabled`, overwriting an intentional `Disabled` after central failover | | ExternalSystemGateway-018 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `DeliverBufferedAsync` lets `JsonException` propagate, turning a corrupt buffered row into a permanent retry-forever poison message | | InboundAPI-022 | [InboundAPI](InboundAPI/findings.md) | `IActiveNodeGate` has no production registration in Host — standby-node gating is silently disabled in production | -| NotificationOutbox-001 | [NotificationOutbox](NotificationOutbox/findings.md) | `EmailNotificationDeliveryAdapter` inherits the OAuth2 empty-user SASL bug (NS-021) on the M365 send path | -| NotificationOutbox-002 | [NotificationOutbox](NotificationOutbox/findings.md) | Dispatcher parks on first transient failure when `SmtpConfiguration.MaxRetries == 0` | | NotificationService-019 | [NotificationService](NotificationService/findings.md) | `NotificationDeliveryService` and `INotificationDeliveryService` are orphaned by the central-only redesign | -| NotificationService-021 | [NotificationService](NotificationService/findings.md) | OAuth2 SASL constructed with empty user identifier; M365 SMTP will reject the auth handshake | | SiteEventLogging-016 | [SiteEventLogging](SiteEventLogging/findings.md) | `From`/`To` filters compare non-normalised ISO 8601 strings against UTC-stored timestamps | | StoreAndForward-018 | [StoreAndForward](StoreAndForward/findings.md) | Notification corrupt-payload parks the buffered message, contradicting the "notifications do not park" design invariant | | TemplateEngine-017 | [TemplateEngine](TemplateEngine/findings.md) | Revision hash and diff both ignore `Description` and `Connections`, defeating staleness detection for real deployment changes | @@ -105,7 +102,7 @@ _None open._ | Transport-002 | [Transport](Transport/findings.md) | ExternalSystem Overwrite never syncs methods | | Transport-003 | [Transport](Transport/findings.md) | Unlock lockout is enforced only client-side; server session is never marked Locked | -### Medium (58) +### Medium (56) | ID | Module | Title | |----|--------|-------| @@ -140,8 +137,6 @@ _None open._ | InboundAPI-025 | [InboundAPI](InboundAPI/findings.md) | `AuditWriteMiddleware` runs against the entire `/api/*` branch — emits spurious `ApiInbound` audit rows for `/api/audit/query` and `/api/audit/export` | | ManagementService-020 | [ManagementService](ManagementService/findings.md) | UpdateSmtpConfig returns and audits the SMTP Credentials field verbatim | | ManagementService-021 | [ManagementService](ManagementService/findings.md) | Transport bundle handlers have zero test coverage | -| NotificationOutbox-003 | [NotificationOutbox](NotificationOutbox/findings.md) | Dispatcher does not propagate a `CancellationToken` into delivery; in-flight SMTP sends cannot be cancelled on shutdown | -| NotificationOutbox-004 | [NotificationOutbox](NotificationOutbox/findings.md) | `EmitAttemptAudit`/`EmitTerminalAudit` fire-and-forget pattern can outlive the per-sweep DI scope | | NotificationOutbox-005 | [NotificationOutbox](NotificationOutbox/findings.md) | Ingest persistence inherits the CD-015 check-then-act race; under contention the second writer throws and the site retries | | NotificationOutbox-007 | [NotificationOutbox](NotificationOutbox/findings.md) | `NotificationOutboxOptions.DispatchBatchSize`, `DeliveredKpiWindow`, and `PurgeInterval` are not in the design document | | NotificationOutbox-010 | [NotificationOutbox](NotificationOutbox/findings.md) | Comment claims `PipeTo` is not used "because the writer never throws"; the surrounding try/catch is dead-letter for the documented failure mode | diff --git a/src/ScadaLink.NotificationOutbox/Delivery/EmailNotificationDeliveryAdapter.cs b/src/ScadaLink.NotificationOutbox/Delivery/EmailNotificationDeliveryAdapter.cs index 7b359a99..ef351b76 100644 --- a/src/ScadaLink.NotificationOutbox/Delivery/EmailNotificationDeliveryAdapter.cs +++ b/src/ScadaLink.NotificationOutbox/Delivery/EmailNotificationDeliveryAdapter.cs @@ -188,7 +188,13 @@ public sealed class EmailNotificationDeliveryAdapter : INotificationDeliveryAdap credentials = await _tokenService.GetTokenAsync(credentials, cancellationToken); } - await smtp.AuthenticateAsync(config.AuthType, credentials, cancellationToken); + // NO-001/NS-021: OAuth2 XOAUTH2 requires the user identity (FromAddress) + // to be sent alongside the access token; an empty user is rejected by M365. + await smtp.AuthenticateAsync( + config.AuthType, + credentials, + oauth2UserName: config.FromAddress, + cancellationToken: cancellationToken); await smtp.SendAsync(config.FromAddress, bccAddresses, subject, body, cancellationToken); } catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) diff --git a/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs b/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs index 0ffbcb87..be2429c5 100644 --- a/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs +++ b/src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs @@ -49,6 +49,14 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers /// private bool _dispatching; + /// + /// NO-003: lifecycle-scoped cancellation source, cancelled in so + /// any in-flight dispatch sweep — including a long-running SMTP send via the channel + /// adapter — observes shutdown promptly instead of blocking CoordinatedShutdown + /// for the full SMTP connect/auth/send timeout per in-progress notification. + /// + private CancellationTokenSource? _shutdownCts; + /// Akka timer scheduler, assigned by the actor system via . public ITimerScheduler Timers { get; set; } = null!; @@ -91,12 +99,35 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers protected override void PreStart() { base.PreStart(); + // NO-003: shutdown token is alive for the lifetime of the actor; cancelled in PostStop + // so dispatcher sweeps and the SMTP send beneath them observe coordinated shutdown. + _shutdownCts = new CancellationTokenSource(); Timers.StartPeriodicTimer( DispatchTimerKey, InternalMessages.DispatchTick.Instance, _options.DispatchInterval); Timers.StartPeriodicTimer( PurgeTimerKey, InternalMessages.PurgeTick.Instance, _options.PurgeInterval); } + /// + protected override void PostStop() + { + // NO-003: cancel the shutdown token first so the in-flight sweep's adapter call + // observes cancellation, then dispose the source. Order matters — disposing first + // would race with an in-flight sweep registering with the token. + try + { + _shutdownCts?.Cancel(); + } + catch (ObjectDisposedException) + { + // Already disposed under a restarted-actor race; nothing to do. + } + + _shutdownCts?.Dispose(); + _shutdownCts = null; + base.PostStop(); + } + /// /// Maps an inbound onto a , /// persists it idempotently, and pipes the outcome back to so the @@ -167,11 +198,15 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers _dispatching = true; var now = DateTimeOffset.UtcNow; + // NO-003: hand the lifecycle token to the sweep so cancellation reaches the + // adapter. A null token (very early start / post-stop race) is replaced with + // None — no behaviour change vs. the pre-NO-003 dispatcher. + var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None; // RunDispatchPass swallows its own errors, but the failure projection is kept as a // belt-and-braces guard so even a faulted task still lowers the in-flight guard — // otherwise the dispatcher would wedge permanently. - RunDispatchPass(now).PipeTo( + RunDispatchPass(now, cancellationToken).PipeTo( Self, success: () => InternalMessages.DispatchComplete.Instance, failure: ex => @@ -194,7 +229,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers /// directly, so a long-lived adapter reference on /// this singleton actor would be a captive dependency over a disposed DbContext. /// - private async Task RunDispatchPass(DateTimeOffset now) + private async Task RunDispatchPass(DateTimeOffset now, CancellationToken cancellationToken) { try { @@ -206,7 +241,13 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers IReadOnlyList due; try { - due = await outboxRepository.GetDueAsync(now, _options.DispatchBatchSize); + due = await outboxRepository.GetDueAsync(now, _options.DispatchBatchSize, cancellationToken); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // NO-003: shutdown cancelled the claim; row stays Pending and the next active + // node picks it up. Not a failure. + return; } catch (Exception ex) { @@ -223,9 +264,23 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers foreach (var notification in due) { + // NO-003: between deliveries, observe shutdown so we don't kick off a fresh + // SMTP send when the actor is already tearing down. + if (cancellationToken.IsCancellationRequested) + { + return; + } + try { - await DeliverOneAsync(notification, now, maxRetries, retryDelay, outboxRepository, adapters); + await DeliverOneAsync( + notification, now, maxRetries, retryDelay, outboxRepository, adapters, cancellationToken); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // NO-003: in-flight delivery interrupted by shutdown. Row remains in its + // pre-attempt state; next active sweep retries. + return; } catch (Exception ex) { @@ -248,14 +303,48 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers /// configuration exists, falls back to a conservative default — delivery itself will /// permanently fail in that case, so the policy only acts as a guard. /// + /// + /// NO-002: a non-positive (zero or negative) + /// would otherwise satisfy RetryCount >= maxRetries on the very first transient + /// failure and park the row without a single retry — silently halving the outbox's + /// delivery guarantees. The same applies to a non-positive RetryDelay, which + /// would burn-loop the dispatcher. Both values are clamped to the fallback constants + /// with a Warning so an operator can spot the misconfiguration in logs. + /// private async Task<(int MaxRetries, TimeSpan RetryDelay)> ResolveRetryPolicyAsync( INotificationRepository notificationRepository) { var configurations = await notificationRepository.GetAllSmtpConfigurationsAsync(); var configuration = configurations.Count > 0 ? configurations[0] : null; - return configuration is null - ? (FallbackMaxRetries, FallbackRetryDelay) - : (configuration.MaxRetries, configuration.RetryDelay); + if (configuration is null) + { + return (FallbackMaxRetries, FallbackRetryDelay); + } + + var maxRetries = configuration.MaxRetries; + var retryDelay = configuration.RetryDelay; + + if (maxRetries <= 0) + { + _logger.LogWarning( + "SmtpConfiguration.MaxRetries={ConfiguredMaxRetries} is non-positive; " + + "clamping to FallbackMaxRetries={FallbackMaxRetries} so transient failures " + + "actually retry before parking.", + maxRetries, FallbackMaxRetries); + maxRetries = FallbackMaxRetries; + } + + if (retryDelay <= TimeSpan.Zero) + { + _logger.LogWarning( + "SmtpConfiguration.RetryDelay={ConfiguredRetryDelay} is non-positive; " + + "clamping to FallbackRetryDelay={FallbackRetryDelay} so the dispatcher does " + + "not burn-loop on transient failures.", + retryDelay, FallbackRetryDelay); + retryDelay = FallbackRetryDelay; + } + + return (maxRetries, retryDelay); } /// @@ -307,7 +396,8 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers int maxRetries, TimeSpan retryDelay, INotificationOutboxRepository outboxRepository, - IReadOnlyDictionary adapters) + IReadOnlyDictionary adapters, + CancellationToken cancellationToken) { if (!adapters.TryGetValue(notification.Type, out var adapter)) { @@ -318,20 +408,22 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers notification.Status = NotificationStatus.Parked; notification.LastError = missingAdapterError; notification.LastAttemptAt = now; - await outboxRepository.UpdateAsync(notification); - EmitAttemptAudit( + await outboxRepository.UpdateAsync(notification, cancellationToken); + await EmitAttemptAuditAsync( notification, now, durationMs: 0, errorMessage: missingAdapterError); - EmitTerminalAudit(notification, now, errorMessage: missingAdapterError); + await EmitTerminalAuditAsync(notification, now, errorMessage: missingAdapterError); return; } // Measure the attempt duration around the adapter call so the // Attempted row carries it for KPI use. + // NO-003: pass the lifecycle token so a coordinated shutdown promptly cancels the + // in-flight SMTP send instead of waiting for the SMTP connect/auth/send timeout. var attemptStart = DateTimeOffset.UtcNow; - var outcome = await adapter.DeliverAsync(notification); + var outcome = await adapter.DeliverAsync(notification, cancellationToken); var durationMs = (int)Math.Min( int.MaxValue, Math.Max(0, (DateTimeOffset.UtcNow - attemptStart).TotalMilliseconds)); @@ -367,13 +459,13 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers break; } - await outboxRepository.UpdateAsync(notification); + await outboxRepository.UpdateAsync(notification, cancellationToken); // Emit the per-attempt Attempted row exactly once regardless of the // outcome (B2). The error message comes from the outcome, not from // notification.LastError, so a success row is null and a transient // row carries the SMTP failure reason verbatim. - EmitAttemptAudit( + await EmitAttemptAuditAsync( notification, now, durationMs: durationMs, @@ -386,7 +478,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers // reason so downstream consumers can link Attempted+Parked rows. if (IsTerminal(notification.Status)) { - EmitTerminalAudit( + await EmitTerminalAuditAsync( notification, now, errorMessage: outcome.Result == DeliveryResult.Success ? null : outcome.Error); @@ -412,9 +504,15 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers /// / /// audit row carrying the terminal status (Delivered, Parked, or /// Discarded) of . Wrapped in try/catch - /// for the same defensive reason as . + /// for the same defensive reason as . /// - private void EmitTerminalAudit( + /// + /// NO-004: is awaited inside the + /// try/catch so the catch is actually reachable for writer faults and so the + /// audit task does not outlive the per-sweep DI scope. The audit-write-never- + /// affects-delivery invariant is preserved by the surrounding catch. + /// + private async Task EmitTerminalAuditAsync( Notification notification, DateTimeOffset now, string? errorMessage) @@ -423,7 +521,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers { var terminalStatus = MapNotificationStatusToAuditStatus(notification.Status); var evt = BuildNotifyDeliverEvent(notification, now, terminalStatus, errorMessage); - _ = _auditWriter.WriteAsync(evt); + await _auditWriter.WriteAsync(evt); } catch (Exception ex) { @@ -457,10 +555,17 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers /// / /// audit row with . Wrapped in /// try/catch so an audit-write failure never propagates back into the - /// dispatcher loop — the already - /// swallows, this is defensive (alog.md §13). + /// dispatcher loop (alog.md §13). /// - private void EmitAttemptAudit( + /// + /// NO-004: previously the writer task was discarded (_ = WriteAsync(...)), + /// which made the surrounding catch unreachable for any fault originating in the + /// awaited body of WriteAsync and let the audit task outlive the dispatcher's + /// per-sweep DI scope. The task is now awaited inside the try/catch: the + /// audit-write-never-affects-delivery invariant is preserved by the catch, and + /// writer faults reach the operator log instead of being silently lost. + /// + private async Task EmitAttemptAuditAsync( Notification notification, DateTimeOffset now, int durationMs, @@ -470,10 +575,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers { var evt = BuildNotifyDeliverEvent(notification, now, AuditStatus.Attempted, errorMessage) with { DurationMs = durationMs }; - // Fire-and-forget — we do NOT await: the dispatcher loop must not - // be blocked by audit IO, and the writer swallows its own faults. - // PipeTo is not used because the writer never throws. - _ = _auditWriter.WriteAsync(evt); + await _auditWriter.WriteAsync(evt); } catch (Exception ex) { @@ -845,7 +947,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers // Emit a Discarded NotifyDeliver row to match the dispatcher's // Delivered/Parked emissions; the row carries no error message because // the discard is an operator-driven cancellation, not a delivery error. - EmitTerminalAudit(notification, DateTimeOffset.UtcNow, errorMessage: null); + await EmitTerminalAuditAsync(notification, DateTimeOffset.UtcNow, errorMessage: null); return new DiscardNotificationResponse(request.CorrelationId, Success: true, ErrorMessage: null); } diff --git a/src/ScadaLink.NotificationService/ISmtpClientWrapper.cs b/src/ScadaLink.NotificationService/ISmtpClientWrapper.cs index acf0d3c0..14cff028 100644 --- a/src/ScadaLink.NotificationService/ISmtpClientWrapper.cs +++ b/src/ScadaLink.NotificationService/ISmtpClientWrapper.cs @@ -29,8 +29,19 @@ public interface ISmtpClientWrapper /// Authenticates to the SMTP server using the specified auth type and credentials. /// Authentication mechanism (e.g. PLAIN, XOAUTH2). /// Credential string appropriate for the auth type, or null. + /// + /// NS-021: mailbox identity the OAuth2 access token was issued for (typically + /// the SMTP FromAddress). Used as the user= field of the XOAUTH2 + /// SASL initial response — M365 rejects an empty/mismatched user with + /// 535 5.7.3. Ignored for non-OAuth2 auth types; default null for + /// callers that do not authenticate with OAuth2. + /// /// Cancellation token. - Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default); + Task AuthenticateAsync( + string authType, + string? credentials, + string? oauth2UserName = null, + CancellationToken cancellationToken = default); /// Sends an email message with the specified recipients via BCC. /// Sender address. /// Recipients delivered as BCC. diff --git a/src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs b/src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs index eaf1e3c3..74aa782b 100644 --- a/src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs +++ b/src/ScadaLink.NotificationService/MailKitSmtpClientWrapper.cs @@ -45,7 +45,11 @@ public class MailKitSmtpClientWrapper : ISmtpClientWrapper, IDisposable } /// - public async Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default) + public async Task AuthenticateAsync( + string authType, + string? credentials, + string? oauth2UserName = null, + CancellationToken cancellationToken = default) { // NS-016: missing/unparseable credentials and an unrecognised auth type used // to make this method silently return and the connection then sent mail @@ -74,8 +78,21 @@ public class MailKitSmtpClientWrapper : ISmtpClientWrapper, IDisposable break; case "oauth2": - // OAuth2 token is passed directly as credentials (pre-fetched by token service) - var oauth2 = new SaslMechanismOAuth2("", credentials); + // NS-021: the XOAUTH2 SASL initial response embeds a `user=` + // field that M365 (and most OAuth2-enabled SMTP relays) require to + // match the mailbox identity the token was issued for. An empty user + // gets rejected with `535 5.7.3`. The token (credentials) is + // pre-fetched by OAuth2TokenService; the user identity is the SMTP + // From address, threaded through `oauth2UserName`. + if (string.IsNullOrEmpty(oauth2UserName)) + { + throw new SmtpPermanentException( + "OAuth2 SMTP auth requires a non-empty user identity " + + "(mailbox the access token was issued for); " + + "the caller did not pass an oauth2UserName."); + } + + var oauth2 = new SaslMechanismOAuth2(oauth2UserName, credentials); await _client.AuthenticateAsync(oauth2, cancellationToken); break; diff --git a/src/ScadaLink.NotificationService/NotificationDeliveryService.cs b/src/ScadaLink.NotificationService/NotificationDeliveryService.cs index 3d177f0a..f11f881f 100644 --- a/src/ScadaLink.NotificationService/NotificationDeliveryService.cs +++ b/src/ScadaLink.NotificationService/NotificationDeliveryService.cs @@ -397,7 +397,13 @@ public class NotificationDeliveryService : INotificationDeliveryService, IDispos credentials = token; } - await smtp.AuthenticateAsync(config.AuthType, credentials, cancellationToken); + // NS-021: OAuth2 XOAUTH2 requires the user identity (FromAddress) to be + // sent alongside the access token; an empty user is rejected by M365. + await smtp.AuthenticateAsync( + config.AuthType, + credentials, + oauth2UserName: config.FromAddress, + cancellationToken: cancellationToken); var bccAddresses = recipients.Select(r => r.EmailAddress).ToList(); await smtp.SendAsync(config.FromAddress, bccAddresses, subject, body, cancellationToken); diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/AuditWriteFailureSafetyTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/AuditWriteFailureSafetyTests.cs index 9399fb78..7f53300d 100644 --- a/tests/ScadaLink.AuditLog.Tests/Integration/AuditWriteFailureSafetyTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Integration/AuditWriteFailureSafetyTests.cs @@ -365,11 +365,13 @@ public class AuditWriteFailureSafetyTests : TestKit, IClassFixture /// Inserts a single SMTP configuration row so the dispatcher's /// ResolveRetryPolicyAsync sees a real (maxRetries, retryDelay) - /// pair rather than the conservative fallback. RetryDelay of 0 means a - /// transient outcome's NextAttemptAt is immediately due — useful so - /// the SECOND DispatchTick re-claims the row without waiting. + /// pair rather than the conservative fallback. A tiny positive RetryDelay + /// means a transient outcome's NextAttemptAt is immediately due — + /// useful so the SECOND DispatchTick re-claims the row without waiting. + /// NO-002: the dispatcher now clamps a non-positive RetryDelay to the + /// 1-minute fallback to avoid burn-looping on transient failures, so this + /// must be a strictly positive value (1 ms is fine for tests). /// private async Task SeedSmtpConfigAsync(int maxRetries = 5) { @@ -141,7 +144,7 @@ public class NotifyDispatcherAuditTrailTests : TestKit, IClassFixture(), Arg.Any(), Arg.Any())); } + [Fact] + public void TransientFailure_WithZeroMaxRetries_RetriesUsingFallback_DoesNotParkImmediately() + { + // NO-002: SmtpConfiguration.MaxRetries=0 used to satisfy 1 >= 0 on the very first + // transient failure and park the row without a single retry. ResolveRetryPolicyAsync + // now clamps non-positive MaxRetries to the FallbackMaxRetries (10) so transient + // failures actually retry before parking. + SetupSmtpRetryPolicy(maxRetries: 0, retryDelay: TimeSpan.FromMinutes(1)); + var notification = MakeNotification(retryCount: 0); + _outboxRepository.GetDueAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(new[] { notification }); + var adapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout")); + var actor = CreateActor([adapter]); + + actor.Tell(InternalMessages.DispatchTick.Instance); + + AwaitAssert(() => + { + _outboxRepository.Received(1).UpdateAsync( + Arg.Is(n => + n.Status == NotificationStatus.Retrying && + n.RetryCount == 1 && + n.NextAttemptAt != null && + n.LastError == "smtp timeout"), + Arg.Any()); + }); + } + + [Fact] + public void TransientFailure_WithNegativeMaxRetries_RetriesUsingFallback_DoesNotParkImmediately() + { + // NO-002: a negative MaxRetries reaches ResolveRetryPolicyAsync just as -1 — same + // park-immediately bug. Clamp to FallbackMaxRetries. + SetupSmtpRetryPolicy(maxRetries: -1, retryDelay: TimeSpan.FromMinutes(1)); + var notification = MakeNotification(retryCount: 0); + _outboxRepository.GetDueAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(new[] { notification }); + var adapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout")); + var actor = CreateActor([adapter]); + + actor.Tell(InternalMessages.DispatchTick.Instance); + + AwaitAssert(() => + { + _outboxRepository.Received(1).UpdateAsync( + Arg.Is(n => + n.Status == NotificationStatus.Retrying && + n.RetryCount == 1 && + n.NextAttemptAt != null && + n.LastError == "smtp timeout"), + Arg.Any()); + }); + } + + [Fact] + public void TransientFailure_WithNonPositiveRetryDelay_UsesFallbackDelay_NotZero() + { + // NO-002: a non-positive RetryDelay would burn-loop the dispatcher because + // NextAttemptAt would equal now. Clamp to FallbackRetryDelay (1 min) so the + // schedule actually advances. + SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.Zero); + var before = DateTimeOffset.UtcNow; + var notification = MakeNotification(retryCount: 0); + _outboxRepository.GetDueAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(new[] { notification }); + var adapter = new StubAdapter(() => DeliveryOutcome.Transient("smtp timeout")); + var actor = CreateActor([adapter]); + + actor.Tell(InternalMessages.DispatchTick.Instance); + + AwaitAssert(() => + { + _outboxRepository.Received(1).UpdateAsync( + Arg.Is(n => + n.Status == NotificationStatus.Retrying && + n.NextAttemptAt != null && + n.NextAttemptAt > before + TimeSpan.FromSeconds(30)), + Arg.Any()); + }); + } + + [Fact] + public void PostStop_CancelsInFlightDelivery_LeavesRowNonTerminal() + { + // NO-003: the dispatcher used to drop the CancellationToken on its way into + // the channel adapter, so a coordinated shutdown had to wait the full SMTP + // connect/auth/send timeout per in-flight notification before the sweep + // finished. The actor now passes a lifecycle-scoped token; cancelling it on + // PostStop must abort the in-flight Task.Delay (standing in for an SMTP + // send) and the row must NOT be updated to a terminal state — the next + // active node picks it back up. + SetupSmtpRetryPolicy(maxRetries: 5, retryDelay: TimeSpan.FromMinutes(1)); + var notification = MakeNotification(); + _outboxRepository.GetDueAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(new[] { notification }); + // Long delay simulates a slow SMTP send; the test triggers PostStop before + // the delay would naturally elapse, so the only way the delay completes is + // if the token wired through. + var adapter = new StubAdapter( + () => DeliveryOutcome.Success("ops@example.com"), + delay: TimeSpan.FromSeconds(30)); + var actor = CreateActor([adapter]); + + actor.Tell(InternalMessages.DispatchTick.Instance); + // Wait until the adapter is actually in flight before stopping. + AwaitAssert(() => Assert.Equal(1, adapter.CallCount)); + + var start = DateTimeOffset.UtcNow; + Sys.Stop(actor); + + // The sweep should observe cancellation promptly (well under the 30s delay). + AwaitAssert( + () => + { + // No UpdateAsync was issued — the row is untouched and will be re-claimed + // by the next active node. + _outboxRepository.DidNotReceive().UpdateAsync( + Arg.Any(), Arg.Any()); + }, + duration: TimeSpan.FromSeconds(5)); + + Assert.True(DateTimeOffset.UtcNow - start < TimeSpan.FromSeconds(5), + "PostStop did not cancel the in-flight delivery promptly."); + } + [Fact] public void OverlappingTicks_WhileDispatchInFlight_DoNotClaimConcurrently() { diff --git a/tests/ScadaLink.NotificationService.Tests/MailKitSmtpClientWrapperTests.cs b/tests/ScadaLink.NotificationService.Tests/MailKitSmtpClientWrapperTests.cs index 0ee92084..df6e3a1a 100644 --- a/tests/ScadaLink.NotificationService.Tests/MailKitSmtpClientWrapperTests.cs +++ b/tests/ScadaLink.NotificationService.Tests/MailKitSmtpClientWrapperTests.cs @@ -1,3 +1,6 @@ +using System.Text; +using MailKit.Security; + namespace ScadaLink.NotificationService.Tests; /// @@ -5,6 +8,8 @@ namespace ScadaLink.NotificationService.Tests; /// silently skip authentication for a misconfigured SMTP config — a missing /// credential, an unrecognised auth type, or an unparseable Basic credential /// must be a hard, surfaced error rather than an unauthenticated send. +/// NS-021: the OAuth2 (XOAUTH2) branch must carry a non-empty user identity +/// (the SMTP From address) — an empty user is rejected by M365 with `535 5.7.3`. /// public class MailKitSmtpClientWrapperTests { @@ -42,4 +47,35 @@ public class MailKitSmtpClientWrapperTests await Assert.ThrowsAsync( () => wrapper.AuthenticateAsync("basic", "nocolon")); } + + [Fact] + public async Task Authenticate_OAuth2WithoutUserName_Throws() + { + // NS-021: passing an OAuth2 access token but no user identity (FromAddress) + // used to construct `new SaslMechanismOAuth2("", credentials)`, which M365 + // rejects with `535 5.7.3`. The wrapper now refuses upfront so the caller + // sees a clean configuration error rather than a confusing server reject. + var wrapper = new MailKitSmtpClientWrapper(); + + await Assert.ThrowsAsync( + () => wrapper.AuthenticateAsync("oauth2", "access-token", oauth2UserName: null)); + await Assert.ThrowsAsync( + () => wrapper.AuthenticateAsync("oauth2", "access-token", oauth2UserName: "")); + } + + [Fact] + public void XOAuth2InitialResponse_CarriesUserAndBearer() + { + // NS-021 regression guard: independent of the wrapper, prove that MailKit's + // SaslMechanismOAuth2 puts `user=` into the initial-response bytes + // — i.e. wiring the wrapper to pass `FromAddress` is sufficient to fix the + // M365 handshake. If MailKit ever changes the framing this test will catch it. + var sasl = new SaslMechanismOAuth2("noreply@example.com", "tok-xyz"); + + var initial = sasl.Challenge(string.Empty); + var asString = Encoding.UTF8.GetString(Convert.FromBase64String(initial)); + + Assert.Contains("user=noreply@example.com", asString); + Assert.Contains("auth=Bearer tok-xyz", asString); + } } diff --git a/tests/ScadaLink.NotificationService.Tests/NotificationDeliveryServiceTests.cs b/tests/ScadaLink.NotificationService.Tests/NotificationDeliveryServiceTests.cs index 7312089c..8a743629 100644 --- a/tests/ScadaLink.NotificationService.Tests/NotificationDeliveryServiceTests.cs +++ b/tests/ScadaLink.NotificationService.Tests/NotificationDeliveryServiceTests.cs @@ -115,7 +115,8 @@ public class NotificationDeliveryServiceTests await _smtpClient.Received().ConnectAsync( "smtp.example.com", 587, SmtpTlsMode.StartTls, Arg.Any(), Arg.Any()); - await _smtpClient.Received().AuthenticateAsync("basic", "user:pass", Arg.Any()); + await _smtpClient.Received().AuthenticateAsync( + "basic", "user:pass", Arg.Any(), Arg.Any()); await _smtpClient.Received().SendAsync( "noreply@example.com", Arg.Is>(bcc => bcc.Count() == 2), @@ -370,7 +371,7 @@ public class NotificationDeliveryServiceTests public bool Disposed { get; private set; } public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default) => Task.CompletedTask; - public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default) + public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default) => Task.CompletedTask; public Task SendAsync(string from, IEnumerable bccRecipients, string subject, string body, CancellationToken cancellationToken = default) => Task.CompletedTask; @@ -435,7 +436,7 @@ public class NotificationDeliveryServiceTests public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default) => Task.CompletedTask; - public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default) + public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default) => _failOnAuthenticate != null ? Task.FromException(_failOnAuthenticate()) : Task.CompletedTask; public Task SendAsync(string from, IEnumerable bccRecipients, string subject, string body, CancellationToken cancellationToken = default) @@ -496,7 +497,7 @@ public class NotificationDeliveryServiceTests ConnectionTimeoutSeconds = connectionTimeoutSeconds; return Task.CompletedTask; } - public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default) + public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default) => Task.CompletedTask; public Task SendAsync(string from, IEnumerable bccRecipients, string subject, string body, CancellationToken cancellationToken = default) => Task.CompletedTask; @@ -643,7 +644,7 @@ public class NotificationDeliveryServiceTests public BlockingSmtpClient(Func onSend) => _onSend = onSend; public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default) => Task.CompletedTask; - public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default) + public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default) => Task.CompletedTask; public Task SendAsync(string from, IEnumerable bccRecipients, string subject, string body, CancellationToken cancellationToken = default) => _onSend(); @@ -721,17 +722,19 @@ public class NotificationDeliveryServiceTests // ── NotificationService-012: OAuth2 delivery path coverage ── - /// An SMTP wrapper that records the auth type and credentials it received. + /// An SMTP wrapper that records the auth type, credentials, and OAuth2 user identity it received. private sealed class RecordingAuthClient : ISmtpClientWrapper { public string? AuthType { get; private set; } public string? Credentials { get; private set; } + public string? OAuth2UserName { get; private set; } public Task ConnectAsync(string host, int port, SmtpTlsMode tlsMode, int connectionTimeoutSeconds, CancellationToken cancellationToken = default) => Task.CompletedTask; - public Task AuthenticateAsync(string authType, string? credentials, CancellationToken cancellationToken = default) + public Task AuthenticateAsync(string authType, string? credentials, string? oauth2UserName = null, CancellationToken cancellationToken = default) { AuthType = authType; Credentials = credentials; + OAuth2UserName = oauth2UserName; return Task.CompletedTask; } public Task SendAsync(string from, IEnumerable bccRecipients, string subject, string body, CancellationToken cancellationToken = default) @@ -790,6 +793,9 @@ public class NotificationDeliveryServiceTests Assert.True(result.Success); Assert.Equal("oauth2", recording.AuthType); Assert.Equal("oauth2-access-token-xyz", recording.Credentials); + // NS-021: OAuth2 SASL must carry the FromAddress as the user identity so + // the M365 XOAUTH2 handshake's `user=` field matches the token's mailbox. + Assert.Equal("noreply@example.com", recording.OAuth2UserName); } // ── NotificationService-015: unclassified exceptions must not escape SendAsync ──