fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR

CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL
IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the
at-least-once livelock on the site→central notification handoff.

DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent
same-tag subscribes don't orphan an adapter handle; delete the latent
dead _subscriptionHandles dictionary; stop double-counting
_totalSubscribed when an unresolved tag is promoted via another instance;
release adapter handles on mid-flight unsubscribe; gate the
tag-resolution retry timer with IsTimerActive so subscribe bursts don't
reset it into starvation.

SR-020: add _terminatingActorsByName shadow so a third deploy arriving
during a pending redeploy doesn't crash on InvalidActorNameException —
displaced senders get a Failed/superseded response and the latest
command wins on Terminated.

SR-024: split OperationTrackingStore reads from writes (fresh
SqliteConnection per GetStatusAsync) so long writes don't block status
queries; rewrite Dispose to drop the sync-over-async bridge that could
deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the
dispose-once flag race-safe across both paths.
This commit is contained in:
Joseph Doherty
2026-05-28 05:20:13 -04:00
parent 5d2386cc9d
commit f936f55f51
15 changed files with 1152 additions and 170 deletions
@@ -1,4 +1,7 @@
using Microsoft.Data.SqlClient;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using ScadaLink.Commons.Entities.Notifications;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Types.Enums;
@@ -12,7 +15,20 @@ namespace ScadaLink.ConfigurationDatabase.Repositories;
/// </summary>
public class NotificationOutboxRepository : INotificationOutboxRepository
{
// SQL Server duplicate-key error numbers, matching the AuditLogRepository
// and SiteCallAuditRepository race-fixes. 2601 is a unique-index violation;
// 2627 is a primary-key/unique-constraint violation. The IF NOT EXISTS …
// INSERT pattern has a check-then-act race window — two sessions can both
// pass the EXISTS check and then both attempt the INSERT — and the loser
// surfaces as one of these. The site→central handoff is documented
// at-least-once with insert-if-not-exists, so the collision IS the expected
// contention mode; idempotency demands we swallow them rather than let the
// site retry the same NotificationId forever.
private const int SqlErrorUniqueIndexViolation = 2601;
private const int SqlErrorPrimaryKeyViolation = 2627;
private readonly ScadaLinkDbContext _context;
private readonly ILogger<NotificationOutboxRepository> _logger;
// Statuses that represent a finished notification lifecycle. Non-terminal is the complement.
private static readonly NotificationStatus[] TerminalStatuses =
@@ -24,24 +40,67 @@ public class NotificationOutboxRepository : INotificationOutboxRepository
/// <summary>Initializes a new instance of <see cref="NotificationOutboxRepository"/> with the given EF Core context.</summary>
/// <param name="context">The EF Core database context.</param>
public NotificationOutboxRepository(ScadaLinkDbContext context)
/// <param name="logger">Optional logger instance.</param>
public NotificationOutboxRepository(ScadaLinkDbContext context, ILogger<NotificationOutboxRepository>? logger = null)
{
_context = context ?? throw new ArgumentNullException(nameof(context));
_logger = logger ?? NullLogger<NotificationOutboxRepository>.Instance;
}
/// <inheritdoc />
public async Task<bool> InsertIfNotExistsAsync(Notification n, CancellationToken cancellationToken = default)
{
var exists = await _context.Notifications
.AnyAsync(x => x.NotificationId == n.NotificationId, cancellationToken);
if (exists)
if (n is null)
{
return false;
throw new ArgumentNullException(nameof(n));
}
await _context.Notifications.AddAsync(n, cancellationToken);
await _context.SaveChangesAsync(cancellationToken);
return true;
// Enum columns are stored as varchar(32) (HasConversion<string>()); convert
// in C# rather than relying on parameter type inference (SqlClient would
// otherwise bind enums as int by default and break the column conversion).
var type = n.Type.ToString();
var status = n.Status.ToString();
// FormattableString interpolation parameterises every value (no concatenation),
// so this is safe against injection even for the string columns.
try
{
var rowsAffected = await _context.Database.ExecuteSqlInterpolatedAsync(
$@"IF NOT EXISTS (SELECT 1 FROM dbo.Notifications WHERE NotificationId = {n.NotificationId})
INSERT INTO dbo.Notifications
(NotificationId, Type, ListName, Subject, Body, TypeData, Status, RetryCount, LastError,
ResolvedTargets, SourceSiteId, SourceNode, SourceInstanceId, SourceScript,
OriginExecutionId, OriginParentExecutionId,
SiteEnqueuedAt, CreatedAt, LastAttemptAt, NextAttemptAt, DeliveredAt)
VALUES
({n.NotificationId}, {type}, {n.ListName}, {n.Subject}, {n.Body}, {n.TypeData}, {status}, {n.RetryCount}, {n.LastError},
{n.ResolvedTargets}, {n.SourceSiteId}, {n.SourceNode}, {n.SourceInstanceId}, {n.SourceScript},
{n.OriginExecutionId}, {n.OriginParentExecutionId},
{n.SiteEnqueuedAt}, {n.CreatedAt}, {n.LastAttemptAt}, {n.NextAttemptAt}, {n.DeliveredAt});",
cancellationToken);
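// rowsAffected == 1 -> we inserted. Any other value means the IF NOT EXISTS
// guard short-circuited the INSERT because a prior row was already there.
// Note: ADO.NET typically reports -1 (not 0) when no INSERT/UPDATE/DELETE
// statement ran, so compare against 1 rather than testing for 0.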
return rowsAffected == 1;
}
catch (SqlException ex) when (
ex.Number == SqlErrorUniqueIndexViolation
|| ex.Number == SqlErrorPrimaryKeyViolation)
{
// Two concurrent sessions both passed IF NOT EXISTS and both
// attempted the INSERT — the loser raises a duplicate-key error against
// the NotificationId primary key (2627 for a PK/unique constraint; 2601
// is caught as well in case the key is enforced by a unique index).
// First-write-wins idempotency is the documented contract (the
// site→central handoff is at-least-once, and the actor discards the
// return value), so the race outcome is semantically a no-op. Returning
// false here matches the "row already existed" branch of the success path.
_logger.LogDebug(
ex,
"InsertIfNotExistsAsync swallowed duplicate-key violation (error {SqlErrorNumber}) for NotificationId {NotificationId}; treating as no-op.",
ex.Number,
n.NotificationId);
return false;
}
}
/// <inheritdoc />