fix(concurrency): close 8 race / thread-safety findings across CD, DCL, SR
CD-015: rewrite NotificationOutboxRepository.InsertIfNotExistsAsync as raw-SQL IF NOT EXISTS … INSERT with SqlException 2601/2627 catch, ending the at-least-once livelock on the site→central notification handoff. DCL-018/019/020/021/022: add _subscribesInFlight guard so concurrent same-tag subscribes don't orphan an adapter handle; delete the latent dead _subscriptionHandles dictionary; stop double-counting _totalSubscribed when an unresolved tag is promoted via another instance; release adapter handles on mid-flight unsubscribe; gate the tag-resolution retry timer with IsTimerActive so subscribe bursts don't reset it into starvation. SR-020: add _terminatingActorsByName shadow so a third deploy arriving during a pending redeploy doesn't crash on InvalidActorNameException — displaced senders get a Failed/superseded response and the latest command wins on Terminated. SR-024: split OperationTrackingStore reads from writes (fresh SqliteConnection per GetStatusAsync) so long writes don't block status queries; rewrite Dispose to drop the sync-over-async bridge that could deadlock on a non-reentrant SyncContext; Interlocked.Exchange makes the dispose-once flag race-safe across both paths.
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
using Microsoft.Data.SqlClient;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ScadaLink.Commons.Entities.Notifications;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
@@ -12,7 +15,20 @@ namespace ScadaLink.ConfigurationDatabase.Repositories;
|
||||
/// </summary>
|
||||
public class NotificationOutboxRepository : INotificationOutboxRepository
|
||||
{
|
||||
// SQL Server duplicate-key error numbers, matching the AuditLogRepository
|
||||
// and SiteCallAuditRepository race-fixes. 2601 is a unique-index violation;
|
||||
// 2627 is a primary-key/unique-constraint violation. The IF NOT EXISTS …
|
||||
// INSERT pattern has a check-then-act race window — two sessions can both
|
||||
// pass the EXISTS check and then both attempt the INSERT — and the loser
|
||||
// surfaces as one of these. The site→central handoff is documented
|
||||
// at-least-once with insert-if-not-exists, so the collision IS the expected
|
||||
// contention mode; idempotency demands we swallow them rather than let the
|
||||
// site retry the same NotificationId forever.
|
||||
private const int SqlErrorUniqueIndexViolation = 2601;
|
||||
private const int SqlErrorPrimaryKeyViolation = 2627;
|
||||
|
||||
private readonly ScadaLinkDbContext _context;
|
||||
private readonly ILogger<NotificationOutboxRepository> _logger;
|
||||
|
||||
// Statuses that represent a finished notification lifecycle. Non-terminal is the complement.
|
||||
private static readonly NotificationStatus[] TerminalStatuses =
|
||||
@@ -24,24 +40,67 @@ public class NotificationOutboxRepository : INotificationOutboxRepository
|
||||
|
||||
/// <summary>Initializes a new instance of <see cref="NotificationOutboxRepository"/> with the given EF Core context.</summary>
|
||||
/// <param name="context">The EF Core database context.</param>
|
||||
public NotificationOutboxRepository(ScadaLinkDbContext context)
|
||||
/// <param name="logger">Optional logger instance.</param>
|
||||
public NotificationOutboxRepository(ScadaLinkDbContext context, ILogger<NotificationOutboxRepository>? logger = null)
|
||||
{
|
||||
_context = context ?? throw new ArgumentNullException(nameof(context));
|
||||
_logger = logger ?? NullLogger<NotificationOutboxRepository>.Instance;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<bool> InsertIfNotExistsAsync(Notification n, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var exists = await _context.Notifications
|
||||
.AnyAsync(x => x.NotificationId == n.NotificationId, cancellationToken);
|
||||
if (exists)
|
||||
if (n is null)
|
||||
{
|
||||
return false;
|
||||
throw new ArgumentNullException(nameof(n));
|
||||
}
|
||||
|
||||
await _context.Notifications.AddAsync(n, cancellationToken);
|
||||
await _context.SaveChangesAsync(cancellationToken);
|
||||
return true;
|
||||
// Enum columns are stored as varchar(32) (HasConversion<string>()); convert
|
||||
// in C# rather than relying on parameter type inference (SqlClient would
|
||||
// otherwise bind enums as int by default and break the column conversion).
|
||||
var type = n.Type.ToString();
|
||||
var status = n.Status.ToString();
|
||||
|
||||
// FormattableString interpolation parameterises every value (no concatenation),
|
||||
// so this is safe against injection even for the string columns.
|
||||
try
|
||||
{
|
||||
var rowsAffected = await _context.Database.ExecuteSqlInterpolatedAsync(
|
||||
$@"IF NOT EXISTS (SELECT 1 FROM dbo.Notifications WHERE NotificationId = {n.NotificationId})
|
||||
INSERT INTO dbo.Notifications
|
||||
(NotificationId, Type, ListName, Subject, Body, TypeData, Status, RetryCount, LastError,
|
||||
ResolvedTargets, SourceSiteId, SourceNode, SourceInstanceId, SourceScript,
|
||||
OriginExecutionId, OriginParentExecutionId,
|
||||
SiteEnqueuedAt, CreatedAt, LastAttemptAt, NextAttemptAt, DeliveredAt)
|
||||
VALUES
|
||||
({n.NotificationId}, {type}, {n.ListName}, {n.Subject}, {n.Body}, {n.TypeData}, {status}, {n.RetryCount}, {n.LastError},
|
||||
{n.ResolvedTargets}, {n.SourceSiteId}, {n.SourceNode}, {n.SourceInstanceId}, {n.SourceScript},
|
||||
{n.OriginExecutionId}, {n.OriginParentExecutionId},
|
||||
{n.SiteEnqueuedAt}, {n.CreatedAt}, {n.LastAttemptAt}, {n.NextAttemptAt}, {n.DeliveredAt});",
|
||||
cancellationToken);
|
||||
|
||||
// rowsAffected == 1 -> we inserted; 0 -> a prior row was already there
|
||||
// (IF NOT EXISTS short-circuited the INSERT).
|
||||
return rowsAffected == 1;
|
||||
}
|
||||
catch (SqlException ex) when (
|
||||
ex.Number == SqlErrorUniqueIndexViolation
|
||||
|| ex.Number == SqlErrorPrimaryKeyViolation)
|
||||
{
|
||||
// Two concurrent sessions both passed IF NOT EXISTS and both
|
||||
// attempted the INSERT — the loser raises 2601/2627 against the
|
||||
// NotificationId primary key. First-write-wins idempotency is the
|
||||
// documented contract (the site→central handoff is at-least-once,
|
||||
// and the actor discards the return value), so the race outcome is
|
||||
// semantically a no-op. Returning false here matches the
|
||||
// "row already existed" branch of the success path.
|
||||
_logger.LogDebug(
|
||||
ex,
|
||||
"InsertIfNotExistsAsync swallowed duplicate-key violation (error {SqlErrorNumber}) for NotificationId {NotificationId}; treating as no-op.",
|
||||
ex.Number,
|
||||
n.NotificationId);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
|
||||
@@ -71,6 +71,22 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
/// </summary>
|
||||
private readonly HashSet<string> _resolutionInFlight = new();
|
||||
|
||||
/// <summary>
|
||||
/// DataConnectionLayer-018: tags whose initial SubscribeAsync (issued from
|
||||
/// <see cref="HandleSubscribe"/>) is currently in flight. Two parallel
|
||||
/// <c>SubscribeTagsRequest</c> messages for different instances sharing a tag
|
||||
/// path would otherwise both observe "not subscribed" against
|
||||
/// <see cref="_subscriptionIds"/> (the in-flight task has not yet posted its
|
||||
/// <see cref="SubscribeCompleted"/>), both call <c>_adapter.SubscribeAsync</c>,
|
||||
/// and the second subscription id gets silently dropped at the existing
|
||||
/// <c>_subscriptionIds.ContainsKey</c> guard in <see cref="HandleSubscribeCompleted"/>
|
||||
/// — orphaning the adapter's monitored item (duplicate notifications + leaked
|
||||
/// memory until the connection drops). This set is read+written only on the
|
||||
/// actor thread and cleared in <see cref="ReSubscribeAll"/> for symmetry with
|
||||
/// <see cref="_resolutionInFlight"/>.
|
||||
/// </summary>
|
||||
private readonly HashSet<string> _subscribesInFlight = new();
|
||||
|
||||
/// <summary>
|
||||
/// Subscribers: instanceUniqueName → IActorRef (the Instance Actor).
|
||||
/// </summary>
|
||||
@@ -550,27 +566,43 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
// from this adapter can be distinguished from a later (post-failover) adapter.
|
||||
var generation = _adapterGeneration;
|
||||
|
||||
// Snapshot the already-subscribed tag set on the actor thread. The background
|
||||
// task below must NOT read or mutate actor state — it performs only adapter
|
||||
// I/O and reports results back via a SubscribeCompleted message, which is
|
||||
// applied to actor state on the actor thread (see HandleSubscribeCompleted).
|
||||
var alreadySubscribed = new HashSet<string>(_subscriptionIds.Keys);
|
||||
// DataConnectionLayer-018: partition tags on the actor thread into "this
|
||||
// request will issue _adapter.SubscribeAsync" vs. "already subscribed (by us
|
||||
// or by another in-flight SubscribeTagsRequest)". A tag that is already in
|
||||
// _subscriptionIds OR currently in _subscribesInFlight is treated as
|
||||
// AlreadySubscribed — the eventual SubscribeCompleted of the in-flight
|
||||
// request will populate _subscriptionIds, at which point a subsequent
|
||||
// unsubscribe by either instance correctly references the adapter handle.
|
||||
// The background task below must NOT read or mutate actor state — these
|
||||
// partitioned lists are the only state it sees.
|
||||
var tagsToSubscribe = new List<string>(request.TagPaths.Count);
|
||||
var preResolvedResults = new List<SubscribeTagResult>();
|
||||
foreach (var tagPath in request.TagPaths)
|
||||
{
|
||||
if (_subscriptionIds.ContainsKey(tagPath) || _subscribesInFlight.Contains(tagPath))
|
||||
{
|
||||
preResolvedResults.Add(new SubscribeTagResult(
|
||||
tagPath, AlreadySubscribed: true, Success: true, null, null));
|
||||
}
|
||||
else
|
||||
{
|
||||
tagsToSubscribe.Add(tagPath);
|
||||
_subscribesInFlight.Add(tagPath);
|
||||
}
|
||||
}
|
||||
|
||||
Task.Run(async () =>
|
||||
{
|
||||
var results = new List<SubscribeTagResult>(request.TagPaths.Count);
|
||||
var tagsToSeed = new List<string>();
|
||||
|
||||
foreach (var tagPath in request.TagPaths)
|
||||
results.AddRange(preResolvedResults);
|
||||
var tagsToSeed = new List<string>(preResolvedResults.Count + tagsToSubscribe.Count);
|
||||
foreach (var r in preResolvedResults)
|
||||
{
|
||||
if (alreadySubscribed.Contains(tagPath))
|
||||
{
|
||||
// Already subscribed by another instance — just track for this one.
|
||||
results.Add(new SubscribeTagResult(tagPath, AlreadySubscribed: true, Success: true, null, null));
|
||||
tagsToSeed.Add(tagPath);
|
||||
continue;
|
||||
}
|
||||
tagsToSeed.Add(r.TagPath);
|
||||
}
|
||||
|
||||
foreach (var tagPath in tagsToSubscribe)
|
||||
{
|
||||
try
|
||||
{
|
||||
var subId = await _adapter.SubscribeAsync(tagPath, (path, value) =>
|
||||
@@ -628,9 +660,45 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
var instanceName = msg.Request.InstanceUniqueName;
|
||||
if (!_subscriptionsByInstance.TryGetValue(instanceName, out var instanceTags))
|
||||
{
|
||||
// The instance was unsubscribed while the subscribe I/O was in flight.
|
||||
instanceTags = new HashSet<string>();
|
||||
_subscriptionsByInstance[instanceName] = instanceTags;
|
||||
// DataConnectionLayer-021: the instance was unsubscribed while the
|
||||
// subscribe I/O was in flight. Re-creating the per-instance entry and
|
||||
// applying counter/handle mutations here would permanently leak state
|
||||
// — _subscriptionsByInstance[instanceName] resurrected with no
|
||||
// subscriber to receive callbacks, _tagSubscriberCount inflated forever
|
||||
// (no future HandleUnsubscribe will drop it), and _totalSubscribed /
|
||||
// _resolvedTags drifting above the real instance count across the
|
||||
// adapter lifetime (also re-issued by ReSubscribeAll on every
|
||||
// reconnect). Instead: drop all state mutations for this stale
|
||||
// message and release the adapter-level monitored items we just
|
||||
// created so the device doesn't keep streaming change notifications
|
||||
// for a tag nobody is subscribed to.
|
||||
_log.Warning(
|
||||
"[{0}] SubscribeCompleted arrived for instance {1} but the instance " +
|
||||
"was unsubscribed while the subscribe was in flight; releasing " +
|
||||
"{2} adapter handle(s) and discarding state mutations.",
|
||||
_connectionName, instanceName, msg.Results.Count(r => r.Success && !r.AlreadySubscribed));
|
||||
|
||||
foreach (var result in msg.Results)
|
||||
{
|
||||
// DCL-018: clear in-flight markers we placed in HandleSubscribe.
|
||||
if (!result.AlreadySubscribed)
|
||||
_subscribesInFlight.Remove(result.TagPath);
|
||||
|
||||
// Fire-and-forget release of any subscription id this request
|
||||
// genuinely created. AlreadySubscribed=true means another caller
|
||||
// owns the adapter handle and unsubscribing it would break them.
|
||||
if (result is { Success: true, AlreadySubscribed: false, SubscriptionId: not null })
|
||||
{
|
||||
_ = _adapter.UnsubscribeAsync(result.SubscriptionId);
|
||||
}
|
||||
}
|
||||
|
||||
// The original sender is already gone (unsubscribed). Telling a dead
|
||||
// ref produces a dead letter, which is the harmless and observable
|
||||
// outcome — but skipping the reply altogether keeps dead-letter noise
|
||||
// out of the log when this race fires in the normal disable/redeploy
|
||||
// path. The unsubscribe message did NOT request a response of its own.
|
||||
return false;
|
||||
}
|
||||
|
||||
// DataConnectionLayer-004: if any tag failed because the adapter is not
|
||||
@@ -641,6 +709,15 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
|
||||
foreach (var result in msg.Results)
|
||||
{
|
||||
// DataConnectionLayer-018: a result with AlreadySubscribed: false means
|
||||
// this request was responsible for the SubscribeAsync call — the tag
|
||||
// was added to _subscribesInFlight in HandleSubscribe. Clear it now so
|
||||
// a later SubscribeTagsRequest for the same tag isn't forever treated
|
||||
// as in-flight. AlreadySubscribed: true tags were not added to the
|
||||
// set (another request owned the in-flight slot).
|
||||
if (!result.AlreadySubscribed)
|
||||
_subscribesInFlight.Remove(result.TagPath);
|
||||
|
||||
// DataConnectionLayer-008: only a tag newly added to THIS instance's set
|
||||
// increments the reference count, so the count stays an accurate "number
|
||||
// of distinct instances subscribed to this tag".
|
||||
@@ -656,8 +733,27 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
if (result.Success)
|
||||
{
|
||||
_subscriptionIds[result.TagPath] = result.SubscriptionId!;
|
||||
_totalSubscribed++;
|
||||
_resolvedTags++;
|
||||
|
||||
// DataConnectionLayer-020: distinguish fresh subscribe from
|
||||
// unresolved → resolved promotion. If an earlier instance's
|
||||
// subscribe for this tag had failed at the resolution layer
|
||||
// (the tag was already added to _unresolvedTags AND already
|
||||
// counted in _totalSubscribed), this success transitions it
|
||||
// from unresolved to resolved — increment _resolvedTags ONLY.
|
||||
// Incrementing _totalSubscribed again here would over-count by
|
||||
// one until HandleTagResolutionSucceeded reconciled. Mirrors
|
||||
// HandleTagResolutionSucceeded's promotion shape so both paths
|
||||
// resolve a previously-failed tag identically.
|
||||
if (_unresolvedTags.Remove(result.TagPath))
|
||||
{
|
||||
_resolutionInFlight.Remove(result.TagPath);
|
||||
_resolvedTags++;
|
||||
}
|
||||
else
|
||||
{
|
||||
_totalSubscribed++;
|
||||
_resolvedTags++;
|
||||
}
|
||||
}
|
||||
else if (result.ConnectionLevelFailure)
|
||||
{
|
||||
@@ -670,9 +766,17 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
else
|
||||
{
|
||||
// WP-12: genuine tag resolution failure — mark unresolved so the
|
||||
// periodic retry timer picks it up.
|
||||
_unresolvedTags.Add(result.TagPath);
|
||||
_totalSubscribed++;
|
||||
// periodic retry timer picks it up. DataConnectionLayer-020:
|
||||
// only increment _totalSubscribed when the tag is genuinely
|
||||
// newly-tracked. A second instance failing to resolve a tag the
|
||||
// first instance already added to _unresolvedTags is the same
|
||||
// logical tag, counted once — bumping _totalSubscribed again
|
||||
// would over-report TotalSubscribedTags forever.
|
||||
var newlyUnresolved = _unresolvedTags.Add(result.TagPath);
|
||||
if (newlyUnresolved)
|
||||
{
|
||||
_totalSubscribed++;
|
||||
}
|
||||
_log.Debug("[{0}] Tag resolution failed for {1}: {2}",
|
||||
_connectionName, result.TagPath, result.Error);
|
||||
|
||||
@@ -688,7 +792,13 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
}
|
||||
|
||||
// Start the tag-resolution retry timer if any tags are unresolved.
|
||||
if (_unresolvedTags.Count > 0)
|
||||
// DataConnectionLayer-022: StartPeriodicTimer with an existing key CANCELS
|
||||
// and replaces the prior timer, so a fan-out of SubscribeTagsRequests
|
||||
// arriving faster than TagResolutionRetryInterval would keep resetting
|
||||
// the timer and starve the retry indefinitely. Gating on IsTimerActive
|
||||
// means the first failure starts the timer and subsequent failures
|
||||
// simply pile onto _unresolvedTags without restarting the clock.
|
||||
if (_unresolvedTags.Count > 0 && !Timers.IsTimerActive("tag-resolution-retry"))
|
||||
{
|
||||
Timers.StartPeriodicTimer(
|
||||
"tag-resolution-retry",
|
||||
@@ -918,6 +1028,12 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_subscriptionIds.Clear();
|
||||
_unresolvedTags.Clear();
|
||||
_resolutionInFlight.Clear();
|
||||
// DataConnectionLayer-018: symmetric with _resolutionInFlight — any pending
|
||||
// initial-subscribe completions from the previous adapter generation will
|
||||
// post SubscribeCompleted to the actor, but ReSubscribeAll has just emptied
|
||||
// the in-flight tracking; the stale completion simply has nothing to
|
||||
// remove (idempotent HashSet.Remove on a missing key).
|
||||
_subscribesInFlight.Clear();
|
||||
_resolvedTags = 0;
|
||||
|
||||
// DataConnectionLayer-006: reset the quality tracking too. Otherwise tags
|
||||
@@ -987,8 +1103,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
// it is eligible for the next retry tick again.
|
||||
_resolutionInFlight.Remove(msg.TagPath);
|
||||
|
||||
// Track as unresolved so periodic retry picks it up
|
||||
if (_unresolvedTags.Add(msg.TagPath))
|
||||
// Track as unresolved so periodic retry picks it up. DCL-022: gate on
|
||||
// IsTimerActive so a stream of TagResolutionFailed events doesn't keep
|
||||
// cancelling and re-starting the timer faster than its own interval.
|
||||
if (_unresolvedTags.Add(msg.TagPath) && !Timers.IsTimerActive("tag-resolution-retry"))
|
||||
{
|
||||
Timers.StartPeriodicTimer(
|
||||
"tag-resolution-retry",
|
||||
|
||||
@@ -25,10 +25,16 @@ public class OpcUaDataConnection : IDataConnection
|
||||
private string _endpointUrl = string.Empty;
|
||||
private ConnectionHealth _status = ConnectionHealth.Disconnected;
|
||||
|
||||
/// <summary>
|
||||
/// Maps subscription IDs to their tag paths for cleanup.
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, string> _subscriptionHandles = new();
|
||||
// DataConnectionLayer-019: the previous _subscriptionHandles Dictionary was
|
||||
// dead state — written by SubscribeAsync, removed by UnsubscribeAsync, but
|
||||
// never read anywhere. Plain Dictionary writes from thread-pool continuations
|
||||
// after await are racy (concurrent resize is undefined: InvalidOperationException,
|
||||
// bucket corruption, or silently lost entries). Bookkeeping for subscriptions
|
||||
// lives at two genuine layers: RealOpcUaClient._monitoredItems/_callbacks
|
||||
// (already ConcurrentDictionary per DCL-003) at the device adapter, and
|
||||
// DataConnectionActor._subscriptionIds at the actor — both authoritative.
|
||||
// The adapter has no need for a third copy; the field is removed rather than
|
||||
// converted to ConcurrentDictionary because there is no reader.
|
||||
private StaleTagMonitor? _staleMonitor;
|
||||
private string? _heartbeatSubscriptionId;
|
||||
|
||||
@@ -155,7 +161,10 @@ public class OpcUaDataConnection : IDataConnection
|
||||
{
|
||||
EnsureConnected();
|
||||
|
||||
var subscriptionId = await _client!.CreateSubscriptionAsync(
|
||||
// DataConnectionLayer-019: subscriptionId is returned directly to the
|
||||
// caller (DataConnectionActor stores it in _subscriptionIds). No local
|
||||
// bookkeeping is kept here — see the field-deletion note above.
|
||||
return await _client!.CreateSubscriptionAsync(
|
||||
tagPath,
|
||||
(nodeId, value, timestamp, statusCode) =>
|
||||
{
|
||||
@@ -163,9 +172,6 @@ public class OpcUaDataConnection : IDataConnection
|
||||
callback(tagPath, new TagValue(value, quality, new DateTimeOffset(timestamp, TimeSpan.Zero)));
|
||||
},
|
||||
cancellationToken);
|
||||
|
||||
_subscriptionHandles[subscriptionId] = tagPath;
|
||||
return subscriptionId;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
@@ -174,7 +180,6 @@ public class OpcUaDataConnection : IDataConnection
|
||||
if (_client != null)
|
||||
{
|
||||
await _client.RemoveSubscriptionAsync(subscriptionId, cancellationToken);
|
||||
_subscriptionHandles.Remove(subscriptionId);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -53,6 +53,22 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
/// confirms the child has fully stopped (SiteRuntime-003).
|
||||
/// </summary>
|
||||
private readonly Dictionary<IActorRef, PendingRedeploy> _pendingRedeploys = new();
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-020: name → terminating actor ref shadow of <see cref="_pendingRedeploys"/>.
|
||||
/// Required because a third <see cref="DeployInstanceCommand"/> for the same instance
|
||||
/// arriving WHILE a redeploy is still mid-termination would otherwise see
|
||||
/// <c>_instanceActors.TryGetValue == false</c> and fall through to
|
||||
/// <see cref="ApplyDeployment"/> + <see cref="CreateInstanceActor"/>, where
|
||||
/// <c>Context.ActorOf(props, instanceName)</c> throws <c>InvalidActorNameException</c>
|
||||
/// — the child name is still registered until the <see cref="Terminated"/> signal fires.
|
||||
/// The supervisor's Stop directive then drops the deploy command silently, leaving the
|
||||
/// deployer waiting forever and persistence dangling. The shadow index lets
|
||||
/// <see cref="HandleDeploy"/> detect the mid-termination state and overwrite the
|
||||
/// buffered pending command (last-write-wins) instead of trying to create a fresh actor.
|
||||
/// Cleared in <see cref="HandleTerminated"/> alongside <see cref="_pendingRedeploys"/>.
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, IActorRef> _terminatingActorsByName = new();
|
||||
private int _totalDeployedCount;
|
||||
|
||||
/// <summary>Akka timer scheduler injected by the framework via <see cref="IWithTimers"/>.</summary>
|
||||
@@ -296,12 +312,36 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
_instanceActors.Remove(instanceName);
|
||||
_pendingRedeploys[existing] = new PendingRedeploy(command, Sender);
|
||||
_terminatingActorsByName[instanceName] = existing;
|
||||
Context.Watch(existing);
|
||||
Context.Stop(existing);
|
||||
UpdateInstanceCounts();
|
||||
return;
|
||||
}
|
||||
|
||||
// SiteRuntime-020: a deploy arriving while the previous redeploy is still
|
||||
// terminating (the Terminated signal hasn't fired yet) used to fall through
|
||||
// to ApplyDeployment(fresh), where Context.ActorOf would throw
|
||||
// InvalidActorNameException because the child name is still registered.
|
||||
// Detect the mid-termination state and overwrite the buffered pending
|
||||
// command (last-write-wins) so the latest deploy is applied when Terminated
|
||||
// arrives. The displaced sender is told Failed-superseded so it doesn't
|
||||
// wait forever.
|
||||
if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef))
|
||||
{
|
||||
if (_pendingRedeploys.TryGetValue(terminatingRef, out var displaced))
|
||||
{
|
||||
displaced.OriginalSender.Tell(new DeploymentStatusResponse(
|
||||
displaced.Command.DeploymentId,
|
||||
instanceName,
|
||||
DeploymentStatus.Failed,
|
||||
$"superseded by newer deployment {command.DeploymentId} before predecessor finished terminating",
|
||||
DateTimeOffset.UtcNow));
|
||||
}
|
||||
_pendingRedeploys[terminatingRef] = new PendingRedeploy(command, Sender);
|
||||
return;
|
||||
}
|
||||
|
||||
// Fresh deployment — no existing actor to replace.
|
||||
ApplyDeployment(command, Sender, isRedeploy: false);
|
||||
}
|
||||
@@ -315,6 +355,13 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
if (!_pendingRedeploys.Remove(terminated.ActorRef, out var pending))
|
||||
return;
|
||||
|
||||
// SiteRuntime-020: drop the name-keyed shadow now that the predecessor has
|
||||
// fully terminated and its actor name is free. Any deploy arriving after
|
||||
// this point sees neither _instanceActors[name] (cleared when we stopped
|
||||
// the predecessor) nor _terminatingActorsByName[name] (cleared here), so
|
||||
// ApplyDeployment + Context.ActorOf below safely reuses the name.
|
||||
_terminatingActorsByName.Remove(pending.Command.InstanceUniqueName);
|
||||
|
||||
ApplyDeployment(pending.Command, pending.OriginalSender, isRedeploy: true);
|
||||
}
|
||||
|
||||
|
||||
@@ -36,10 +36,20 @@ namespace ScadaLink.SiteRuntime.Tracking;
|
||||
/// </remarks>
|
||||
public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable, IDisposable
|
||||
{
|
||||
private readonly SqliteConnection _connection;
|
||||
private readonly SemaphoreSlim _gate = new(1, 1);
|
||||
// SiteRuntime-024: writer state — one owned SqliteConnection serialised behind
|
||||
// _writeGate. Readers do NOT share this connection or gate; see GetStatusAsync.
|
||||
private readonly SqliteConnection _writeConnection;
|
||||
private readonly SemaphoreSlim _writeGate = new(1, 1);
|
||||
private readonly string _connectionString;
|
||||
private readonly ILogger<OperationTrackingStore> _logger;
|
||||
private bool _disposed;
|
||||
|
||||
// SiteRuntime-024: dispose-once state shared by the sync Dispose and async
|
||||
// DisposeAsync paths. Interlocked.Exchange is the race-safe primitive here —
|
||||
// a plain bool can be flipped twice if Dispose() and DisposeAsync() are
|
||||
// invoked concurrently (e.g. host shutdown bridging both). 0 = live,
|
||||
// 1 = disposed. Read by other methods via Volatile.Read after the gate is
|
||||
// taken; they raise ObjectDisposedException when set.
|
||||
private int _disposeState;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the tracking store, opens the SQLite connection, and applies the schema.
|
||||
@@ -54,14 +64,15 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
|
||||
_logger = logger;
|
||||
_connection = new SqliteConnection(options.Value.ConnectionString);
|
||||
_connection.Open();
|
||||
_connectionString = options.Value.ConnectionString;
|
||||
_writeConnection = new SqliteConnection(_connectionString);
|
||||
_writeConnection.Open();
|
||||
InitializeSchema();
|
||||
}
|
||||
|
||||
private void InitializeSchema()
|
||||
{
|
||||
using var cmd = _connection.CreateCommand();
|
||||
using var cmd = _writeConnection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
CREATE TABLE IF NOT EXISTS OperationTracking (
|
||||
TrackedOperationId TEXT NOT NULL PRIMARY KEY,
|
||||
@@ -112,7 +123,7 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
/// </summary>
|
||||
private void AddColumnIfMissing(string columnName, string columnDefinition)
|
||||
{
|
||||
using var probe = _connection.CreateCommand();
|
||||
using var probe = _writeConnection.CreateCommand();
|
||||
probe.CommandText = "SELECT COUNT(*) FROM pragma_table_info('OperationTracking') WHERE name = $name";
|
||||
probe.Parameters.AddWithValue("$name", columnName);
|
||||
var exists = Convert.ToInt32(probe.ExecuteScalar()) > 0;
|
||||
@@ -121,7 +132,7 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
return;
|
||||
}
|
||||
|
||||
using var alter = _connection.CreateCommand();
|
||||
using var alter = _writeConnection.CreateCommand();
|
||||
// Column name + definition are caller-controlled constants, never user
|
||||
// input — safe to interpolate (parameters are not permitted in DDL).
|
||||
alter.CommandText = $"ALTER TABLE OperationTracking ADD COLUMN {columnName} {columnDefinition}";
|
||||
@@ -140,14 +151,14 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(kind);
|
||||
|
||||
await _gate.WaitAsync(ct).ConfigureAwait(false);
|
||||
await _writeGate.WaitAsync(ct).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposeState) != 0, this);
|
||||
|
||||
var now = DateTime.UtcNow.ToString("o", CultureInfo.InvariantCulture);
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
using var cmd = _writeConnection.CreateCommand();
|
||||
// INSERT OR IGNORE: duplicate ids are no-ops (first-write-wins) —
|
||||
// matches the at-least-once semantics the site emits under.
|
||||
cmd.CommandText = """
|
||||
@@ -176,7 +187,7 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
}
|
||||
finally
|
||||
{
|
||||
_gate.Release();
|
||||
_writeGate.Release();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -191,14 +202,14 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(status);
|
||||
|
||||
await _gate.WaitAsync(ct).ConfigureAwait(false);
|
||||
await _writeGate.WaitAsync(ct).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposeState) != 0, this);
|
||||
|
||||
var now = DateTime.UtcNow.ToString("o", CultureInfo.InvariantCulture);
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
using var cmd = _writeConnection.CreateCommand();
|
||||
// Terminal rows are immutable — the WHERE clause filters them out so
|
||||
// late-arriving attempt telemetry never overwrites a resolved row.
|
||||
cmd.CommandText = """
|
||||
@@ -222,7 +233,7 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
}
|
||||
finally
|
||||
{
|
||||
_gate.Release();
|
||||
_writeGate.Release();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -236,14 +247,14 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(status);
|
||||
|
||||
await _gate.WaitAsync(ct).ConfigureAwait(false);
|
||||
await _writeGate.WaitAsync(ct).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposeState) != 0, this);
|
||||
|
||||
var now = DateTime.UtcNow.ToString("o", CultureInfo.InvariantCulture);
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
using var cmd = _writeConnection.CreateCommand();
|
||||
// First-write-wins on the terminal flip: only update rows that
|
||||
// haven't already terminated.
|
||||
cmd.CommandText = """
|
||||
@@ -266,7 +277,7 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
}
|
||||
finally
|
||||
{
|
||||
_gate.Release();
|
||||
_writeGate.Release();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -275,47 +286,48 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
TrackedOperationId id,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
await _gate.WaitAsync(ct).ConfigureAwait(false);
|
||||
try
|
||||
ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposeState) != 0, this);
|
||||
|
||||
// SiteRuntime-024: reads open a fresh, ungated SqliteConnection so a
|
||||
// long-running write doesn't block status queries. The connection
|
||||
// string is shared with the writer; SQLite handles cross-connection
|
||||
// isolation natively (a reader sees a consistent snapshot via the
|
||||
// shared cache lock for in-memory DBs, or a WAL snapshot for file DBs).
|
||||
// Mirrors the SiteStorageService precedent.
|
||||
await using var readConnection = new SqliteConnection(_connectionString);
|
||||
await readConnection.OpenAsync(ct).ConfigureAwait(false);
|
||||
|
||||
await using var cmd = readConnection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT TrackedOperationId, Kind, TargetSummary, Status,
|
||||
RetryCount, LastError, HttpStatus,
|
||||
CreatedAtUtc, UpdatedAtUtc, TerminalAtUtc,
|
||||
SourceInstanceId, SourceScript, SourceNode
|
||||
FROM OperationTracking
|
||||
WHERE TrackedOperationId = $id;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$id", id.ToString());
|
||||
|
||||
await using var reader = await cmd.ExecuteReaderAsync(ct).ConfigureAwait(false);
|
||||
if (!await reader.ReadAsync(ct).ConfigureAwait(false))
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
cmd.CommandText = """
|
||||
SELECT TrackedOperationId, Kind, TargetSummary, Status,
|
||||
RetryCount, LastError, HttpStatus,
|
||||
CreatedAtUtc, UpdatedAtUtc, TerminalAtUtc,
|
||||
SourceInstanceId, SourceScript, SourceNode
|
||||
FROM OperationTracking
|
||||
WHERE TrackedOperationId = $id;
|
||||
""";
|
||||
cmd.Parameters.AddWithValue("$id", id.ToString());
|
||||
|
||||
using var reader = cmd.ExecuteReader();
|
||||
if (!reader.Read())
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
return new TrackingStatusSnapshot(
|
||||
Id: TrackedOperationId.Parse(reader.GetString(0)),
|
||||
Kind: reader.GetString(1),
|
||||
TargetSummary: reader.IsDBNull(2) ? null : reader.GetString(2),
|
||||
Status: reader.GetString(3),
|
||||
RetryCount: reader.GetInt32(4),
|
||||
LastError: reader.IsDBNull(5) ? null : reader.GetString(5),
|
||||
HttpStatus: reader.IsDBNull(6) ? null : reader.GetInt32(6),
|
||||
CreatedAtUtc: ParseUtc(reader.GetString(7)),
|
||||
UpdatedAtUtc: ParseUtc(reader.GetString(8)),
|
||||
TerminalAtUtc: reader.IsDBNull(9) ? null : ParseUtc(reader.GetString(9)),
|
||||
SourceInstanceId: reader.IsDBNull(10) ? null : reader.GetString(10),
|
||||
SourceScript: reader.IsDBNull(11) ? null : reader.GetString(11),
|
||||
SourceNode: reader.IsDBNull(12) ? null : reader.GetString(12));
|
||||
}
|
||||
finally
|
||||
{
|
||||
_gate.Release();
|
||||
return null;
|
||||
}
|
||||
|
||||
return new TrackingStatusSnapshot(
|
||||
Id: TrackedOperationId.Parse(reader.GetString(0)),
|
||||
Kind: reader.GetString(1),
|
||||
TargetSummary: reader.IsDBNull(2) ? null : reader.GetString(2),
|
||||
Status: reader.GetString(3),
|
||||
RetryCount: reader.GetInt32(4),
|
||||
LastError: reader.IsDBNull(5) ? null : reader.GetString(5),
|
||||
HttpStatus: reader.IsDBNull(6) ? null : reader.GetInt32(6),
|
||||
CreatedAtUtc: ParseUtc(reader.GetString(7)),
|
||||
UpdatedAtUtc: ParseUtc(reader.GetString(8)),
|
||||
TerminalAtUtc: reader.IsDBNull(9) ? null : ParseUtc(reader.GetString(9)),
|
||||
SourceInstanceId: reader.IsDBNull(10) ? null : reader.GetString(10),
|
||||
SourceScript: reader.IsDBNull(11) ? null : reader.GetString(11),
|
||||
SourceNode: reader.IsDBNull(12) ? null : reader.GetString(12));
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
@@ -323,12 +335,12 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
DateTime olderThanUtc,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
await _gate.WaitAsync(ct).ConfigureAwait(false);
|
||||
await _writeGate.WaitAsync(ct).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||
ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposeState) != 0, this);
|
||||
|
||||
using var cmd = _connection.CreateCommand();
|
||||
using var cmd = _writeConnection.CreateCommand();
|
||||
// Non-terminal rows (TerminalAtUtc IS NULL) are kept regardless of
|
||||
// age — the operation is still in flight.
|
||||
cmd.CommandText = """
|
||||
@@ -344,7 +356,7 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
}
|
||||
finally
|
||||
{
|
||||
_gate.Release();
|
||||
_writeGate.Release();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -356,33 +368,68 @@ public class OperationTrackingStore : IOperationTrackingStore, IAsyncDisposable,
|
||||
DateTimeStyles.RoundtripKind);
|
||||
}
|
||||
|
||||
/// <summary>Synchronously disposes the tracking store and its SQLite connection.</summary>
|
||||
/// <summary>
|
||||
/// Synchronously disposes the tracking store and its SQLite connection.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// SiteRuntime-024: this path does NOT bridge to async via
|
||||
/// <c>.AsTask().GetAwaiter().GetResult()</c>. Sync-over-async on a SemaphoreSlim
|
||||
/// can deadlock when invoked from a non-reentrant SyncContext (e.g. host
|
||||
/// shutdown continuations observed on the host sync context). In-flight writes
|
||||
/// at the moment of <see cref="Dispose"/> will fail their next operation
|
||||
/// against the disposed connection with <see cref="ObjectDisposedException"/> —
|
||||
/// the caller's responsibility is to ensure no concurrent operations during
|
||||
/// the synchronous dispose. Use <see cref="DisposeAsync"/> if you need to
|
||||
/// drain in-flight writes before close.
|
||||
/// </remarks>
|
||||
public void Dispose()
|
||||
{
|
||||
DisposeAsyncCore().AsTask().GetAwaiter().GetResult();
|
||||
if (Interlocked.Exchange(ref _disposeState, 1) != 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
_writeConnection.Dispose();
|
||||
_writeGate.Dispose();
|
||||
GC.SuppressFinalize(this);
|
||||
}
|
||||
|
||||
/// <summary>Asynchronously disposes the tracking store and its SQLite connection.</summary>
|
||||
/// <summary>
|
||||
/// Asynchronously disposes the tracking store and its SQLite connection.
|
||||
/// Drains in-flight writes by acquiring the write gate before closing the
|
||||
/// connection, so a write currently executing a SqliteCommand completes
|
||||
/// before the connection is freed.
|
||||
/// </summary>
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
await DisposeAsyncCore().ConfigureAwait(false);
|
||||
GC.SuppressFinalize(this);
|
||||
}
|
||||
if (Interlocked.Exchange(ref _disposeState, 1) != 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
private async ValueTask DisposeAsyncCore()
|
||||
{
|
||||
await _gate.WaitAsync().ConfigureAwait(false);
|
||||
// Drain any in-flight write by taking the write gate. Past this point
|
||||
// no new write can acquire the gate because _disposeState is set, so
|
||||
// the next ThrowIf check in each writer raises ObjectDisposedException.
|
||||
try
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
_connection.Dispose();
|
||||
await _writeGate.WaitAsync().ConfigureAwait(false);
|
||||
}
|
||||
catch (ObjectDisposedException)
|
||||
{
|
||||
// Race with another disposer that already disposed the gate — the
|
||||
// _disposeState exchange above should prevent this, but be defensive.
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
_writeConnection.Dispose();
|
||||
}
|
||||
finally
|
||||
{
|
||||
_gate.Release();
|
||||
_gate.Dispose();
|
||||
try { _writeGate.Release(); } catch (ObjectDisposedException) { }
|
||||
_writeGate.Dispose();
|
||||
}
|
||||
|
||||
GC.SuppressFinalize(this);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user