feat(controlplane): ConfigPublishCoordinator deadline timeout + failover PreStart recovery

This commit is contained in:
Joseph Doherty
2026-05-26 04:57:05 -04:00
parent bad2aef137
commit f193872891
2 changed files with 205 additions and 4 deletions

View File

@@ -21,25 +21,78 @@ namespace ZB.MOM.WW.OtOpcUa.ControlPlane.Coordinators;
/// Discovery of the "expected ACK set" comes from <c>Akka.Cluster.State.Members</c> filtered by
/// the <c>driver</c> role — the DB does not own per-node role assignment.
/// </summary>
public sealed class ConfigPublishCoordinator : ReceiveActor
public sealed class ConfigPublishCoordinator : ReceiveActor, IWithTimers
{
/// <summary>Distributed pub-sub topic that dispatch messages are published on.</summary>
public const string DeploymentsTopic = "deployments";
/// <summary>Apply deadline used when the caller of <c>Props</c> does not supply one.</summary>
public static readonly TimeSpan DefaultApplyDeadline = TimeSpan.FromMinutes(2);
// Factory for short-lived DbContext instances; each handler creates and disposes its own.
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbFactory;
// How long a deployment may wait for ACKs before the deadline timer marks it TimedOut.
private readonly TimeSpan _applyDeadline;
private readonly ILoggingAdapter _log = Context.GetLogger();
// ACK outcomes received so far for the current deployment, keyed by node.
private readonly Dictionary<NodeId, ApplyAckOutcome> _acks = new();
// The deployment currently being tracked; null when the coordinator is idle.
private DeploymentId? _current;
// Nodes whose ACK is expected for the current deployment.
private HashSet<NodeId> _expectedAcks = new();
public static Props Props(IDbContextFactory<OtOpcUaConfigDbContext> dbFactory) =>
Akka.Actor.Props.Create(() => new ConfigPublishCoordinator(dbFactory));
/// <summary>
/// Timer scheduler required by <c>IWithTimers</c>; used for the single apply-deadline timer.
/// Initialised to <c>null!</c> here — presumably assigned by the actor runtime before
/// <c>PreStart</c> runs (confirm against the Akka.NET <c>IWithTimers</c> contract).
/// </summary>
public ITimerScheduler Timers { get; set; } = null!;
public ConfigPublishCoordinator(IDbContextFactory<OtOpcUaConfigDbContext> dbFactory)
/// <summary>
/// Builds the <c>Props</c> used to spawn the coordinator.
/// </summary>
/// <param name="dbFactory">Factory the actor uses to open config-database contexts.</param>
/// <param name="applyDeadline">
/// Maximum time to wait for ACKs; falls back to <see cref="DefaultApplyDeadline"/> when null.
/// </param>
public static Props Props(
    IDbContextFactory<OtOpcUaConfigDbContext> dbFactory,
    TimeSpan? applyDeadline = null)
{
    // Resolve the optional deadline up front so the actor always receives a concrete value.
    var effectiveDeadline = applyDeadline ?? DefaultApplyDeadline;
    return Akka.Actor.Props.Create(() => new ConfigPublishCoordinator(dbFactory, effectiveDeadline));
}
/// <summary>
/// Creates the coordinator and registers its message handlers.
/// </summary>
/// <param name="dbFactory">Factory used to open a config-database context per operation.</param>
/// <param name="applyDeadline">How long a deployment may wait for ACKs before timing out.</param>
public ConfigPublishCoordinator(
    IDbContextFactory<OtOpcUaConfigDbContext> dbFactory,
    TimeSpan applyDeadline)
{
    (_dbFactory, _applyDeadline) = (dbFactory, applyDeadline);

    // One handler per message type: dispatch request, per-node ACK, deadline expiry.
    Receive<DispatchDeployment>(HandleDispatch);
    Receive<ApplyAck>(HandleAck);
    Receive<DeadlineElapsed>(HandleDeadline);
}
/// <summary>
/// On startup, recovers the most recent deployment that was mid-flight when a prior
/// singleton instance died. <c>_expectedAcks</c> is re-derived from the deployment's
/// <c>NodeDeploymentState</c> rows, ACKs that already landed in the DB are replayed into
/// <c>_acks</c>, and the deadline timer is resumed with whatever time is left.
/// </summary>
/// <remarks>
/// Only the newest deployment in <c>Dispatching</c> or <c>AwaitingApplyAcks</c> is recovered.
/// Any node row whose status is neither <c>Applying</c> nor <c>Applied</c> is replayed as a
/// failed ACK.
/// </remarks>
protected override void PreStart()
{
    using var db = _dbFactory.CreateDbContext();

    // Read-only lookup: the entity is never modified here, so skip change tracking.
    var inflight = db.Deployments
        .Where(d => d.Status == DeploymentStatus.Dispatching || d.Status == DeploymentStatus.AwaitingApplyAcks)
        .AsNoTracking()
        .OrderByDescending(d => d.CreatedAtUtc)
        .FirstOrDefault();
    if (inflight is null)
    {
        return;
    }

    var deploymentId = new DeploymentId(inflight.DeploymentId);
    _current = deploymentId;

    var nodeStates = db.NodeDeploymentStates
        .Where(x => x.DeploymentId == inflight.DeploymentId)
        .AsNoTracking()
        .ToList();

    // Parse each node id exactly once and feed both the expected set and the replayed ACKs.
    var expected = new HashSet<NodeId>();
    foreach (var state in nodeStates)
    {
        var nodeId = NodeId.Parse(state.NodeId);
        expected.Add(nodeId);

        if (state.Status == NodeDeploymentStatus.Applying)
        {
            continue; // still pending — no ACK has landed for this node yet
        }

        _acks[nodeId] = state.Status == NodeDeploymentStatus.Applied
            ? ApplyAckOutcome.Applied
            : ApplyAckOutcome.Failed;
    }

    _expectedAcks = expected;

    // Resume the deadline timer using the remaining time. The deadline runs from when the
    // deployment was first marked AwaitingApplyAcks (Deployment.CreatedAtUtc is a close enough
    // proxy — we don't track a separate "dispatched at" column).
    var elapsed = DateTime.UtcNow - inflight.CreatedAtUtc;
    var remaining = _applyDeadline - elapsed;

    // Clock skew between the node that created the row and this one can make `elapsed`
    // negative; never wait longer than the configured deadline.
    if (remaining > _applyDeadline)
    {
        remaining = _applyDeadline;
    }

    if (remaining <= TimeSpan.Zero)
    {
        // Deadline already passed while no coordinator was running: time out immediately.
        Self.Tell(new DeadlineElapsed(deploymentId));
    }
    else
    {
        Timers.StartSingleTimer(DeadlineTimerKey, new DeadlineElapsed(deploymentId), remaining);
    }

    _log.Info("Coordinator recovered in-flight deployment {Id} ({Acked}/{Total} acks landed)",
        deploymentId, _acks.Count, _expectedAcks.Count);
}
private void HandleDispatch(DispatchDeployment msg)
@@ -65,6 +118,7 @@ public sealed class ConfigPublishCoordinator : ReceiveActor
}
DistributedPubSub.Get(Context.System).Mediator.Tell(new Publish(DeploymentsTopic, msg));
Timers.StartSingleTimer(DeadlineTimerKey, new DeadlineElapsed(msg.DeploymentId), _applyDeadline);
if (_expectedAcks.Count == 0)
{
@@ -132,13 +186,40 @@ public sealed class ConfigPublishCoordinator : ReceiveActor
ResetForNext();
}
/// <summary>
/// Handles expiry of the apply-deadline timer: marks the current deployment as
/// <c>TimedOut</c> in the database, logs a warning and resets the coordinator.
/// </summary>
/// <param name="msg">The timer message; ignored unless it names the current deployment.</param>
private void HandleDeadline(DeadlineElapsed msg)
{
    // A deadline for anything other than the deployment being tracked is stale.
    if (_current is not { } active || msg.DeploymentId != active)
    {
        _log.Debug("Discarding stale DeadlineElapsed for {Id} (current={Current})",
            msg.DeploymentId, _current);
        return;
    }

    var acked = _acks.Count;
    var expected = _expectedAcks.Count;
    if (acked == expected)
    {
        // Race: every node acked just as the deadline fired. Already sealed/failed elsewhere.
        // NOTE(review): this branch returns with _current still set and without calling
        // ResetForNext(). It looks reachable after PreStart recovery replays a complete
        // (or empty) ACK set — confirm something else finalises the deployment in that case.
        return;
    }

    using var db = _dbFactory.CreateDbContext();
    UpdateDeploymentStatus(db, active, DeploymentStatus.TimedOut);
    db.SaveChanges();

    _log.Warning("Deployment {Id} timed out after {Deadline} ({Acked}/{Total} acks landed)",
        active, _applyDeadline, acked, expected);

    ResetForNext();
}
/// <summary>
/// Returns the coordinator to its idle state: forgets the current deployment, drops all
/// expected/received ACK bookkeeping and cancels any pending deadline timer.
/// </summary>
private void ResetForNext()
{
    // Cancel first so a pending deadline cannot fire for the deployment being cleared.
    Timers.Cancel(DeadlineTimerKey);

    _acks.Clear();
    _expectedAcks.Clear();
    _current = null;
}
// Key under which the single apply-deadline timer is started and cancelled.
private const string DeadlineTimerKey = "apply-deadline";
/// <summary>
/// Self-message delivered when the apply deadline for <paramref name="DeploymentId"/> expires.
/// </summary>
public sealed record DeadlineElapsed(DeploymentId DeploymentId);
private static void UpdateDeploymentStatus(
OtOpcUaConfigDbContext db, DeploymentId id, DeploymentStatus status, bool sealNow = false)
{