feat(adminui): Reconnect/Restart on DriverStatusPanel (DriverOperator-gated)

- RestartDriver / ReconnectDriver messages + AdminOperationsActor
  handlers (broadcast via driver-control DPS topic; audited via
  ConfigEdits).
- DriverHostActor subscribes to driver-control; locates the
  matching child DriverInstanceActor and stops+respawns it
  (Restart) or sends it a ForceReconnect internal message
  (Reconnect — re-enters Reconnecting state without full stop).
  DriverInstanceSpec constructor call uses named args to handle
  the full 6-parameter signature.
- New DriverOperator authorization policy mapped to DriverOperator
  or FleetAdmin role; documented in docs/security.md. Map LDAP
  group via GroupToRole (e.g. "ot-driver-operator": "DriverOperator").
- DriverStatusPanel renders Reconnect + Restart buttons when the
  user holds the DriverOperator policy (hidden otherwise). Restart
  requires an in-page Razor confirm block (no JS confirm, keeps
  SignalR event loop unblocked). Both buttons show a spinner and
  are disabled while a request is in flight; the result chip auto-clears after 8s.
  Username sourced from AuthenticationStateProvider.

Reconnect resolves to "ForceReconnect" (re-enter Reconnecting,
not full stop+respawn) — transport drops and retries while actor
and in-memory state are preserved. All DriverInstanceActor states
handle ForceReconnect safely (no-op when already in transition).
This commit is contained in:
Joseph Doherty
2026-05-28 11:14:04 -04:00
parent 4b374fd177
commit ffcc8d1065
8 changed files with 333 additions and 2 deletions
+2 -1
View File
@@ -251,7 +251,8 @@ The `AdminRole` enum (`src/Core/ZB.MOM.WW.OtOpcUa.Configuration/Enums/AdminRole.
|---|---|
| `ConfigViewer` | Read-only access to drafts, generations, audit log, fleet status. |
| `ConfigEditor` | ConfigViewer plus draft editing (UNS, equipment, tags, ACLs, driver instances, reservations, CSV imports). Cannot publish. |
| `FleetAdmin` | ConfigEditor plus publish, cluster/node CRUD, credential management, role-grant management. |
| `FleetAdmin` | ConfigEditor plus publish, cluster/node CRUD, credential management, role-grant management. Also satisfies the `DriverOperator` authorization policy. |
| `DriverOperator` | May issue **Reconnect** and **Restart** commands against live driver instances from the Admin UI `DriverStatusPanel`. Gated by the `DriverOperator` named policy in `AddAuthorization` (`src/Server/ZB.MOM.WW.OtOpcUa.Security/ServiceCollectionExtensions.cs`). Map an LDAP group via `GroupToRole`, e.g. `"ot-driver-operator": "DriverOperator"`. |
In v2 the authentication + authorization stack is wired centrally by `AddOtOpcUaAuth` (`src/Server/ZB.MOM.WW.OtOpcUa.Security/ServiceCollectionExtensions.cs`) and Razor pages gate inline with the role names, e.g. `@attribute [Authorize(Roles = "FleetAdmin,ConfigEditor")]` on `Deployments.razor`. Nav-menu sections hide via `<AuthorizeView>`.
@@ -0,0 +1,25 @@
namespace ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin;
/// <summary>
/// AdminUI → AdminOperationsActor: reconnect the driver actor's transport without
/// respawning the actor itself. Sends the actor back through its Reconnecting state —
/// fast, preserves in-memory state. The driver actor's supervisor performs the work.
/// </summary>
/// <remarks>
/// AdminOperationsActor re-publishes this same message on the <c>driver-control</c>
/// pub/sub topic; DriverHostActor instances subscribed to that topic look up
/// <paramref name="DriverInstanceId"/> among their children and ignore the message
/// when they do not host it.
/// </remarks>
/// <param name="ClusterId">
/// Cluster scope identifier (for audit). NOTE(review): the AdminOperationsActor handler
/// does not currently copy this value into the ConfigEdit audit row — confirm whether it should.
/// </param>
/// <param name="DriverInstanceId">The driver instance to reconnect.</param>
/// <param name="ActorByUserName">
/// The authenticated admin user who triggered the reconnect; recorded as the audit row's
/// <c>EditedBy</c> and written to the log.
/// </param>
/// <param name="CorrelationId">Round-trip correlation token; echoed back in <see cref="ReconnectDriverResult"/>.</param>
public sealed record ReconnectDriver(
string ClusterId,
string DriverInstanceId,
string ActorByUserName,
Guid CorrelationId);
/// <summary>
/// Reply for <see cref="ReconnectDriver"/>. Reports whether AdminOperationsActor managed to
/// dispatch the command; it does not confirm that any driver host acted on it (hosts that
/// do not own the instance ignore the broadcast without replying).
/// </summary>
/// <param name="Ok">
/// True once the command has been published and the audit row saved without an exception;
/// false if any step of the handler threw.
/// </param>
/// <param name="Message">Failure reason (the exception message); null on success.</param>
/// <param name="CorrelationId">Echoes the request's correlation token.</param>
public sealed record ReconnectDriverResult(
bool Ok,
string? Message,
Guid CorrelationId);
@@ -0,0 +1,25 @@
namespace ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin;
/// <summary>
/// AdminUI → AdminOperationsActor: restart the driver actor for one instance.
/// A restart fully stops and respawns the actor — loses in-memory state, may briefly
/// interrupt active subscriptions. The driver actor's supervisor performs the work.
/// </summary>
/// <remarks>
/// AdminOperationsActor re-publishes this same message on the <c>driver-control</c>
/// pub/sub topic; DriverHostActor instances subscribed to that topic look up
/// <paramref name="DriverInstanceId"/> among their children and ignore the message
/// when they do not host it.
/// </remarks>
/// <param name="ClusterId">
/// Cluster scope identifier (for audit). NOTE(review): the AdminOperationsActor handler
/// does not currently copy this value into the ConfigEdit audit row — confirm whether it should.
/// </param>
/// <param name="DriverInstanceId">The driver instance to restart.</param>
/// <param name="ActorByUserName">
/// The authenticated admin user who triggered the restart; recorded as the audit row's
/// <c>EditedBy</c> and written to the log.
/// </param>
/// <param name="CorrelationId">Round-trip correlation token; echoed back in <see cref="RestartDriverResult"/>.</param>
public sealed record RestartDriver(
string ClusterId,
string DriverInstanceId,
string ActorByUserName,
Guid CorrelationId);
/// <summary>
/// Reply for <see cref="RestartDriver"/>. Reports whether AdminOperationsActor managed to
/// dispatch the command; it does not confirm that any driver host acted on it (hosts that
/// do not own the instance ignore the broadcast without replying).
/// </summary>
/// <param name="Ok">
/// True once the command has been published and the audit row saved without an exception;
/// false if any step of the handler threw.
/// </param>
/// <param name="Message">Failure reason (the exception message); null on success.</param>
/// <param name="CorrelationId">Echoes the request's correlation token.</param>
public sealed record RestartDriverResult(
bool Ok,
string? Message,
Guid CorrelationId);
@@ -1,11 +1,17 @@
@* Live driver-status panel — subscribes to /hubs/driverstatus and shows state chip,
last-success age, 5-min error count, and last error message.
Enabled=false renders a static "Disabled" notice and never opens the hub.
Reconnect/Restart buttons are Phase 8 (Task 8.3). *@
DriverOperator-gated Reconnect/Restart buttons appear for authorised users. *@
@implements IAsyncDisposable
@using Microsoft.AspNetCore.Authorization
@using Microsoft.AspNetCore.SignalR.Client
@using ZB.MOM.WW.OtOpcUa.Commons.Interfaces
@using ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin
@using ZB.MOM.WW.OtOpcUa.Commons.Messages.Drivers
@inject NavigationManager Nav
@inject AuthenticationStateProvider AuthState
@inject IAuthorizationService AuthorizationService
@inject IAdminOperationsClient AdminOps
<section class="panel rise mt-3" style="animation-delay:.04s; @(_stale ? "opacity:0.5;" : "")">
<div class="panel-head d-flex align-items-center gap-2">
@@ -66,11 +72,71 @@
</details>
}
}
@* --- Reconnect / Restart action buttons (DriverOperator-gated) --- *@
@if (_canOperate && Enabled)
{
<div class="d-flex gap-2 align-items-center mt-3">
<button type="button"
class="btn btn-sm btn-outline-secondary"
disabled="@_busyReconnect"
@onclick="ReconnectAsync"
title="Re-establish driver transport without restarting the actor">
@if (_busyReconnect)
{
<span class="spinner-border spinner-border-sm me-1"></span>
<span>Reconnecting&hellip;</span>
}
else
{
<span>Reconnect</span>
}
</button>
<button type="button"
class="btn btn-sm btn-outline-danger"
disabled="@_busyRestart"
@onclick="() => _showRestartConfirm = true"
title="Stop and respawn the driver actor — briefly interrupts active subscriptions">
@if (_busyRestart)
{
<span class="spinner-border spinner-border-sm me-1"></span>
<span>Restarting&hellip;</span>
}
else
{
<span>Restart</span>
}
</button>
@if (_opResultMessage is not null)
{
<span class="chip @(_opResultOk ? "chip-ok" : "chip-bad")" style="font-size:0.8rem">@_opResultMessage</span>
}
</div>
@* Inline confirm dialog for Restart (no JS confirm — keeps SignalR event loop unblocked) *@
@if (_showRestartConfirm)
{
<div class="mt-2 p-2 border rounded" style="background:var(--surface-raised,#fff); border-color:var(--bad,#dc3545)!important; max-width:420px; font-size:0.9rem">
<p class="mb-2" style="color:var(--ink)">
Restart driver <code>@DriverInstanceId</code>?<br />
<span style="color:var(--ink-soft)">This briefly interrupts active subscriptions and clears in-memory state.</span>
</p>
<div class="d-flex gap-2">
<button type="button" class="btn btn-sm btn-danger" @onclick="RestartConfirmedAsync">Confirm restart</button>
<button type="button" class="btn btn-sm btn-outline-secondary" @onclick="() => _showRestartConfirm = false">Cancel</button>
</div>
</div>
}
}
</div>
</section>
@code {
[Parameter, EditorRequired] public string DriverInstanceId { get; set; } = "";
/// <summary>Cluster identifier forwarded in Reconnect/Restart messages for audit.</summary>
[Parameter] public string ClusterId { get; set; } = "";
[Parameter] public bool Enabled { get; set; } = true;
private HubConnection? _hub;
@@ -81,8 +147,26 @@
private string? _error;
private System.Threading.Timer? _timer;
// Authorization
private bool _canOperate;
private string? _currentUserName;
// Action state
private bool _busyReconnect;
private bool _busyRestart;
private bool _showRestartConfirm;
private string? _opResultMessage;
private bool _opResultOk;
private System.Timers.Timer? _opResultClearTimer;
protected override async Task OnInitializedAsync()
{
// Check DriverOperator authorization so buttons only render for permitted users.
var auth = await AuthState.GetAuthenticationStateAsync();
_currentUserName = auth.User.Identity?.Name ?? auth.User.FindFirst(System.Security.Claims.ClaimTypes.NameIdentifier)?.Value ?? "unknown";
var authResult = await AuthorizationService.AuthorizeAsync(auth.User, null, "DriverOperator");
_canOperate = authResult.Succeeded;
if (!Enabled)
return;
@@ -122,9 +206,72 @@
}
}
/// <summary>
/// Click handler for the Reconnect button. Sends a <see cref="ReconnectDriver"/> request for
/// this panel's driver instance through the admin-operations client (15 s timeout) and shows
/// the outcome in the result chip. Failures are reported in the chip, never thrown.
/// </summary>
private async Task ReconnectAsync()
{
    // Defence in depth: the button is only rendered for authorised users, but never rely on
    // markup alone. Also drop a second click that arrives before the disabled state renders.
    if (!_canOperate || _busyReconnect)
        return;

    _busyReconnect = true;
    _opResultMessage = null;
    StateHasChanged();
    try
    {
        // A CancellationTokenSource created with a delay owns a timer; dispose it as soon as
        // the ask completes instead of leaving it alive until the timeout elapses.
        using var timeout = new System.Threading.CancellationTokenSource(TimeSpan.FromSeconds(15));
        var result = await AdminOps.AskAsync<ReconnectDriverResult>(
            new ReconnectDriver(ClusterId, DriverInstanceId, _currentUserName ?? "unknown", Guid.NewGuid()),
            timeout.Token);
        ShowOpResult(result.Ok, result.Ok ? "Reconnect dispatched" : (result.Message ?? "Failed"));
    }
    catch (Exception ex)
    {
        // Keep the chip compact: truncate long exception messages to 60 characters.
        ShowOpResult(false, ex.Message.Length > 60 ? ex.Message[..60] + "…" : ex.Message);
    }
    finally
    {
        _busyReconnect = false;
        StateHasChanged();
    }
}
/// <summary>
/// Click handler for the "Confirm restart" button. Hides the inline confirm block, sends a
/// <see cref="RestartDriver"/> request for this panel's driver instance through the
/// admin-operations client (15 s timeout) and shows the outcome in the result chip.
/// Failures are reported in the chip, never thrown.
/// </summary>
private async Task RestartConfirmedAsync()
{
    _showRestartConfirm = false;

    // Defence in depth: the button is only rendered for authorised users, but never rely on
    // markup alone. Also drop a second confirm that arrives while a restart is in flight.
    if (!_canOperate || _busyRestart)
        return;

    _busyRestart = true;
    _opResultMessage = null;
    StateHasChanged();
    try
    {
        // A CancellationTokenSource created with a delay owns a timer; dispose it as soon as
        // the ask completes instead of leaving it alive until the timeout elapses.
        using var timeout = new System.Threading.CancellationTokenSource(TimeSpan.FromSeconds(15));
        var result = await AdminOps.AskAsync<RestartDriverResult>(
            new RestartDriver(ClusterId, DriverInstanceId, _currentUserName ?? "unknown", Guid.NewGuid()),
            timeout.Token);
        ShowOpResult(result.Ok, result.Ok ? "Restart dispatched" : (result.Message ?? "Failed"));
    }
    catch (Exception ex)
    {
        // Keep the chip compact: truncate long exception messages to 60 characters.
        ShowOpResult(false, ex.Message.Length > 60 ? ex.Message[..60] + "…" : ex.Message);
    }
    finally
    {
        _busyRestart = false;
        StateHasChanged();
    }
}
/// <summary>
/// Records the outcome of a Reconnect/Restart request for display in the result chip and
/// schedules the chip to clear itself after 8 seconds.
/// </summary>
/// <param name="ok">Whether the operation succeeded (selects the chip colour).</param>
/// <param name="message">Text shown in the chip.</param>
private void ShowOpResult(bool ok, string message)
{
    _opResultOk = ok;
    _opResultMessage = message;

    // Auto-clear the result chip after 8 s. Any previously scheduled clear is cancelled.
    _opResultClearTimer?.Dispose();
    var clearTimer = new System.Timers.Timer(8_000) { AutoReset = false };
    clearTimer.Elapsed += async (_, _) =>
    {
        // Elapsed fires on a thread-pool thread and this lambda is effectively async void,
        // so an escaping exception would be unhandled. Do all component-state work on the
        // renderer's context and tolerate the component having been torn down meanwhile.
        try
        {
            await InvokeAsync(() =>
            {
                // A callback from a superseded timer may already be in flight when it is
                // disposed; it must not wipe the message that replaced it.
                if (!ReferenceEquals(_opResultClearTimer, clearTimer))
                    return;

                _opResultMessage = null;
                StateHasChanged();
            });
        }
        catch (ObjectDisposedException)
        {
            // Component/circuit disposed before the timer fired — nothing left to update.
        }
    };
    _opResultClearTimer = clearTimer;
    clearTimer.Start();
}
/// <summary>
/// Releases the panel's timers and, when a hub connection was opened, disposes it.
/// </summary>
public async ValueTask DisposeAsync()
{
    _timer?.Dispose();
    _opResultClearTimer?.Dispose();

    // No hub was ever created (e.g. the panel was rendered disabled) — nothing more to do.
    if (_hub is null)
    {
        return;
    }

    await _hub.DisposeAsync();
}
@@ -1,4 +1,5 @@
using Akka.Actor;
using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin;
@@ -49,6 +50,8 @@ public sealed class AdminOperationsActor : ReceiveActor
ReceiveAsync<StartDeployment>(HandleStartDeploymentAsync);
ReceiveAsync<TestDriverConnect>(HandleTestDriverConnectAsync);
ReceiveAsync<RestartDriver>(HandleRestartDriverAsync);
ReceiveAsync<ReconnectDriver>(HandleReconnectDriverAsync);
}
private async Task HandleStartDeploymentAsync(StartDeployment msg)
@@ -167,4 +170,65 @@ public sealed class AdminOperationsActor : ReceiveActor
msg.CorrelationId));
}
}
/// <summary>
/// Handles <see cref="RestartDriver"/>: publishes the request on the <c>driver-control</c>
/// pub/sub topic, writes a ConfigEdit audit row, and replies to the sender with a
/// <see cref="RestartDriverResult"/>.
/// </summary>
/// <param name="msg">The restart request from the Admin UI.</param>
private async Task HandleRestartDriverAsync(RestartDriver msg)
{
    // Pin the reply target up front, before any await.
    var requester = Sender;
    try
    {
        // Broadcast to every subscriber of the driver-control topic. Only the host that owns
        // the instance will act; others ignore it (id not found in their child table).
        // NOTE(review): the broadcast goes out before the audit row is saved, so a database
        // failure below produces an Ok=false reply even though the restart was already published.
        var mediator = DistributedPubSub.Get(Context.System).Mediator;
        mediator.Tell(new Publish("driver-control", msg));

        await using var db = await _dbFactory.CreateDbContextAsync();

        // Non-GUID instance ids are audited under Guid.Empty; the raw id is kept in FieldsJson.
        var entityId = Guid.TryParse(msg.DriverInstanceId, out var parsedId) ? parsedId : Guid.Empty;
        var auditRow = new ConfigEdit
        {
            EntityType = "DriverInstance",
            EntityId = entityId,
            FieldsJson = $"{{\"op\":\"restart\",\"driverInstanceId\":{System.Text.Json.JsonSerializer.Serialize(msg.DriverInstanceId)}}}",
            EditedBy = msg.ActorByUserName,
            SourceNode = Akka.Cluster.Cluster.Get(Context.System).SelfAddress.Host ?? "unknown",
        };
        db.ConfigEdits.Add(auditRow);
        await db.SaveChangesAsync();

        _log.Info("AdminOps: RestartDriver dispatched for {DriverInstanceId} by {User}",
            msg.DriverInstanceId, msg.ActorByUserName);
        requester.Tell(new RestartDriverResult(true, null, msg.CorrelationId));
    }
    catch (Exception failure)
    {
        _log.Error(failure, "AdminOps: RestartDriver failed for {DriverInstanceId}", msg.DriverInstanceId);
        requester.Tell(new RestartDriverResult(false, failure.Message, msg.CorrelationId));
    }
}
/// <summary>
/// Handles <see cref="ReconnectDriver"/>: publishes the request on the <c>driver-control</c>
/// pub/sub topic, writes a ConfigEdit audit row, and replies to the sender with a
/// <see cref="ReconnectDriverResult"/>.
/// </summary>
/// <param name="msg">The reconnect request from the Admin UI.</param>
private async Task HandleReconnectDriverAsync(ReconnectDriver msg)
{
    // Pin the reply target up front, before any await.
    var requester = Sender;
    try
    {
        // Broadcast to every subscriber of the driver-control topic; only the host owning
        // the instance reacts.
        // NOTE(review): the broadcast goes out before the audit row is saved, so a database
        // failure below produces an Ok=false reply even though the reconnect was already published.
        var mediator = DistributedPubSub.Get(Context.System).Mediator;
        mediator.Tell(new Publish("driver-control", msg));

        await using var db = await _dbFactory.CreateDbContextAsync();

        // Non-GUID instance ids are audited under Guid.Empty; the raw id is kept in FieldsJson.
        var entityId = Guid.TryParse(msg.DriverInstanceId, out var parsedId) ? parsedId : Guid.Empty;
        var auditRow = new ConfigEdit
        {
            EntityType = "DriverInstance",
            EntityId = entityId,
            FieldsJson = $"{{\"op\":\"reconnect\",\"driverInstanceId\":{System.Text.Json.JsonSerializer.Serialize(msg.DriverInstanceId)}}}",
            EditedBy = msg.ActorByUserName,
            SourceNode = Akka.Cluster.Cluster.Get(Context.System).SelfAddress.Host ?? "unknown",
        };
        db.ConfigEdits.Add(auditRow);
        await db.SaveChangesAsync();

        _log.Info("AdminOps: ReconnectDriver dispatched for {DriverInstanceId} by {User}",
            msg.DriverInstanceId, msg.ActorByUserName);
        requester.Tell(new ReconnectDriverResult(true, null, msg.CorrelationId));
    }
    catch (Exception failure)
    {
        _log.Error(failure, "AdminOps: ReconnectDriver failed for {DriverInstanceId}", msg.DriverInstanceId);
        requester.Tell(new ReconnectDriverResult(false, failure.Message, msg.CorrelationId));
    }
}
}
@@ -4,6 +4,7 @@ using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Commons.Interfaces;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Admin;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Deploy;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Fleet;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
@@ -36,6 +37,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
{
public const string DeploymentsTopic = "deployments";
public const string DeploymentAcksTopic = "deployment-acks";
public const string DriverControlTopic = "driver-control";
public static readonly TimeSpan ReconnectInterval = TimeSpan.FromSeconds(30);
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _dbFactory;
@@ -123,6 +125,8 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
{
// Subscribe to deployments topic so the coordinator's broadcast lands here.
DistributedPubSub.Get(Context.System).Mediator.Tell(new Subscribe(DeploymentsTopic, Self));
// Subscribe to driver-control topic so AdminUI Reconnect/Restart commands land here.
DistributedPubSub.Get(Context.System).Mediator.Tell(new Subscribe(DriverControlTopic, Self));
Bootstrap();
}
@@ -187,6 +191,8 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
Receive<DispatchDeployment>(HandleDispatchFromSteady);
Receive<GetDiagnostics>(HandleGetDiagnostics);
Receive<DriverInstanceActor.AttributeValuePublished>(ForwardToMux);
Receive<RestartDriver>(HandleRestartDriver);
Receive<ReconnectDriver>(HandleReconnectDriver);
Receive<SubscribeAck>(_ => { /* PubSub ack */ });
}
@@ -206,6 +212,8 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
});
Receive<GetDiagnostics>(HandleGetDiagnostics);
Receive<DriverInstanceActor.AttributeValuePublished>(ForwardToMux);
Receive<RestartDriver>(HandleRestartDriver);
Receive<ReconnectDriver>(HandleReconnectDriver);
Receive<SubscribeAck>(_ => { /* PubSub ack */ });
}
@@ -225,6 +233,8 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
});
Receive<GetDiagnostics>(HandleGetDiagnostics);
Receive<RetryConfigDbConnection>(_ => TryRecoverFromStale());
Receive<RestartDriver>(HandleRestartDriver);
Receive<ReconnectDriver>(HandleReconnectDriver);
Receive<SubscribeAck>(_ => { /* PubSub ack */ });
Timers.StartPeriodicTimer("retry-db", RetryConfigDbConnection.Instance, ReconnectInterval);
}
@@ -444,6 +454,42 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken) => Task.CompletedTask;
}
/// <summary>
/// Handles a <see cref="RestartDriver"/> broadcast: stops the child actor registered under
/// the requested driver instance id, removes it from the child table, and spawns a
/// replacement. Silently ignored when this host has no child with that id.
/// </summary>
/// <param name="msg">The restart request delivered via the driver-control topic subscription.</param>
private void HandleRestartDriver(RestartDriver msg)
{
// DPS broadcast — only act if this node hosts the requested instance.
if (!_children.TryGetValue(msg.DriverInstanceId, out var entry))
return;
_log.Info("DriverHost {Node}: restarting driver {Id} by request of {User}",
_localNode, msg.DriverInstanceId, msg.ActorByUserName);
// Stop the existing child actor — DriverInstanceActor.PostStop calls ShutdownAsync.
// NOTE(review): Context.Stop only requests termination; the child finishes stopping
// asynchronously. If SpawnChild names the new actor after the instance id, the name may
// still be reserved when the respawn below runs — confirm, or respawn on Terminated.
Context.Stop(entry.Actor);
_children.Remove(msg.DriverInstanceId);
// Respawn from a spec rebuilt out of the removed entry: only DriverType and LastConfigJson
// are carried over. The row id is Guid.Empty, Name falls back to the instance id, and
// Enabled is forced to true.
// NOTE(review): confirm nothing downstream depends on the original row id or display name.
SpawnChild(new DriverInstanceSpec(
DriverInstanceRowId: Guid.Empty,
DriverInstanceId: msg.DriverInstanceId,
Name: msg.DriverInstanceId,
DriverType: entry.DriverType,
Enabled: true,
DriverConfig: entry.LastConfigJson));
}
/// <summary>
/// Handles a <see cref="ReconnectDriver"/> broadcast by forwarding a
/// <see cref="DriverInstanceActor.ForceReconnect"/> to the child actor registered under the
/// requested driver instance id. Does nothing when this host has no such child.
/// </summary>
/// <param name="msg">The reconnect request delivered via the driver-control topic subscription.</param>
private void HandleReconnectDriver(ReconnectDriver msg)
{
    // The request is broadcast to every host; react only when the instance lives here.
    if (_children.TryGetValue(msg.DriverInstanceId, out var child))
    {
        _log.Info("DriverHost {Node}: reconnecting driver {Id} by request of {User}",
            _localNode, msg.DriverInstanceId, msg.ActorByUserName);

        // The child keeps running; it is only asked to drop and re-establish its transport.
        child.Actor.Tell(new DriverInstanceActor.ForceReconnect());
    }
}
private void TryRecoverFromStale()
{
try
@@ -44,6 +44,13 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
public sealed record SubscriptionEstablished(string DiagnosticId, int ReferenceCount);
public sealed record SubscriptionFailed(string Reason);
public sealed record Unsubscribe;
/// <summary>
/// Sent by <see cref="DriverHostActor"/> when the AdminUI issues a Reconnect operation.
/// Pushes the actor out of <c>Connected</c> into <c>Reconnecting</c> so the transport is
/// re-established without fully stopping and respawning the actor. Safe to send in any
/// state — a no-op when already Reconnecting or Connecting, and for stubbed drivers.
/// </summary>
/// <remarks>
/// In the state that acts on it, the handler detaches the current subscription, switches to
/// <c>Reconnecting</c>, and publishes a health snapshot. No reply is sent to the sender.
/// </remarks>
public sealed record ForceReconnect;
/// <summary>Published to the actor's parent whenever the subscribed IDriver fires
/// <see cref="ISubscribable.OnDataChange"/>. The parent forwards to OpcUaPublishActor.</summary>
public sealed record AttributeValuePublished(string FullReference, object? Value, OpcUaQuality Quality, DateTime TimestampUtc);
@@ -178,6 +185,7 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
Receive<ApplyDelta>(msg => Sender.Tell(new ApplyResult(true, "stubbed", msg.Correlation)));
Receive<WriteAttribute>(_ => Sender.Tell(new WriteAttributeResult(true, "stubbed")));
Receive<DisconnectObserved>(_ => { /* stubbed drivers don't disconnect */ });
Receive<ForceReconnect>(_ => { /* stubbed drivers don't reconnect */ });
Receive<HealthPollTick>(_ => PublishHealthSnapshot());
}
@@ -197,6 +205,7 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
Become(Reconnecting);
PublishHealthSnapshot();
});
Receive<ForceReconnect>(_ => { /* already connecting — no-op */ });
Receive<HealthPollTick>(_ => PublishHealthSnapshot());
}
@@ -212,6 +221,13 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
Become(Reconnecting);
PublishHealthSnapshot();
});
Receive<ForceReconnect>(_ =>
{
_log.Info("DriverInstance {Id}: ForceReconnect requested by admin; re-entering Reconnecting", _driverInstanceId);
DetachSubscription();
Become(Reconnecting);
PublishHealthSnapshot();
});
ReceiveAsync<WriteAttribute>(HandleWriteAsync);
ReceiveAsync<Subscribe>(HandleSubscribeAsync);
ReceiveAsync<Unsubscribe>(_ => UnsubscribeAsync());
@@ -230,6 +246,7 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
PublishHealthSnapshot();
});
Receive<InitializeFailed>(_ => { /* keep retrying via timer */ });
Receive<ForceReconnect>(_ => { /* already reconnecting — no-op */ });
Receive<HealthPollTick>(_ => PublishHealthSnapshot());
Timers.StartPeriodicTimer("retry-connect", RetryConnect.Instance, _reconnectInterval);
}
@@ -89,6 +89,12 @@ public static class ServiceCollectionExtensions
JwtBearerDefaults.AuthenticationScheme)
.RequireAuthenticatedUser()
.Build();
// DriverOperator: may issue Reconnect/Restart commands against live driver instances
// from the Admin UI DriverStatusPanel. Map LDAP group → role via GroupToRole in
// appsettings (e.g. "ot-driver-operator": "DriverOperator").
o.AddPolicy("DriverOperator", policy =>
policy.RequireRole("DriverOperator", "FleetAdmin"));
});
return services;