1388 lines
63 KiB
C#
1388 lines
63 KiB
C#
using Akka.Actor;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Logging;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Artifacts;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.DebugView;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.InboundApi;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Instance;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Lifecycle;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.ScriptExecution;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
|
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
|
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
|
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
|
|
|
/// <summary>
|
|
/// Site-side Deployment Manager — runs as a cluster singleton within the site cluster.
|
|
/// On startup, reads all deployed configs from SQLite and creates Instance Actors
|
|
/// for enabled instances in staggered batches.
|
|
///
|
|
/// Handles: DeployInstance, DisableInstance, EnableInstance, DeleteInstance.
|
|
///
|
|
/// Supervision strategy: OneForOneStrategy that resumes an Instance Actor after a
/// runtime exception, so that a single instance failure does not cascade to siblings,
/// and stops an Instance Actor that fails to initialize.
|
|
/// </summary>
|
|
public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
|
{
|
|
// ── Collaborators assigned once by the constructor ──
private readonly SiteStorageService _storage;
private readonly ScriptCompilationService _compilationService;
private readonly SharedScriptLibrary _sharedScriptLibrary;
private readonly SiteStreamManager? _streamManager;
private readonly SiteRuntimeOptions _options;
private readonly ILogger<DeploymentManagerActor> _logger;

/// <summary>
/// SiteRuntime-015: the single logger factory kept for the lifetime of this actor so
/// that <see cref="CreateInstanceActor"/> does not have to new up (and leak) a factory
/// per instance. The constructor takes it from the explicit argument, then from the
/// service provider, and finally falls back to the null factory.
/// </summary>
private readonly ILoggerFactory _loggerFactory;

private readonly IActorRef? _dclManager;
private readonly IActorRef? _replicationActor;
private readonly ISiteHealthCollector? _healthCollector;
private readonly IServiceProvider? _serviceProvider;

/// <summary>Instance Actor refs looked up by instance unique name.</summary>
private readonly Dictionary<string, IActorRef> _instanceActors = new Dictionary<string, IActorRef>();

/// <summary>
/// SiteRuntime-003: redeployments waiting for their predecessor to stop. The key is the
/// actor ref that is terminating; the value is the buffered command and its sender, which
/// <see cref="HandleTerminated"/> applies once the <see cref="Terminated"/> signal arrives.
/// </summary>
private readonly Dictionary<IActorRef, PendingRedeploy> _pendingRedeploys = new Dictionary<IActorRef, PendingRedeploy>();

/// <summary>
/// SiteRuntime-020: instance name → terminating actor ref, a name-keyed shadow of
/// <see cref="_pendingRedeploys"/>. While a predecessor is still terminating the instance
/// is no longer in <c>_instanceActors</c>, yet its child name is still registered, so a
/// further <see cref="DeployInstanceCommand"/> must not go straight to
/// <see cref="ApplyDeployment"/> / <see cref="CreateInstanceActor"/>. This index lets
/// <see cref="HandleDeploy"/> recognise that state and replace the buffered command
/// (last-write-wins) instead. Entries are removed in <see cref="HandleTerminated"/>.
/// </summary>
private readonly Dictionary<string, IActorRef> _terminatingActorsByName = new Dictionary<string, IActorRef>();

/// <summary>
/// Number of deployed configs: seeded from the startup load, incremented for a fresh
/// (non-redeploy) deployment and decremented on delete or a failed fresh deployment.
/// </summary>
private int _totalDeployedCount;

/// <summary>Timer scheduler required by <see cref="IWithTimers"/>; used for the staggered startup batches.</summary>
public ITimerScheduler Timers { get; set; } = null!;
|
|
|
|
/// <summary>
/// Stores the injected collaborators, resolves the shared logger factory and registers
/// every message handler of the actor.
/// </summary>
/// <param name="storage">Site storage service holding the deployed configurations.</param>
/// <param name="compilationService">Script compilation service.</param>
/// <param name="sharedScriptLibrary">Shared script library.</param>
/// <param name="streamManager">Optional site stream manager.</param>
/// <param name="options">Site runtime options (startup batch size and batch delay are read here).</param>
/// <param name="logger">Logger for this actor.</param>
/// <param name="dclManager">Optional Data Connection Layer manager actor; when null no DCL connections are pushed.</param>
/// <param name="replicationActor">Optional actor that receives replication messages after successful persistence.</param>
/// <param name="healthCollector">Optional site health collector, told when this node becomes active/inactive.</param>
/// <param name="serviceProvider">Optional service provider, used to resolve the logger factory and the site event logger.</param>
/// <param name="loggerFactory">Optional logger factory; takes precedence over one resolved from <paramref name="serviceProvider"/>.</param>
public DeploymentManagerActor(
    SiteStorageService storage,
    ScriptCompilationService compilationService,
    SharedScriptLibrary sharedScriptLibrary,
    SiteStreamManager? streamManager,
    SiteRuntimeOptions options,
    ILogger<DeploymentManagerActor> logger,
    IActorRef? dclManager = null,
    IActorRef? replicationActor = null,
    ISiteHealthCollector? healthCollector = null,
    IServiceProvider? serviceProvider = null,
    ILoggerFactory? loggerFactory = null)
{
    // Required collaborators.
    _storage = storage;
    _compilationService = compilationService;
    _sharedScriptLibrary = sharedScriptLibrary;
    _streamManager = streamManager;
    _options = options;
    _logger = logger;

    // Optional collaborators.
    _dclManager = dclManager;
    _replicationActor = replicationActor;
    _healthCollector = healthCollector;
    _serviceProvider = serviceProvider;

    // SiteRuntime-015: one logger factory for all Instance Actors. Order of preference:
    // the explicitly injected factory, then one registered in the service provider, and
    // only then the no-op NullLoggerFactory — never a fresh, undisposed LoggerFactory.
    var sharedFactory = loggerFactory;
    if (sharedFactory is null
        && serviceProvider?.GetService(typeof(ILoggerFactory)) is ILoggerFactory registeredFactory)
    {
        sharedFactory = registeredFactory;
    }
    _loggerFactory = sharedFactory
        ?? Microsoft.Extensions.Logging.Abstractions.NullLoggerFactory.Instance;

    // ── Lifecycle commands ──
    Receive<DeployInstanceCommand>(HandleDeploy);
    Receive<DisableInstanceCommand>(HandleDisable);
    Receive<EnableInstanceCommand>(HandleEnable);
    Receive<DeleteInstanceCommand>(HandleDelete);

    // DeploymentManager-006: central queries the currently-applied deployment identity
    // before re-sending a deployment whose earlier record is stuck InProgress or Failed
    // after a timeout.
    Receive<DeploymentStateQueryRequest>(HandleDeploymentStateQuery);

    // WP-33: system-wide artifact deployment.
    Receive<DeployArtifactsCommand>(HandleDeployArtifacts);

    // SiteRuntime-021: DCL push for an artifact deploy, sent back to this actor from the
    // off-thread persistence task so the hash cache is only mutated on the actor thread.
    Receive<ApplyArtifactDataConnectionsToDcl>(HandleApplyArtifactDataConnectionsToDcl);

    // ── Debug View: routed to Instance Actors ──
    Receive<SubscribeDebugViewRequest>(RouteDebugViewSubscribe);
    Receive<UnsubscribeDebugViewRequest>(RouteDebugViewUnsubscribe);
    Receive<DebugSnapshotRequest>(RouteDebugSnapshot);

    // ── Inbound API Route.To() calls: routed to Instance Actors ──
    Receive<RouteToCallRequest>(RouteInboundApiCall);
    Receive<RouteToGetAttributesRequest>(RouteInboundApiGetAttributes);
    Receive<RouteToSetAttributesRequest>(RouteInboundApiSetAttributes);
    Receive<RouteToWaitForAttributeRequest>(RouteInboundApiWaitForAttribute);

    // ── Re-forwarding to the local dcl-manager ──
    // Both messages below are sent on to the dcl-manager selection with the original
    // Sender preserved, so the reply goes straight back to the requester. Tell(msg, Sender)
    // is used in place of Forward(), which ActorSelection does not offer in this Akka.NET
    // version.
    const string dclManagerPath = "/user/dcl-manager";

    // OPC UA Tag Browser.
    Receive<BrowseNodeCommand>(browse =>
        Context.ActorSelection(dclManagerPath).Tell(browse, Sender));

    // Test Bindings.
    Receive<ReadTagValuesCommand>(read =>
        Context.ActorSelection(dclManagerPath).Tell(read, Sender));

    // ── Internal startup pipeline ──
    Receive<StartupConfigsLoaded>(HandleStartupConfigsLoaded);
    Receive<SharedScriptsLoaded>(HandleSharedScriptsLoaded);
    Receive<StartNextBatch>(HandleStartNextBatch);

    // ── Internal results piped back from background tasks ──
    Receive<EnableResult>(HandleEnableResult);
    Receive<DeployPersistenceResult>(HandleDeployPersistenceResult);

    // SiteRuntime-003: a Terminated signal releases the redeployment buffered for the
    // Instance Actor that has just finished stopping.
    Receive<Terminated>(HandleTerminated);
}
|
|
|
|
/// <summary>
/// Marks this node as active on the health collector and starts the asynchronous load
/// of all deployed configs; the outcome is piped back to this actor as a
/// <see cref="StartupConfigsLoaded"/> message.
/// </summary>
protected override void PreStart()
{
    base.PreStart();
    _healthCollector?.SetActiveNode(true);
    _logger.LogInformation("DeploymentManagerActor starting — loading deployed configs from SQLite...");

    // Load all configs asynchronously and pipe the result to Self.
    _storage.GetAllDeployedConfigsAsync().ContinueWith(loadTask =>
    {
        if (loadTask.IsCompletedSuccessfully)
            return new StartupConfigsLoaded(loadTask.Result, null);

        // A canceled task carries no Exception. The handler treats a null error as
        // "load succeeded", so a non-null error must always be supplied here —
        // otherwise a canceled load would be reported as zero deployed configs.
        var error = loadTask.Exception?.GetBaseException().Message
            ?? "Loading deployed configs was canceled";
        return new StartupConfigsLoaded([], error);
    }).PipeTo(Self);
}
|
|
|
|
/// <summary>
/// Marks this node as no longer active on the health collector (when one was supplied)
/// and then runs the base stop logic.
/// </summary>
protected override void PostStop()
{
    if (_healthCollector is not null)
    {
        _healthCollector.SetActiveNode(false);
    }

    base.PostStop();
}
|
|
|
|
/// <summary>
/// Supervision for child actors: a one-for-one strategy with no retry limit. A child
/// that fails to initialize is stopped; any other exception resumes the child.
/// </summary>
protected override SupervisorStrategy SupervisorStrategy()
{
    // Maps a child failure to a directive and logs the decision.
    Directive Decide(Exception failure)
    {
        if (failure is not ActorInitializationException)
        {
            _logger.LogWarning(failure, "Instance Actor threw exception, resuming");
            return Directive.Resume;
        }

        _logger.LogError(failure, "Instance Actor failed to initialize, stopping");
        return Directive.Stop;
    }

    return new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: TimeSpan.FromMinutes(1),
        decider: Decider.From(Decide));
}
|
|
|
|
/// <summary>
/// Handles the result of the startup config load. On a load error the error is logged
/// and startup stops here. Otherwise the deployed-instance counter is seeded from the
/// total number of configs and the enabled subset is handed to
/// <c>LoadSharedScriptsFromStorage</c>.
///
/// SiteRuntime-008: shared scripts must be loaded before Instance Actors are created;
/// per the original design note that work runs off the actor thread and reports back
/// with a <see cref="SharedScriptsLoaded"/> message, after which staggered creation starts.
/// </summary>
private void HandleStartupConfigsLoaded(StartupConfigsLoaded msg)
{
    if (msg.Error is not null)
    {
        _logger.LogError("Failed to load deployed configs: {Error}", msg.Error);
        return;
    }

    var allConfigs = msg.Configs;
    var enabledOnly = allConfigs.Where(cfg => cfg.IsEnabled).ToList();

    _totalDeployedCount = allConfigs.Count;
    _logger.LogInformation(
        "Loaded {Total} deployed configs ({Enabled} enabled) from SQLite",
        allConfigs.Count,
        enabledOnly.Count);
    UpdateInstanceCounts();

    // Only the enabled configs continue through the startup pipeline.
    LoadSharedScriptsFromStorage(enabledOnly);
}
|
|
|
|
/// <summary>
/// SiteRuntime-008: runs once the shared scripts have been loaded. Logs the compile
/// tally and, when there is at least one enabled config, queues the first
/// <see cref="StartNextBatch"/> message to begin staggered Instance Actor creation.
/// </summary>
private void HandleSharedScriptsLoaded(SharedScriptsLoaded msg)
{
    _logger.LogInformation(
        "Loaded {Compiled}/{Total} shared scripts from SQLite",
        msg.CompiledCount,
        msg.TotalCount);

    var enabledConfigs = msg.EnabledConfigs;
    if (enabledConfigs.Count != 0)
    {
        // The first batch starts at index 0 and is dispatched without delay.
        Self.Tell(new StartNextBatch(new BatchState(enabledConfigs, 0)));
    }
}
|
|
|
|
/// <summary>
/// Creates Instance Actors for one slice of the startup config list and, when configs
/// remain, schedules the next slice after <c>StartupBatchDelayMs</c> via the actor
/// timers. Staggering the batches is intended to prevent reconnection storms on failover.
///
/// The config list is a snapshot taken at startup, and other messages are processed
/// between batches. A config whose instance is already tracked in <c>_instanceActors</c>,
/// or is mid-termination in <c>_terminatingActorsByName</c>, is therefore skipped so the
/// stale snapshot neither collides on the child name nor overrides a newer deployment.
/// </summary>
/// <param name="msg">Carries the config snapshot and the index of the next config to start.</param>
private void HandleStartNextBatch(StartNextBatch msg)
{
    var state = msg.State;
    var total = state.Configs.Count;

    // A non-positive batch size would never advance the index and would re-arm the
    // timer forever without creating anything; always make progress by at least one.
    var batchSize = Math.Max(1, _options.StartupBatchSize);
    var startIdx = state.NextIndex;
    var endIdx = Math.Min(startIdx + batchSize, total);

    _logger.LogDebug(
        "Creating Instance Actors batch [{Start}..{End}) of {Total}",
        startIdx, endIdx, total);

    for (var i = startIdx; i < endIdx; i++)
    {
        var config = state.Configs[i];
        var instanceName = config.InstanceUniqueName;

        // A deploy handled since the snapshot was taken already owns this name
        // (running, or buffered until the predecessor terminates) — leave it alone.
        if (_instanceActors.ContainsKey(instanceName)
            || _terminatingActorsByName.ContainsKey(instanceName))
        {
            _logger.LogDebug(
                "Skipping startup creation of {Instance}: already created or being redeployed",
                instanceName);
            continue;
        }

        EnsureDclConnections(config.ConfigJson);
        CreateInstanceActor(instanceName, config.ConfigJson);
    }

    UpdateInstanceCounts();

    // Schedule the next batch if there are more, using Timers (IWithTimers).
    if (endIdx < total)
    {
        var nextState = new BatchState(state.Configs, endIdx);
        Timers.StartSingleTimer(
            "startup-batch",
            new StartNextBatch(nextState),
            TimeSpan.FromMilliseconds(_options.StartupBatchDelayMs));
    }
    else
    {
        _logger.LogInformation(
            "All {Count} Instance Actors created", total);
    }
}
|
|
|
|
/// <summary>
/// Entry point for a <see cref="DeployInstanceCommand"/>. Three situations are handled:
/// <list type="number">
/// <item>The instance is running: the existing actor is watched and stopped, and the
/// command is buffered until its <see cref="Terminated"/> signal arrives, because the
/// child name stays registered until the old actor has fully stopped.</item>
/// <item>SiteRuntime-020 — a predecessor is still terminating: the buffered command is
/// replaced (last-write-wins) and the displaced deployer is told its deployment failed
/// as superseded, so it does not wait forever.</item>
/// <item>Neither: the deployment is applied immediately as a fresh deployment.</item>
/// </list>
/// </summary>
private void HandleDeploy(DeployInstanceCommand command)
{
    var instanceName = command.InstanceUniqueName;
    _logger.LogInformation(
        "Deploying instance {Instance}, deploymentId={DeploymentId}",
        instanceName,
        command.DeploymentId);

    // (1) Redeploy of a running instance: detach it from the live map, buffer this
    // command against the actor ref, then watch + stop. HandleTerminated drains it.
    if (_instanceActors.Remove(instanceName, out var running))
    {
        _pendingRedeploys[running] = new PendingRedeploy(command, Sender);
        _terminatingActorsByName[instanceName] = running;
        Context.Watch(running);
        Context.Stop(running);
        UpdateInstanceCounts();
        return;
    }

    // (2) SiteRuntime-020: the predecessor has been stopped but Terminated has not fired
    // yet, so creating an actor now would throw InvalidActorNameException. Swap the
    // buffered command for this newer one instead.
    if (_terminatingActorsByName.TryGetValue(instanceName, out var stopping))
    {
        if (_pendingRedeploys.TryGetValue(stopping, out var superseded))
        {
            var supersededNotice = new DeploymentStatusResponse(
                superseded.Command.DeploymentId,
                instanceName,
                DeploymentStatus.Failed,
                $"superseded by newer deployment {command.DeploymentId} before predecessor finished terminating",
                DateTimeOffset.UtcNow);
            superseded.OriginalSender.Tell(supersededNotice);
        }

        _pendingRedeploys[stopping] = new PendingRedeploy(command, Sender);
        return;
    }

    // (3) Nothing to replace — a fresh deployment.
    ApplyDeployment(command, Sender, isRedeploy: false);
}
|
|
|
|
/// <summary>
/// Reacts to the <see cref="Terminated"/> signal of a watched Instance Actor. When a
/// redeployment was buffered for that actor ref it is removed from both tracking
/// dictionaries and applied; a signal with no buffered redeployment is ignored.
/// </summary>
private void HandleTerminated(Terminated terminated)
{
    if (_pendingRedeploys.Remove(terminated.ActorRef, out var buffered))
    {
        // SiteRuntime-020: the predecessor is gone and its actor name is free again, so
        // the name-keyed shadow entry is dropped before the replacement is created. From
        // here on a new deploy for this name sees neither map entry and takes the normal path.
        var instanceName = buffered.Command.InstanceUniqueName;
        _terminatingActorsByName.Remove(instanceName);

        ApplyDeployment(buffered.Command, buffered.OriginalSender, isRedeploy: true);
    }
}
|
|
|
|
/// <summary>
/// Applies a deployment: ensures the DCL connections, creates the Instance Actor, updates
/// the counters and then persists the config on a background task. The deployed-instance
/// counter is only incremented when <paramref name="isRedeploy"/> is false, because a
/// redeployment updates an instance that is already counted.
///
/// SiteRuntime-005: no reply is sent from here. The persistence outcome is piped back to
/// this actor as a <see cref="DeployPersistenceResult"/> and the deployer is answered by
/// <see cref="HandleDeployPersistenceResult"/>, so <see cref="DeploymentStatus.Success"/>
/// is never reported before the config has been stored.
/// </summary>
/// <param name="command">The deployment to apply.</param>
/// <param name="sender">The actor that will receive the eventual status response.</param>
/// <param name="isRedeploy">True when this replaces a previously running instance.</param>
private void ApplyDeployment(DeployInstanceCommand command, IActorRef sender, bool isRedeploy)
{
    var instanceName = command.InstanceUniqueName;
    var configJson = command.FlattenedConfigurationJson;

    // DCL connections first, then the actor itself — created before persistence completes.
    EnsureDclConnections(configJson);
    CreateInstanceActor(instanceName, configJson);

    if (!isRedeploy)
    {
        _totalDeployedCount++;
    }
    UpdateInstanceCounts();

    // Builds the message that reports the persistence outcome back to this actor.
    DeployPersistenceResult Outcome(bool succeeded, string? error) =>
        new DeployPersistenceResult(
            command.DeploymentId, instanceName, succeeded, error, sender, isRedeploy);

    // Stores the config, resets per-instance state and notifies the replication actor.
    async Task<DeployPersistenceResult> PersistAsync()
    {
        await _storage.StoreDeployedConfigAsync(
            instanceName,
            configJson,
            command.DeploymentId,
            command.RevisionHash,
            isEnabled: true);

        // By design a (re)deployment resets static overrides and the mirrored native
        // alarm state: the new config may bind different sources, and the source
        // snapshot re-seeds the mirror on (re)subscribe.
        await _storage.ClearStaticOverridesAsync(instanceName);
        await _storage.ClearNativeAlarmsForInstanceAsync(instanceName);

        // Replicate to the standby node only after local persistence succeeded.
        _replicationActor?.Tell(new ReplicateConfigDeploy(
            instanceName, configJson,
            command.DeploymentId, command.RevisionHash, true));

        return Outcome(true, null);
    }

    Task.Run(() => PersistAsync())
        .ContinueWith(persisted => persisted.IsCompletedSuccessfully
            ? persisted.Result
            : Outcome(false, persisted.Exception?.GetBaseException().Message))
        .PipeTo(Self);
}
|
|
|
|
/// <summary>
/// SiteRuntime-005: answers the deployer once the persistence outcome is known.
/// On success a deployment event is logged and <see cref="DeploymentStatus.Success"/> is
/// sent. On failure the Instance Actor created ahead of persistence is stopped, the
/// counter bump of a fresh deployment is rolled back, and
/// <see cref="DeploymentStatus.Failed"/> is sent, keeping in-memory state in line with
/// what was actually stored.
/// </summary>
private void HandleDeployPersistenceResult(DeployPersistenceResult result)
{
    var deploymentId = result.DeploymentId;
    var instanceName = result.InstanceName;

    if (!result.Success)
    {
        _logger.LogError(
            "Failed to persist deployment {DeploymentId} for {Instance}: {Error}",
            deploymentId, instanceName, result.Error);

        // M1.6: operational `deployment` event — deploy failed.
        LogDeploymentEvent("Error", instanceName,
            $"Instance {result.InstanceName} deploy failed (deploymentId={result.DeploymentId})",
            result.Error);

        // Roll back: the site must not advertise an instance it cannot recover from
        // storage, so the optimistically created actor is stopped and the counter undone.
        if (_instanceActors.Remove(instanceName, out var unpersisted))
        {
            Context.Stop(unpersisted);
        }
        if (!result.IsRedeploy)
        {
            _totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
        }
        UpdateInstanceCounts();

        result.OriginalSender.Tell(new DeploymentStatusResponse(
            deploymentId,
            instanceName,
            DeploymentStatus.Failed,
            result.Error ?? "Deployment persistence failed",
            DateTimeOffset.UtcNow));
        return;
    }

    // M1.6: operational `deployment` event — deploy succeeded.
    LogDeploymentEvent("Info", instanceName,
        $"Instance {result.InstanceName} deployed (deploymentId={result.DeploymentId})");

    result.OriginalSender.Tell(new DeploymentStatusResponse(
        deploymentId,
        instanceName,
        DeploymentStatus.Success,
        null,
        DateTimeOffset.UtcNow));
}
|
|
|
|
/// <summary>
/// Disables an instance: stops its Instance Actor if one is running, then marks the
/// stored config as disabled. The <see cref="InstanceLifecycleResponse"/> is piped to
/// the requester once the storage call has finished, with the success flag and error
/// text taken from that call.
/// </summary>
private void HandleDisable(DisableInstanceCommand command)
{
    var instanceName = command.InstanceUniqueName;

    if (_instanceActors.Remove(instanceName, out var running))
    {
        Context.Stop(running);
    }
    UpdateInstanceCounts();

    // Captured here because the continuation below does not run on the actor thread.
    var replyTo = Sender;

    _storage.SetInstanceEnabledAsync(instanceName, false)
        .ContinueWith(persistTask =>
        {
            var succeeded = persistTask.IsCompletedSuccessfully;
            var failure = persistTask.Exception?.GetBaseException().Message;

            if (!succeeded)
            {
                LogDeploymentEvent("Error", instanceName,
                    $"Instance {instanceName} disable failed",
                    failure);
            }
            else
            {
                _replicationActor?.Tell(new ReplicateConfigSetEnabled(instanceName, false));
                // M1.6: operational `deployment` event — disable succeeded.
                LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} disabled");
            }

            return new InstanceLifecycleResponse(
                command.CommandId,
                instanceName,
                succeeded,
                failure,
                DateTimeOffset.UtcNow);
        })
        .PipeTo(replyTo);

    _logger.LogInformation("Instance {Instance} disabled", instanceName);
}
|
|
|
|
/// <summary>
/// Starts enabling an instance. On a background task the stored config is marked
/// enabled, the replication actor is notified, and the instance's stored config is read
/// back. The outcome is piped to this actor as an <see cref="EnableResult"/>, where
/// <see cref="HandleEnableResult"/> re-creates the Instance Actor on the actor thread.
/// </summary>
private void HandleEnable(EnableInstanceCommand command)
{
    var instanceName = command.InstanceUniqueName;
    var replyTo = Sender;

    // Flags the instance as enabled and looks up its stored config.
    async Task<EnableResult> EnableInStorageAsync()
    {
        await _storage.SetInstanceEnabledAsync(instanceName, true);
        _replicationActor?.Tell(new ReplicateConfigSetEnabled(instanceName, true));

        var stored = await _storage.GetAllDeployedConfigsAsync();
        var match = stored.FirstOrDefault(cfg => cfg.InstanceUniqueName == instanceName);
        return new EnableResult(command, match, null, replyTo);
    }

    Task.Run(() => EnableInStorageAsync())
        .ContinueWith(enableTask => enableTask.IsCompletedSuccessfully
            ? enableTask.Result
            : new EnableResult(command, null, enableTask.Exception?.GetBaseException().Message, replyTo))
        .PipeTo(Self);
}
|
|
|
|
/// <summary>
/// Completes an enable request on the actor thread. When the storage step failed or no
/// stored config exists, the requester is told the enable failed. Otherwise the Instance
/// Actor is re-created from the stored config (unless one is already running) and the
/// requester is told the enable succeeded.
///
/// Before the actor is created the DCL connections for the stored config are ensured,
/// matching the startup and deploy paths; startup only does this for enabled configs,
/// so a previously disabled instance may not have had its connections pushed yet.
/// Creation is skipped while the instance is mid-termination for a redeployment: the
/// child name is still registered, and the buffered redeployment creates the
/// replacement when <see cref="Terminated"/> arrives.
/// </summary>
private void HandleEnableResult(EnableResult result)
{
    var instanceName = result.Command.InstanceUniqueName;

    if (result.Error != null || result.Config == null)
    {
        var error = result.Error ?? $"No deployed config found for {instanceName}";
        // M1.6: operational `deployment` event — enable failed.
        LogDeploymentEvent("Error", instanceName,
            $"Instance {instanceName} enable failed", error);
        result.OriginalSender.Tell(new InstanceLifecycleResponse(
            result.Command.CommandId, instanceName, false, error, DateTimeOffset.UtcNow));
        return;
    }

    if (_terminatingActorsByName.ContainsKey(instanceName))
    {
        // A redeployment is buffered for this name; creating an actor now would collide
        // with the still-registered child name. HandleTerminated will create it.
        _logger.LogDebug(
            "Instance {Instance} is being redeployed; actor creation deferred to the pending redeployment",
            instanceName);
    }
    else if (!_instanceActors.ContainsKey(instanceName))
    {
        // Same ordering as the startup and deploy paths: connections first, then the actor.
        EnsureDclConnections(result.Config.ConfigJson);
        CreateInstanceActor(instanceName, result.Config.ConfigJson);
    }
    UpdateInstanceCounts();

    // M1.6: operational `deployment` event — enable succeeded.
    LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} enabled");

    result.OriginalSender.Tell(new InstanceLifecycleResponse(
        result.Command.CommandId, instanceName, true, null, DateTimeOffset.UtcNow));

    _logger.LogInformation("Instance {Instance} enabled", instanceName);
}
|
|
|
|
/// <summary>
/// Deletes an instance: stops its Instance Actor if one is running, cancels any
/// redeployment still buffered for it, and removes the stored config. The
/// <see cref="InstanceLifecycleResponse"/> is piped to the requester once the storage
/// call has finished.
/// Note: store-and-forward messages are NOT cleared per design decision.
///
/// A buffered redeployment must be cancelled here. Otherwise
/// <see cref="HandleTerminated"/> would later apply it, re-creating and re-persisting
/// an instance that was deleted after that deployment was requested.
/// </summary>
private void HandleDelete(DeleteInstanceCommand command)
{
    var instanceName = command.InstanceUniqueName;

    if (_instanceActors.Remove(instanceName, out var actor))
    {
        Context.Stop(actor);
    }

    // The instance may be mid-termination for a redeployment (already removed from
    // _instanceActors). Drop both tracking entries so the Terminated signal finds
    // nothing to apply, and tell the waiting deployer its deployment will not happen.
    if (_terminatingActorsByName.Remove(instanceName, out var terminating)
        && _pendingRedeploys.Remove(terminating, out var cancelled))
    {
        cancelled.OriginalSender.Tell(new DeploymentStatusResponse(
            cancelled.Command.DeploymentId,
            instanceName,
            DeploymentStatus.Failed,
            $"cancelled: instance {instanceName} was deleted before the redeployment was applied",
            DateTimeOffset.UtcNow));
    }

    _totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
    UpdateInstanceCounts();

    // Captured here because the continuation below does not run on the actor thread.
    var sender = Sender;
    _storage.RemoveDeployedConfigAsync(instanceName).ContinueWith(t =>
    {
        if (t.IsCompletedSuccessfully)
        {
            _replicationActor?.Tell(new ReplicateConfigRemove(instanceName));
            // M1.6: operational `deployment` event — delete succeeded.
            LogDeploymentEvent("Info", instanceName, $"Instance {instanceName} deleted");
        }
        else
        {
            LogDeploymentEvent("Error", instanceName,
                $"Instance {instanceName} delete failed",
                t.Exception?.GetBaseException().Message);
        }

        return new InstanceLifecycleResponse(
            command.CommandId,
            instanceName,
            t.IsCompletedSuccessfully,
            t.Exception?.GetBaseException().Message,
            DateTimeOffset.UtcNow);
    }).PipeTo(sender);

    _logger.LogInformation("Instance {Instance} deleted", instanceName);
}
|
|
|
|
/// <summary>
/// M1.6: fire-and-forget a <c>deployment</c> operational event to the optional
/// <see cref="ISiteEventLogger"/> on a deploy/enable/disable/delete outcome. The logger
/// is resolved optionally and the returned task is never awaited. A synchronous failure
/// while resolving the logger or starting the call is caught and written to the actor's
/// own log, so event logging cannot fault the caller — including the reply continuations
/// in <see cref="HandleDisable"/> and <see cref="HandleDelete"/>.
/// <para>
/// <b>Thread-safety:</b> the disable and delete paths call this from a
/// <c>ContinueWith</c> continuation rather than from the actor's message handler, so it
/// must not touch actor-private mutable state. It reads only the <c>readonly</c>
/// <c>_serviceProvider</c> and <c>_logger</c> fields assigned in the constructor.
/// </para>
/// </summary>
/// <param name="severity">Severity label passed through to the event logger (callers use "Info" and "Error").</param>
/// <param name="instanceName">Unique name of the instance the event is about.</param>
/// <param name="message">Human-readable event message.</param>
/// <param name="details">Optional extra detail, e.g. an error message.</param>
private void LogDeploymentEvent(string severity, string instanceName, string message, string? details = null)
{
    try
    {
        _ = _serviceProvider?.GetService<ISiteEventLogger>()?.LogEventAsync(
            "deployment", severity, instanceName, "DeploymentManagerActor", message, details);
    }
    catch (Exception ex)
    {
        // Best-effort by design: the deployment outcome must not depend on event logging.
        _logger.LogWarning(ex,
            "Failed to emit deployment event for {Instance}", instanceName);
    }
}
|
|
|
|
/// <summary>
/// DeploymentManager-006: answers central's query for the deployment identity currently
/// applied to an instance. The answer is read from the deployed-config store and piped
/// straight to the requester. When the instance has no stored config — or the store
/// cannot be read — the response reports <c>IsDeployed = false</c> with null identity,
/// so central falls through to a normal deploy.
/// </summary>
private void HandleDeploymentStateQuery(DeploymentStateQueryRequest request)
{
    var replyTo = Sender;
    var instanceName = request.InstanceUniqueName;

    // The "nothing deployed / unknown" answer shared by the failure and not-found cases.
    DeploymentStateQueryResponse NotDeployed() =>
        new(request.CorrelationId, instanceName, false, null, null, DateTimeOffset.UtcNow);

    _storage.GetAllDeployedConfigsAsync()
        .ContinueWith(readTask =>
        {
            if (readTask.IsCompletedSuccessfully)
            {
                var stored = readTask.Result.FirstOrDefault(cfg => cfg.InstanceUniqueName == instanceName);
                if (stored == null)
                {
                    return NotDeployed();
                }

                return new DeploymentStateQueryResponse(
                    request.CorrelationId, instanceName, true,
                    stored.DeploymentId, stored.RevisionHash, DateTimeOffset.UtcNow);
            }

            _logger.LogError(
                readTask.Exception?.GetBaseException(),
                "Failed to read deployed configs for deployment state query of {Instance}",
                instanceName);

            // A storage read failure is reported as "unknown": central then proceeds with
            // a normal deploy and relies on site-side stale-rejection as the safety net.
            return NotDeployed();
        })
        .PipeTo(replyTo);
}
|
|
|
|
// ── DCL connection management ──
|
|
|
|
/// <summary>
/// SiteRuntime-010: connection name → hash of the configuration last sent to the DCL for
/// that name. An unchanged hash means the connection is skipped; a different hash means
/// a <c>CreateConnectionCommand</c> is sent again so the DCL picks up the new
/// configuration instead of keeping a stale connection after a redeployment.
/// </summary>
private readonly Dictionary<string, string> _createdConnections = new Dictionary<string, string>();

/// <summary>
/// Pushes every connection declared in a flattened configuration to the DCL through
/// <c>EnsureDclConnection</c>, which skips connections whose configuration is unchanged
/// (SiteRuntime-010). Does nothing when no DCL manager was supplied or the configuration
/// declares no connections. Any exception raised while parsing or pushing is logged as a
/// warning and swallowed.
/// </summary>
/// <param name="configJson">Flattened configuration JSON of an instance.</param>
private void EnsureDclConnections(string configJson)
{
    if (_dclManager is null)
    {
        return;
    }

    try
    {
        var flattened = System.Text.Json.JsonSerializer
            .Deserialize<Commons.Types.Flattening.FlattenedConfiguration>(configJson);

        var declared = flattened?.Connections;
        if (declared is null)
        {
            return;
        }

        foreach (var (connectionName, connection) in declared)
        {
            EnsureDclConnection(
                connectionName,
                connection.Protocol,
                connection.ConfigurationJson,
                connection.BackupConfigurationJson,
                connection.FailoverRetryCount);
        }
    }
    catch (Exception parseFailure)
    {
        _logger.LogWarning(parseFailure, "Failed to parse flattened config for DCL connections");
    }
}
|
|
|
|
/// <summary>
/// SiteRuntime-021: hash-guarded DCL connection push shared by the inline
/// per-instance path (<see cref="EnsureDclConnections(string)"/>) and the
/// system-wide artifact-deploy path (<see cref="HandleDeployArtifacts"/>).
/// Unchanged config is a no-op; a changed endpoint/credentials/backup/
/// failover-count re-issues a <c>CreateConnectionCommand</c> so a system-wide
/// artifact-deploy makes its data-connection change live immediately instead of
/// waiting for the next instance redeploy or node restart.
/// <para>
/// A definition whose configuration cannot be flattened is logged and skipped
/// without touching the hash cache, so (a) sibling connections in the same
/// deployment are still pushed, (b) the failure never escapes the actor's message
/// handler on the artifact-deploy path, and (c) a corrected definition is retried
/// on its next deployment.
/// </para>
/// </summary>
/// <param name="name">Connection name; key of the hash cache.</param>
/// <param name="protocol">Protocol identifier used to select the config flattener.</param>
/// <param name="primaryConfigurationJson">Primary endpoint configuration JSON, if any.</param>
/// <param name="backupConfigurationJson">Backup endpoint configuration JSON; null/empty means no backup.</param>
/// <param name="failoverRetryCount">Failover retry count forwarded to the DCL.</param>
private void EnsureDclConnection(
    string name,
    string protocol,
    string? primaryConfigurationJson,
    string? backupConfigurationJson,
    int failoverRetryCount)
{
    if (_dclManager == null) return;

    var configHash = ComputeConnectionConfigHashCore(
        protocol, primaryConfigurationJson, backupConfigurationJson, failoverRetryCount);

    // Single lookup: tells us both "seen before?" and "same config as last time?".
    var alreadyKnown = _createdConnections.TryGetValue(name, out var lastHash);
    if (alreadyKnown && lastHash == configHash)
        return;

    IDictionary<string, string> primaryDetails;
    IDictionary<string, string>? backupDetails;
    try
    {
        primaryDetails = FlattenConnectionConfig(protocol, primaryConfigurationJson);
        backupDetails = string.IsNullOrEmpty(backupConfigurationJson)
            ? null
            : FlattenConnectionConfig(protocol, backupConfigurationJson);
    }
    catch (Exception ex)
    {
        // Isolate a malformed definition: do not send, do not cache the hash.
        _logger.LogWarning(
            ex,
            "Failed to flatten configuration for DCL connection {Connection} (protocol={Protocol}); connection not pushed",
            name, protocol);
        return;
    }

    _dclManager.Tell(new Commons.Messages.DataConnection.CreateConnectionCommand(
        name, protocol, primaryDetails, backupDetails, failoverRetryCount));

    _createdConnections[name] = configHash;
    _logger.LogInformation(
        "{Action} DCL connection {Connection} (protocol={Protocol})",
        alreadyKnown ? "Updated" : "Created", name, protocol);
}
|
|
|
|
/// <summary>
/// Convenience overload (SiteRuntime-010): hashes a flattened
/// <see cref="Commons.Types.Flattening.ConnectionConfig"/> by handing its
/// protocol, primary/backup configuration JSON and failover retry count to
/// <see cref="ComputeConnectionConfigHashCore"/>.
/// </summary>
/// <param name="connConfig">The connection definition to hash.</param>
/// <returns>The hash string produced by the field-based core.</returns>
private static string ComputeConnectionConfigHash(
    Commons.Types.Flattening.ConnectionConfig connConfig)
{
    var protocol = connConfig.Protocol;
    var primaryJson = connConfig.ConfigurationJson;
    var backupJson = connConfig.BackupConfigurationJson;
    var retryCount = connConfig.FailoverRetryCount;

    return ComputeConnectionConfigHashCore(protocol, primaryJson, backupJson, retryCount);
}
|
|
|
|
/// <summary>
/// SiteRuntime-021: field-based hash core so the system-wide artifact-deploy
/// path (which carries protocol/config-json/backup-json/failover directly on
/// <see cref="Commons.Messages.Artifacts.DataConnectionArtifact"/>) can share the
/// same hash + skip-or-resend logic as the inline-config path.
/// <para>
/// Every field is written length-prefixed (<c>&lt;length&gt;:&lt;value&gt;</c>) before
/// hashing. Plain concatenation is ambiguous — e.g. moving a config from primary
/// to backup, or <c>"x1"</c> + count 2 versus <c>"x"</c> + count 12, would produce
/// identical material and the changed connection would be skipped as "unchanged".
/// A null field hashes the same as an empty one, matching how the connection push
/// treats null/empty configuration JSON.
/// </para>
/// </summary>
/// <param name="protocol">Protocol identifier.</param>
/// <param name="primaryConfigurationJson">Primary endpoint configuration JSON, if any.</param>
/// <param name="backupConfigurationJson">Backup endpoint configuration JSON, if any.</param>
/// <param name="failoverRetryCount">Failover retry count.</param>
/// <returns>Upper-case hex SHA-256 of the length-prefixed field material.</returns>
private static string ComputeConnectionConfigHashCore(
    string protocol,
    string? primaryConfigurationJson,
    string? backupConfigurationJson,
    int failoverRetryCount)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var material = new System.Text.StringBuilder();

    // Length prefix makes field boundaries unambiguous regardless of content.
    void AppendField(string? value)
    {
        var text = value ?? string.Empty;
        material.Append(text.Length.ToString(invariant)).Append(':').Append(text);
    }

    AppendField(protocol);
    AppendField(primaryConfigurationJson);
    AppendField(backupConfigurationJson);
    AppendField(failoverRetryCount.ToString(invariant));

    var bytes = System.Security.Cryptography.SHA256.HashData(
        System.Text.Encoding.UTF8.GetBytes(material.ToString()));
    return Convert.ToHexString(bytes);
}
|
|
|
|
/// <summary>
/// Turns a connection's configuration JSON into the flat string dictionary sent
/// to the DCL. "OpcUa" and "MxGateway" (case-insensitive) go through their
/// dedicated serializers; any other protocol is read as a flat JSON object whose
/// property values are stringified. Null/empty JSON, or a fallback document that
/// cannot be read as an object, yields an empty dictionary.
/// </summary>
/// <param name="protocol">Protocol identifier selecting the flattener.</param>
/// <param name="json">Configuration JSON; may be null or empty.</param>
/// <returns>The flattened key/value configuration.</returns>
private static IDictionary<string, string> FlattenConnectionConfig(string protocol, string? json)
{
    bool IsProtocol(string candidate) =>
        string.Equals(protocol, candidate, StringComparison.OrdinalIgnoreCase);

    if (string.IsNullOrEmpty(json))
    {
        return new Dictionary<string, string>();
    }

    if (IsProtocol("OpcUa"))
    {
        var (opcUaConfig, _) = Commons.Serialization.OpcUaEndpointConfigSerializer.Deserialize(json);
        return Commons.Serialization.OpcUaEndpointConfigSerializer.ToFlatDict(opcUaConfig);
    }

    if (IsProtocol("MxGateway"))
    {
        var gatewayConfig = Commons.Serialization.MxGatewayEndpointConfigSerializer.Deserialize(json);
        return Commons.Serialization.MxGatewayEndpointConfigSerializer.ToFlatDict(gatewayConfig);
    }

    // Fallback: assume legacy flat-dict shape for any future / unknown protocol.
    try
    {
        var flat = new Dictionary<string, string>();
        using var document = System.Text.Json.JsonDocument.Parse(json);
        foreach (var property in document.RootElement.EnumerateObject())
        {
            // Indexer assignment: a repeated property name keeps the last value.
            flat[property.Name] = property.Value.ToString();
        }

        return flat;
    }
    catch
    {
        // Unreadable fallback JSON degrades to "no settings" rather than failing.
        return new Dictionary<string, string>();
    }
}
|
|
|
|
// ── Shared Script Loading ──
|
|
|
|
/// <summary>
/// SiteRuntime-008: loads all shared scripts from storage and compiles them on a
/// background task, so neither the read nor the compilation runs inside the
/// actor's message handler. The outcome is piped to <c>Self</c> as a
/// <see cref="SharedScriptsLoaded"/> message that carries the enabled configs
/// along with the compiled/total script counts.
/// </summary>
/// <param name="enabledConfigs">Enabled instance configs to hand back with the result.</param>
private void LoadSharedScriptsFromStorage(List<DeployedInstance> enabledConfigs)
{
    var loadTask = Task.Run(async () =>
    {
        var storedScripts = await _storage.GetAllSharedScriptsAsync();

        var successCount = 0;
        foreach (var stored in storedScripts)
        {
            var registered = _sharedScriptLibrary.CompileAndRegister(stored.Name, stored.Code);
            if (registered)
            {
                successCount += 1;
            }
        }

        return new SharedScriptsLoaded(enabledConfigs, successCount, storedScripts.Count);
    });

    loadTask.ContinueWith(finished =>
    {
        if (!finished.IsCompletedSuccessfully)
        {
            _logger.LogError(
                finished.Exception?.GetBaseException(), "Failed to load shared scripts from SQLite");

            // A shared-script load failure must not abandon startup — proceed with
            // Instance Actor creation; scripts that need a missing shared script fail
            // at execution time and are recorded to the site event log.
            return new SharedScriptsLoaded(enabledConfigs, 0, 0);
        }

        return finished.Result;
    }).PipeTo(Self);
}
|
|
|
|
// ── Debug View routing ──
|
|
|
|
/// <summary>
/// Forwards a debug-view subscribe request to the Instance Actor registered under
/// the requested name. When no such actor exists, replies to the sender with an
/// empty <see cref="DebugViewSnapshot"/> flagged <c>InstanceNotFound</c>.
/// </summary>
/// <param name="request">The subscribe request to route.</param>
private void RouteDebugViewSubscribe(SubscribeDebugViewRequest request)
{
    var instanceName = request.InstanceUniqueName;

    if (!_instanceActors.TryGetValue(instanceName, out var target))
    {
        // M2.11: set InstanceNotFound=true so the caller can distinguish
        // "not deployed on this site" from a deployed-but-empty instance.
        _logger.LogWarning(
            "Debug view subscribe for unknown instance {Instance}", request.InstanceUniqueName);

        var emptySnapshot = new DebugViewSnapshot(
            request.InstanceUniqueName,
            Array.Empty<Commons.Messages.Streaming.AttributeValueChanged>(),
            Array.Empty<Commons.Messages.Streaming.AlarmStateChanged>(),
            DateTimeOffset.UtcNow,
            InstanceNotFound: true);
        Sender.Tell(emptySnapshot);
        return;
    }

    target.Forward(request);
}
|
|
|
|
/// <summary>
/// Forwards a debug-view unsubscribe request to the Instance Actor registered
/// under the requested name. A request for an unknown instance is dropped
/// without a reply.
/// </summary>
/// <param name="request">The unsubscribe request to route.</param>
private void RouteDebugViewUnsubscribe(UnsubscribeDebugViewRequest request)
{
    if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var target))
    {
        return;
    }

    target.Forward(request);
}
|
|
|
|
/// <summary>
/// Forwards a one-shot debug snapshot request to the Instance Actor registered
/// under the requested name. When no such actor exists, replies to the sender
/// with an empty <see cref="DebugViewSnapshot"/> flagged <c>InstanceNotFound</c>.
/// </summary>
/// <param name="request">The snapshot request to route.</param>
private void RouteDebugSnapshot(DebugSnapshotRequest request)
{
    var instanceName = request.InstanceUniqueName;

    if (!_instanceActors.TryGetValue(instanceName, out var target))
    {
        // M2.11: set InstanceNotFound=true so the caller can distinguish
        // "not deployed on this site" from a deployed-but-empty instance.
        _logger.LogWarning(
            "Debug snapshot for unknown instance {Instance}", request.InstanceUniqueName);

        var emptySnapshot = new DebugViewSnapshot(
            request.InstanceUniqueName,
            Array.Empty<Commons.Messages.Streaming.AttributeValueChanged>(),
            Array.Empty<Commons.Messages.Streaming.AlarmStateChanged>(),
            DateTimeOffset.UtcNow,
            InstanceNotFound: true);
        Sender.Tell(emptySnapshot);
        return;
    }

    target.Forward(request);
}
|
|
|
|
// ── Inbound API routing ──
|
|
|
|
/// <summary>
/// Routes an inbound <c>Route.To().Call()</c> to the target Instance Actor as a
/// <see cref="ScriptCallRequest"/> and pipes a <see cref="RouteToCallResponse"/>
/// back to the original sender. An unknown instance, a failed Ask, or an Ask that
/// does not complete within 30 seconds all produce an unsuccessful response.
/// </summary>
/// <param name="request">The routed call request.</param>
private void RouteInboundApiCall(RouteToCallRequest request)
{
    if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var target))
    {
        Sender.Tell(new RouteToCallResponse(
            request.CorrelationId, false, null,
            $"Instance '{request.InstanceUniqueName}' not found on this site.",
            DateTimeOffset.UtcNow));
        return;
    }

    // Convert to ScriptCallRequest and Ask the Instance Actor.
    // Audit Log #23 (ParentExecutionId): carry the inbound request's
    // ExecutionId down as ParentExecutionId so the routed script
    // execution can record its spawner.
    var scriptCall = new ScriptCallRequest(
        request.ScriptName, request.Parameters, 0, request.CorrelationId,
        ParentExecutionId: request.ParentExecutionId);

    // Capture the sender now; it is not available inside the continuation.
    var replyTo = Sender;

    target.Ask<ScriptCallResult>(scriptCall, TimeSpan.FromSeconds(30))
        .ContinueWith(askTask =>
        {
            if (!askTask.IsCompletedSuccessfully)
            {
                var reason = askTask.Exception?.GetBaseException().Message ?? "Script call timed out";
                return new RouteToCallResponse(
                    request.CorrelationId, false, null, reason, DateTimeOffset.UtcNow);
            }

            var outcome = askTask.Result;

            // The routed script's return value crosses the Central↔Site PROCESS
            // boundary inside this response. A script's natural `return new { ... }`
            // is an anonymous type, which Akka's cross-process serializer cannot
            // round-trip — the reply is silently dropped and the caller's
            // Route.To().Call() Ask times out even though the script succeeded.
            // Project the value to a plain CLR graph (Dictionary/List/primitive) that
            // round-trips the wire and reproduces the same JSON shape on the inbound
            // side (which JsonSerializer-serializes it for the HTTP body /
            // ReturnDefinition validation).
            var transportable = NormalizeRoutedReturnValue(outcome.ReturnValue);

            return new RouteToCallResponse(
                request.CorrelationId, outcome.Success, transportable,
                outcome.ErrorMessage, DateTimeOffset.UtcNow);
        }).PipeTo(replyTo);
}
|
|
|
|
/// <summary>
/// Re-shapes a routed script's return value into a plain CLR graph
/// (<see cref="Dictionary{TKey,TValue}"/> / <see cref="List{T}"/> / primitives /
/// null) by serializing it to JSON and converting the parsed document with
/// <see cref="FromJsonElement"/>. This exists because a script's natural
/// <c>return new { ... }</c> is a compiler-generated anonymous type that Akka's
/// cross-process serializer cannot reconstruct on the receiving node, which makes
/// the <see cref="RouteToCallResponse"/> reply vanish and the caller's Ask time out.
/// If the JSON round-trip itself throws, the failure is logged and
/// <paramref name="value"/> is returned untouched so it still has a chance to be
/// delivered rather than being forced to null.
/// </summary>
/// <param name="value">The script's return value; may be null.</param>
/// <returns>The projected plain graph, null for null input, or the original value on failure.</returns>
private object? NormalizeRoutedReturnValue(object? value)
{
    if (value is null) return null;

    try
    {
        var serialized = System.Text.Json.JsonSerializer.Serialize(value);
        using var parsed = System.Text.Json.JsonDocument.Parse(serialized);
        var plainGraph = FromJsonElement(parsed.RootElement);
        return plainGraph;
    }
    catch (Exception failure)
    {
        var typeName = value.GetType().Name;
        _logger.LogWarning(failure,
            "Failed to normalize routed script return value of type {Type} for transport; sending as-is",
            typeName);
        return value;
    }
}
|
|
|
|
/// <summary>
/// Converts a <see cref="System.Text.Json.JsonElement"/> to a plain CLR value
/// (string / long / double / bool / null, or nested Dictionary / List) — never a
/// <see cref="System.Text.Json.JsonElement"/> (which is itself not cross-process
/// serializable). Companion to <see cref="NormalizeRoutedReturnValue"/>.
/// </summary>
/// <param name="e">The JSON element to convert.</param>
/// <returns>
/// A <c>Dictionary&lt;string, object?&gt;</c> for objects, a <c>List&lt;object?&gt;</c>
/// for arrays, a <see cref="string"/>, a <see cref="long"/> for numbers that fit
/// Int64, a <see cref="double"/> for all other numbers, a <see cref="bool"/>, or
/// null for JSON null / undefined.
/// </returns>
private static object? FromJsonElement(System.Text.Json.JsonElement e) => e.ValueKind switch
{
    System.Text.Json.JsonValueKind.Object =>
        e.EnumerateObject().ToDictionary(p => p.Name, p => FromJsonElement(p.Value)),
    System.Text.Json.JsonValueKind.Array =>
        e.EnumerateArray().Select(FromJsonElement).ToList(),
    System.Text.Json.JsonValueKind.String => e.GetString(),
    // The (object) cast is load-bearing: without it the conditional's natural type
    // is double (long converts implicitly), so every integer would be boxed as a
    // double and values above 2^53 would silently lose precision.
    System.Text.Json.JsonValueKind.Number =>
        e.TryGetInt64(out var integral) ? (object)integral : e.GetDouble(),
    System.Text.Json.JsonValueKind.True => true,
    System.Text.Json.JsonValueKind.False => false,
    _ => null,
};
|
|
|
|
/// <summary>
/// Serves a <c>Route.To().GetAttribute(s)</c> call (or a central Test Run bound to
/// the instance): issues one <see cref="GetAttributeRequest"/> Ask per requested
/// attribute name to the Instance Actor, waits for all of them, and pipes a
/// combined <see cref="RouteToGetAttributesResponse"/> back to the sender. An
/// attribute the actor reports as not found maps to null. An unknown instance or
/// any failed/timed-out Ask yields an unsuccessful response with no values.
/// </summary>
/// <param name="request">The routed attribute-read request.</param>
private void RouteInboundApiGetAttributes(RouteToGetAttributesRequest request)
{
    if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var target))
    {
        Sender.Tell(new RouteToGetAttributesResponse(
            request.CorrelationId, new Dictionary<string, object?>(), false,
            $"Instance '{request.InstanceUniqueName}' not found on this site.",
            DateTimeOffset.UtcNow));
        return;
    }

    var replyTo = Sender;
    var names = request.AttributeNames;

    // One Ask per attribute, issued in request order so results line up by index.
    var pending = new Task<GetAttributeResponse>[names.Count];
    for (var index = 0; index < pending.Length; index++)
    {
        var query = new GetAttributeRequest(
            request.CorrelationId, request.InstanceUniqueName, names[index], DateTimeOffset.UtcNow);
        pending[index] = target.Ask<GetAttributeResponse>(query, TimeSpan.FromSeconds(30));
    }

    Task.WhenAll(pending).ContinueWith(all =>
    {
        if (!all.IsCompletedSuccessfully)
        {
            return new RouteToGetAttributesResponse(
                request.CorrelationId, new Dictionary<string, object?>(), false,
                all.Exception?.GetBaseException().Message ?? "Attribute read timed out",
                DateTimeOffset.UtcNow);
        }

        var replies = all.Result;
        var values = new Dictionary<string, object?>();
        for (var index = 0; index < names.Count; index++)
        {
            var reply = replies[index];
            values[names[index]] = reply.Found ? reply.Value : null;
        }

        return new RouteToGetAttributesResponse(
            request.CorrelationId, values, true, null, DateTimeOffset.UtcNow);
    }).PipeTo(replyTo);
}
|
|
|
|
/// <summary>
/// Spec §6 (WD-2b): translates a routed <see cref="RouteToWaitForAttributeRequest"/>
/// (inbound-API <c>Route.To().WaitForAttribute()</c>) into the Instance Actor's
/// site-local <see cref="WaitForAttributeRequest"/> and relays the outcome to the
/// sender. Across the wire only value equality is supported, so the predicate
/// argument is passed as null and <c>RequireGoodQuality</c> keeps its default.
/// The Ask deadline is the wait's own <see cref="RouteToWaitForAttributeRequest.Timeout"/>
/// plus five seconds of slack rather than a fixed 30 s, because the wait may
/// legitimately block for the full timeout.
/// </summary>
/// <param name="request">The routed wait request.</param>
private void RouteInboundApiWaitForAttribute(RouteToWaitForAttributeRequest request)
{
    if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var target))
    {
        Sender.Tell(new RouteToWaitForAttributeResponse(
            request.CorrelationId, false, null, null, false,
            false, $"Instance '{request.InstanceUniqueName}' not found on this site.",
            DateTimeOffset.UtcNow));
        return;
    }

    var replyTo = Sender;

    // Routed waits are value-equality only (predicate null); RequireGoodQuality left at default.
    var siteLocalWait = new WaitForAttributeRequest(
        request.CorrelationId, request.InstanceUniqueName, request.AttributeName,
        request.TargetValueEncoded, null, request.Timeout, DateTimeOffset.UtcNow);

    // Ask bounded by the WAIT timeout + slack — NOT a fixed 30s (the wait legitimately blocks up to request.Timeout).
    var askDeadline = request.Timeout + TimeSpan.FromSeconds(5);

    target.Ask<WaitForAttributeResponse>(siteLocalWait, askDeadline)
        .ContinueWith(waitTask =>
        {
            if (waitTask.IsCompletedSuccessfully)
            {
                var outcome = waitTask.Result;
                return new RouteToWaitForAttributeResponse(
                    request.CorrelationId, outcome.Matched, outcome.Value, outcome.Quality, outcome.TimedOut,
                    true, null, DateTimeOffset.UtcNow);
            }

            return new RouteToWaitForAttributeResponse(
                request.CorrelationId, false, null, null, false,
                false, waitTask.Exception?.GetBaseException().Message ?? "Attribute wait timed out",
                DateTimeOffset.UtcNow);
        })
        .PipeTo(replyTo);
}
|
|
|
|
/// <summary>
/// Serves a <c>Route.To().SetAttribute(s)</c> call (or a central Test Run bound to
/// the instance): sends one <see cref="SetStaticAttributeCommand"/> Ask per
/// attribute/value pair to the Instance Actor and pipes a single
/// <see cref="RouteToSetAttributesResponse"/> back to the sender. The response is
/// successful only when every write reports success; otherwise it lists each
/// failed attribute with its error, so the caller sees the real per-attribute
/// outcome instead of an optimistic acknowledgement.
/// </summary>
/// <param name="request">The routed attribute-write request.</param>
private void RouteInboundApiSetAttributes(RouteToSetAttributesRequest request)
{
    if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var target))
    {
        Sender.Tell(new RouteToSetAttributesResponse(
            request.CorrelationId, false,
            $"Instance '{request.InstanceUniqueName}' not found on this site.",
            DateTimeOffset.UtcNow));
        return;
    }

    var replyTo = Sender;
    var correlationId = request.CorrelationId;

    var writes = request.AttributeValues
        .Select(kvp =>
        {
            var write = new SetStaticAttributeCommand(
                correlationId, request.InstanceUniqueName, kvp.Key, kvp.Value, DateTimeOffset.UtcNow);
            return target.Ask<SetStaticAttributeResponse>(write, TimeSpan.FromSeconds(30));
        })
        .ToArray();

    Task.WhenAll(writes).ContinueWith(all =>
    {
        if (!all.IsCompletedSuccessfully)
        {
            var reason = all.Exception?.GetBaseException().Message ?? "Attribute write timed out";
            return new RouteToSetAttributesResponse(correlationId, false, reason, DateTimeOffset.UtcNow);
        }

        // Collect "<attribute>: <error>" for every write that did not succeed.
        var problems = new List<string>();
        foreach (var reply in all.Result)
        {
            if (!reply.Success)
            {
                problems.Add($"{reply.AttributeName}: {reply.ErrorMessage}");
            }
        }

        if (problems.Count == 0)
        {
            return new RouteToSetAttributesResponse(correlationId, true, null, DateTimeOffset.UtcNow);
        }

        return new RouteToSetAttributesResponse(
            correlationId, false, string.Join("; ", problems), DateTimeOffset.UtcNow);
    }).PipeTo(replyTo);
}
|
|
|
|
/// <summary>
/// WP-33: Handles system-wide artifact deployment (shared scripts, external
/// systems, database connections, notification lists, data connections, SMTP
/// configurations). Each artifact group present on the command is persisted via
/// <c>SiteStorageService</c> on a background task; shared scripts are recompiled
/// as they are stored, data connections are handed back to the actor thread for
/// the DCL push, and the command is forwarded to the replication actor when one
/// is configured. The resulting <see cref="ArtifactDeploymentResponse"/> is piped
/// to the original sender.
/// <para>
/// Any exception aborts the remaining steps, is logged here, and is reported to
/// the sender as an unsuccessful response carrying the exception message. A shared
/// script that fails to compile is logged as a warning but does not fail the
/// deployment.
/// </para>
/// </summary>
/// <param name="command">The artifact bundle to deploy; any artifact list may be null.</param>
private void HandleDeployArtifacts(DeployArtifactsCommand command)
{
    _logger.LogInformation(
        "Deploying system artifacts, deploymentId={DeploymentId}", command.DeploymentId);

    var sender = Sender;
    // Capture Self before entering Task.Run: the Self/Sender/Context properties
    // are backed by the ambient ActorCell, which is null on a thread-pool thread,
    // so reading Self *inside* the lambda throws "no active ActorContext". The
    // data-connections branch below dispatches via this captured ref.
    var self = Self;

    Task.Run(async () =>
    {
        try
        {
            // WP-33: Store shared scripts and recompile
            if (command.SharedScripts != null)
            {
                foreach (var script in command.SharedScripts)
                {
                    await _storage.StoreSharedScriptAsync(script.Name, script.Code,
                        script.ParameterDefinitions, script.ReturnDefinition);

                    // WP-33: Shared scripts recompiled on update. Surface a failed
                    // compile instead of discarding the result; the script is
                    // already persisted, so the deployment itself still proceeds.
                    if (!_sharedScriptLibrary.CompileAndRegister(script.Name, script.Code))
                    {
                        _logger.LogWarning(
                            "Shared script {Script} failed to compile during artifact deployment, deploymentId={DeploymentId}",
                            script.Name, command.DeploymentId);
                    }
                }
            }

            // WP-33: Store external system definitions
            if (command.ExternalSystems != null)
            {
                foreach (var es in command.ExternalSystems)
                {
                    await _storage.StoreExternalSystemAsync(es.Name, es.EndpointUrl,
                        es.AuthType, es.AuthConfiguration, es.MethodDefinitionsJson);
                }
            }

            // WP-33: Store database connection definitions
            if (command.DatabaseConnections != null)
            {
                foreach (var db in command.DatabaseConnections)
                {
                    await _storage.StoreDatabaseConnectionAsync(db.Name, db.ConnectionString,
                        db.MaxRetries, db.RetryDelay);
                }
            }

            // WP-33: Store notification lists
            if (command.NotificationLists != null)
            {
                foreach (var nl in command.NotificationLists)
                {
                    await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails);
                }
            }

            // Store data connection definitions (OPC UA endpoints, etc.)
            if (command.DataConnections != null)
            {
                foreach (var dc in command.DataConnections)
                {
                    await _storage.StoreDataConnectionDefinitionAsync(
                        dc.Name, dc.Protocol, dc.PrimaryConfigurationJson,
                        dc.BackupConfigurationJson, dc.FailoverRetryCount);
                }

                // SiteRuntime-021: after the SQLite store, dispatch an internal
                // message back to the actor thread so the DCL push runs through
                // EnsureDclConnection — keeping the _createdConnections hash cache
                // mutation actor-thread-confined while still making the change
                // live immediately (previously the change landed in SQLite but
                // the DCL kept using the stale connection until next instance
                // redeploy or node restart, contradicting "site is self-contained
                // after artifact deployment"). The helper's hash cache skips
                // unchanged definitions, so the push is idempotent for re-deploys
                // of the same artifact bundle.
                self.Tell(new ApplyArtifactDataConnectionsToDcl(command.DataConnections));
            }

            // Store SMTP configurations
            if (command.SmtpConfigurations != null)
            {
                foreach (var smtp in command.SmtpConfigurations)
                {
                    await _storage.StoreSmtpConfigurationAsync(
                        smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode,
                        smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig);
                }
            }

            // Replicate artifacts to standby node
            _replicationActor?.Tell(new ReplicateArtifacts(command));

            return new ArtifactDeploymentResponse(
                command.DeploymentId, "", true, null, DateTimeOffset.UtcNow);
        }
        catch (Exception ex)
        {
            // Log site-side as well: the response only carries ex.Message, so the
            // exception type and stack trace would otherwise be lost.
            _logger.LogError(
                ex, "System artifact deployment failed, deploymentId={DeploymentId}", command.DeploymentId);

            return new ArtifactDeploymentResponse(
                command.DeploymentId, "", false, ex.Message, DateTimeOffset.UtcNow);
        }
    }).PipeTo(sender);
}
|
|
|
|
/// <summary>
/// Spawns a child <see cref="InstanceActor"/> under this actor, named after the
/// instance, and records it in the instance-actor map. When an actor is already
/// registered under that name the call logs a warning and does nothing else.
/// </summary>
/// <param name="instanceName">The unique name of the instance to create an actor for.</param>
/// <param name="configJson">The JSON-serialized flattened configuration for the instance.</param>
internal void CreateInstanceActor(string instanceName, string configJson)
{
    var alreadyRegistered = _instanceActors.ContainsKey(instanceName);
    if (alreadyRegistered)
    {
        _logger.LogWarning("Instance Actor {Instance} already exists, skipping creation", instanceName);
        return;
    }

    // SiteRuntime-015: reuse the shared, host-configured logger factory
    // instead of allocating (and leaking) a fresh LoggerFactory per instance.
    var instanceProps = Props.Create(() => new InstanceActor(
        instanceName,
        configJson,
        _storage,
        _compilationService,
        _sharedScriptLibrary,
        _streamManager,
        _options,
        _loggerFactory.CreateLogger<InstanceActor>(),
        _dclManager,
        _healthCollector,
        _serviceProvider));

    _instanceActors[instanceName] = Context.ActorOf(instanceProps, instanceName);

    _logger.LogDebug("Created Instance Actor for {Instance}", instanceName);
}
|
|
|
|
/// <summary>
/// SiteRuntime-021: message handler that pushes the data connection definitions
/// of an artifact deployment to the DCL, one by one, through the shared
/// <see cref="EnsureDclConnection"/> helper. <see cref="HandleDeployArtifacts"/>
/// sends this message from its background task so that the
/// <see cref="_createdConnections"/> hash cache is only ever mutated from the
/// actor's own message handling.
/// </summary>
/// <param name="msg">The message carrying the data connection artifacts to apply.</param>
private void HandleApplyArtifactDataConnectionsToDcl(ApplyArtifactDataConnectionsToDcl msg)
{
    foreach (var artifact in msg.DataConnections)
    {
        var connectionName = artifact.Name;
        var protocol = artifact.Protocol;

        EnsureDclConnection(
            connectionName,
            protocol,
            artifact.PrimaryConfigurationJson,
            artifact.BackupConfigurationJson,
            artifact.FailoverRetryCount);
    }
}
|
|
|
|
/// <summary>
/// Number of Instance Actors currently held in the instance-actor map
/// (exposed for testing/diagnostics).
/// </summary>
internal int InstanceActorCount
{
    get { return _instanceActors.Count; }
}
|
|
|
|
/// <summary>
/// Reports current instance counts to the health collector, when one is present:
/// deployed = <c>_totalDeployedCount</c>, enabled = number of tracked Instance
/// Actors, disabled = deployed minus enabled.
/// </summary>
private void UpdateInstanceCounts()
{
    if (_healthCollector is null)
    {
        return;
    }

    var runningActors = _instanceActors.Count;
    var deployedTotal = _totalDeployedCount;

    _healthCollector.SetInstanceCounts(
        deployed: deployedTotal,
        enabled: runningActors,
        disabled: deployedTotal - runningActors);
}
|
|
|
|
// ── Internal messages ──
|
|
|
|
/// <summary>
/// Internal message carrying deployed-instance configs plus an optional error text.
/// NOTE(review): produced and consumed outside this part of the file — presumably
/// the result of the startup SQLite read; confirm against the startup handler.
/// </summary>
internal record StartupConfigsLoaded(
    List<DeployedInstance> Configs,
    string? Error);

/// <summary>
/// SiteRuntime-008: internal message piped to the actor by
/// <see cref="LoadSharedScriptsFromStorage"/> once the background shared-script
/// load has finished (or failed, in which case both counts are zero). It hands
/// back the enabled configs it was given together with the number of scripts
/// compiled and the number of scripts read.
/// </summary>
internal record SharedScriptsLoaded(
    List<DeployedInstance> EnabledConfigs,
    int CompiledCount,
    int TotalCount);

/// <summary>
/// Internal message wrapping a <see cref="BatchState"/>.
/// NOTE(review): presumably drives the staggered batch creation of Instance
/// Actors described on the class; the handler is outside this part of the file.
/// </summary>
internal record StartNextBatch(BatchState State);

/// <summary>
/// A list of instance configs together with the index of the next one to process.
/// </summary>
internal record BatchState(
    List<DeployedInstance> Configs,
    int NextIndex);

/// <summary>
/// Internal result message for an <see cref="EnableInstanceCommand"/>: the command,
/// the instance config when available, an error text when not, and the actor that
/// originally sent the command.
/// NOTE(review): producer/consumer are outside this part of the file.
/// </summary>
internal record EnableResult(
    EnableInstanceCommand Command,
    DeployedInstance? Config,
    string? Error,
    IActorRef OriginalSender);

/// <summary>
/// Internal result message describing the outcome of persisting a deployment:
/// deployment id, instance name, success flag, optional error, the actor that
/// originally sent the request, and whether the deployment was a redeploy.
/// NOTE(review): producer/consumer are outside this part of the file.
/// </summary>
internal record DeployPersistenceResult(
    string DeploymentId,
    string InstanceName,
    bool Success,
    string? Error,
    IActorRef OriginalSender,
    bool IsRedeploy);

/// <summary>
/// A redeployment command held back until the previous Instance Actor has
/// terminated, together with the actor that originally sent it.
/// </summary>
internal record PendingRedeploy(
    DeployInstanceCommand Command,
    IActorRef OriginalSender);

/// <summary>
/// SiteRuntime-021: internal message sent by <see cref="HandleDeployArtifacts"/>'s
/// background persistence task to the actor itself, so that the DCL push and its
/// hash-cache update run through <see cref="EnsureDclConnection"/> as part of the
/// actor's own message handling rather than on the background thread.
/// </summary>
internal record ApplyArtifactDataConnectionsToDcl(
    IReadOnlyList<Commons.Messages.Artifacts.DataConnectionArtifact> DataConnections);
|
|
}
|