ClusterClient Sender refs are temporary proxies — valid for an immediate reply but not durable for future Tells. Events now flow as DebugStreamEvent through SiteCommunicationActor → ClusterClient → CentralCommunicationActor → bridge actor (the same pattern as health reports). Also fix DebugStreamHub to use IHubContext for long-lived callbacks instead of a transient hub instance.
317 lines
12 KiB
C#
317 lines
12 KiB
C#
using System.Collections.Immutable;
|
|
using Akka.Actor;
|
|
using Akka.Cluster.Tools.Client;
|
|
using Akka.Event;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using ScadaLink.Commons.Interfaces.Repositories;
|
|
using ScadaLink.Commons.Messages.Communication;
|
|
using ScadaLink.Commons.Messages.Health;
|
|
using ScadaLink.HealthMonitoring;
|
|
|
|
namespace ScadaLink.Communication.Actors;
|
|
|
|
/// <summary>
/// Factory seam for the per-site ClusterClient actors. Callers depend on this interface
/// rather than on ClusterClient directly so that tests can supply their own actor refs.
/// </summary>
public interface ISiteClientFactory
{
    /// <summary>
    /// Creates the client actor used to reach a single site.
    /// </summary>
    /// <param name="system">Actor system the client actor is created in.</param>
    /// <param name="siteId">Identifier of the site the client is for.</param>
    /// <param name="contacts">Contact actor paths handed to the client.</param>
    /// <returns>A reference to the created client actor.</returns>
    IActorRef Create(ActorSystem system, string siteId, ImmutableHashSet<ActorPath> contacts);
}
|
|
|
|
/// <summary>
/// Production <see cref="ISiteClientFactory"/>: spawns a real ClusterClient actor per site.
/// </summary>
public class DefaultSiteClientFactory : ISiteClientFactory
{
    /// <summary>
    /// Starts a ClusterClient configured with <paramref name="contacts"/> as its initial
    /// contacts, registered in <paramref name="system"/> under the name <c>site-client-{siteId}</c>.
    /// </summary>
    /// <param name="system">Actor system that hosts the new client actor.</param>
    /// <param name="siteId">Site identifier; becomes part of the actor name.</param>
    /// <param name="contacts">Initial contact paths for the ClusterClient.</param>
    /// <returns>A reference to the newly created ClusterClient actor.</returns>
    public IActorRef Create(ActorSystem system, string siteId, ImmutableHashSet<ActorPath> contacts) =>
        system.ActorOf(
            ClusterClient.Props(ClusterClientSettings.Create(system).WithInitialContacts(contacts)),
            $"site-client-{siteId}");
}
|
|
|
|
/// <summary>
/// Central-side actor that routes messages from central to site clusters via ClusterClient.
/// Resolves site addresses from the database on a periodic refresh cycle and manages
/// per-site ClusterClient instances.
///
/// WP-4: All 8 message patterns routed through this actor.
/// WP-5: Ask timeout on connection drop (no central buffering). Debug streams killed on interruption.
/// </summary>
public class CentralCommunicationActor : ReceiveActor
{
    /// <summary>Actor path on the site side that every routed message is sent to.</summary>
    private const string SiteCommunicationPath = "/user/site-communication";

    /// <summary>Marker of the legacy address format that carried an actor path suffix.</summary>
    private const string LegacyActorPathMarker = "/user/";

    /// <summary>Interval between refreshes of the site address cache from the database.</summary>
    private static readonly TimeSpan RefreshInterval = TimeSpan.FromSeconds(60);

    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly IServiceProvider _serviceProvider;
    private readonly ISiteClientFactory _siteClientFactory;

    /// <summary>
    /// Per-site ClusterClient instances and their contact addresses.
    /// Maps SiteIdentifier → (ClusterClient actor, set of contact address strings).
    /// Refreshed periodically via RefreshSiteAddresses.
    /// </summary>
    private readonly Dictionary<string, (IActorRef Client, ImmutableHashSet<string> ContactAddresses)> _siteClients = new();

    /// <summary>
    /// Tracks active debug view subscriptions: correlationId → (siteId, subscriber).
    /// Used to route incoming debug stream events and to kill debug streams on site
    /// disconnection (WP-5).
    /// </summary>
    private readonly Dictionary<string, (string SiteId, IActorRef Subscriber)> _debugSubscriptions = new();

    /// <summary>
    /// Tracks in-progress deployments: deploymentId → siteId.
    /// On central failover, in-progress deployments are treated as failed (WP-5).
    /// NOTE(review): within this class entries are only removed on site disconnection or
    /// actor stop; no completion path is visible here — confirm this cannot grow unbounded.
    /// </summary>
    private readonly Dictionary<string, string> _inProgressDeployments = new();

    private ICancelable? _refreshSchedule;

    /// <summary>
    /// Internal message produced when loading site addresses from the database fails, so the
    /// failure is logged on the actor thread instead of being dropped as an unhandled message.
    /// </summary>
    private sealed record SiteAddressLoadFailed(Exception Cause);

    /// <summary>
    /// Wires up the message handlers. No I/O happens here; the first address load is
    /// triggered from <see cref="PreStart"/>.
    /// </summary>
    /// <param name="serviceProvider">Root provider used to resolve the site repository and health aggregator.</param>
    /// <param name="siteClientFactory">Factory used to create one client actor per site.</param>
    public CentralCommunicationActor(IServiceProvider serviceProvider, ISiteClientFactory siteClientFactory)
    {
        _serviceProvider = serviceProvider;
        _siteClientFactory = siteClientFactory;

        // Site address cache loaded from database (success and failure outcomes)
        Receive<SiteAddressCacheLoaded>(HandleSiteAddressCacheLoaded);
        Receive<SiteAddressLoadFailed>(HandleSiteAddressLoadFailed);

        // Periodic refresh trigger
        Receive<RefreshSiteAddresses>(_ => LoadSiteAddressesFromDb());

        // Health monitoring: heartbeats and health reports from sites
        Receive<HeartbeatMessage>(HandleHeartbeat);
        Receive<SiteHealthReport>(HandleSiteHealthReport);

        // Connection state changes
        Receive<ConnectionStateChanged>(HandleConnectionStateChanged);

        // Route enveloped messages to sites
        Receive<SiteEnvelope>(HandleSiteEnvelope);

        // Route debug stream events from sites to the correct bridge actor
        Receive<Commons.Messages.DebugView.DebugStreamEvent>(HandleDebugStreamEvent);
    }

    /// <summary>Forwards a site heartbeat to the parent for any interested central actors.</summary>
    private void HandleHeartbeat(HeartbeatMessage heartbeat)
    {
        Context.Parent.Tell(heartbeat);
    }

    /// <summary>
    /// Delivers the inner event of a debug stream message to the subscriber registered for
    /// its correlation id; events without a matching subscription are dropped.
    /// </summary>
    private void HandleDebugStreamEvent(Commons.Messages.DebugView.DebugStreamEvent msg)
    {
        if (_debugSubscriptions.TryGetValue(msg.CorrelationId, out var entry))
        {
            entry.Subscriber.Tell(msg.Event);
        }
        else
        {
            _log.Debug("No debug subscription found for correlationId {0}, dropping event", msg.CorrelationId);
        }
    }

    /// <summary>
    /// Hands a site health report to the central aggregator, if one is registered;
    /// otherwise the report is dropped with a warning.
    /// </summary>
    private void HandleSiteHealthReport(SiteHealthReport report)
    {
        var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
        if (aggregator != null)
        {
            aggregator.ProcessReport(report);
        }
        else
        {
            _log.Warning("ICentralHealthAggregator not available, dropping health report from site {0}", report.SiteId);
        }
    }

    /// <summary>
    /// Reacts to a site connecting or disconnecting. On disconnect, every debug stream for
    /// the site is terminated and every tracked deployment to it is treated as failed (WP-5).
    /// </summary>
    private void HandleConnectionStateChanged(ConnectionStateChanged msg)
    {
        if (msg.IsConnected)
        {
            _log.Info("Site {0} connected at {1}", msg.SiteId, msg.Timestamp);
            return;
        }

        _log.Warning("Site {0} disconnected at {1}", msg.SiteId, msg.Timestamp);

        // WP-5: Kill active debug streams for the disconnected site.
        // Materialize first: the dictionary is modified inside the loop.
        var toRemove = _debugSubscriptions
            .Where(kvp => kvp.Value.SiteId == msg.SiteId)
            .ToList();

        foreach (var kvp in toRemove)
        {
            _log.Info("Killing debug stream {0} for disconnected site {1}", kvp.Key, msg.SiteId);
            kvp.Value.Subscriber.Tell(new DebugStreamTerminated(msg.SiteId, kvp.Key));
            _debugSubscriptions.Remove(kvp.Key);
        }

        // WP-5: Mark in-progress deployments as failed
        var failedDeployments = _inProgressDeployments
            .Where(kvp => kvp.Value == msg.SiteId)
            .Select(kvp => kvp.Key)
            .ToList();

        foreach (var deploymentId in failedDeployments)
        {
            _log.Warning("Deployment {0} to site {1} treated as failed due to disconnection",
                deploymentId, msg.SiteId);
            _inProgressDeployments.Remove(deploymentId);
        }

        // Note: Do NOT stop the ClusterClient — it handles reconnection internally
    }

    /// <summary>
    /// Routes the enveloped message to the addressed site through that site's ClusterClient.
    /// If no client exists for the site the message is dropped with a warning.
    /// </summary>
    private void HandleSiteEnvelope(SiteEnvelope envelope)
    {
        if (!_siteClients.TryGetValue(envelope.SiteId, out var entry))
        {
            _log.Warning("No ClusterClient for site {0}, cannot route message {1}",
                envelope.SiteId, envelope.Message.GetType().Name);

            // The Ask will timeout on the caller side — no central buffering (WP-5)
            return;
        }

        // Track debug subscriptions and deployments for cleanup on disconnect
        TrackMessageForCleanup(envelope);

        // Route via ClusterClient — Sender is preserved for Ask response routing
        entry.Client.Tell(
            new ClusterClient.Send(SiteCommunicationPath, envelope.Message),
            Sender);
    }

    /// <summary>
    /// Loads all sites from the repository on a background task and pipes the result back to
    /// this actor: <see cref="SiteAddressCacheLoaded"/> on success,
    /// <see cref="SiteAddressLoadFailed"/> on failure.
    /// </summary>
    private void LoadSiteAddressesFromDb()
    {
        // Capture Self on the actor thread; it is used after the handler has returned.
        var self = Self;

        Task.Run(async () =>
        {
            using var scope = _serviceProvider.CreateScope();
            var repo = scope.ServiceProvider.GetRequiredService<ISiteRepository>();
            var sites = await repo.GetAllSitesAsync();

            var contacts = new Dictionary<string, List<string>>();
            foreach (var site in sites)
            {
                var addrs = new List<string>(2);
                AddContactAddress(addrs, site.NodeAAddress);
                AddContactAddress(addrs, site.NodeBAddress);

                // Sites without any usable address get no ClusterClient.
                if (addrs.Count > 0)
                {
                    contacts[site.SiteIdentifier] = addrs;
                }
            }

            return new SiteAddressCacheLoaded(contacts);
        }).PipeTo(self, failure: ex => new SiteAddressLoadFailed(ex));
    }

    /// <summary>
    /// Appends <paramref name="address"/> to <paramref name="target"/> unless it is null or
    /// blank, stripping a trailing actor path (legacy format) when one is present.
    /// </summary>
    private static void AddContactAddress(List<string> target, string? address)
    {
        if (string.IsNullOrWhiteSpace(address))
        {
            return;
        }

        // Strip actor path suffix if present (legacy format). The marker is only stripped
        // when it is not at position 0, matching the previous behavior.
        var idx = address.IndexOf(LegacyActorPathMarker, StringComparison.Ordinal);
        target.Add(idx > 0 ? address.Substring(0, idx) : address);
    }

    /// <summary>
    /// Logs a failed address load. The existing clients are kept; the next scheduled
    /// refresh tries again.
    /// </summary>
    private void HandleSiteAddressLoadFailed(SiteAddressLoadFailed msg)
    {
        _log.Warning(msg.Cause,
            "Failed to load site addresses from the database; keeping {0} existing ClusterClient(s) until the next refresh",
            _siteClients.Count);
    }

    /// <summary>
    /// Reconciles the per-site clients with freshly loaded contact data: stops clients of
    /// removed sites, leaves unchanged sites alone, and replaces clients whose contact
    /// addresses changed. A failure for one site does not affect the others.
    /// </summary>
    private void HandleSiteAddressCacheLoaded(SiteAddressCacheLoaded msg)
    {
        // Stop ClusterClients for removed sites (materialized: the dictionary is modified below)
        var removedSiteIds = _siteClients.Keys
            .Where(id => !msg.SiteContacts.ContainsKey(id))
            .ToList();

        foreach (var removed in removedSiteIds)
        {
            if (_siteClients.Remove(removed, out var stale))
            {
                _log.Info("Stopping ClusterClient for removed site {0}", removed);
                Context.Stop(stale.Client);
            }
        }

        // Add or update
        foreach (var (siteId, addresses) in msg.SiteContacts)
        {
            var contactStrings = addresses.ToImmutableHashSet();
            var hasExisting = _siteClients.TryGetValue(siteId, out var existing);

            // Skip if unchanged
            if (hasExisting && existing.ContactAddresses.SetEquals(contactStrings))
            {
                continue;
            }

            // Stop old client if addresses changed. The entry is removed right away so a
            // stopped client is never used for routing if re-creation fails below.
            if (hasExisting)
            {
                _log.Info("Updating ClusterClient for site {0} (addresses changed)", siteId);
                Context.Stop(existing.Client);
                _siteClients.Remove(siteId);
            }

            try
            {
                var contactPaths = addresses
                    .Select(a => ActorPath.Parse($"{a}/system/receptionist"))
                    .ToImmutableHashSet();

                var client = _siteClientFactory.Create(Context.System, siteId, contactPaths);
                _siteClients[siteId] = (client, contactStrings);
                _log.Info("Created ClusterClient for site {0} with {1} contact(s)", siteId, addresses.Count);
            }
            catch (InvalidActorNameException ex)
            {
                // Stopping an actor is asynchronous, so the previous client's name can still
                // be reserved when its replacement is created. Leave the site without a
                // client; the next periodic refresh creates it once the name is free.
                _log.Warning(ex,
                    "Could not create ClusterClient for site {0} yet; will retry on the next refresh",
                    siteId);
            }
            catch (UriFormatException ex)
            {
                // A malformed address must not abort the refresh for every other site.
                _log.Warning(ex,
                    "Skipping site {0}: a contact address could not be parsed as an actor path",
                    siteId);
            }
        }

        _log.Info("Site ClusterClient cache refreshed with {0} site(s)", _siteClients.Count);
    }

    /// <summary>
    /// Records bookkeeping for messages that need cleanup later: debug view subscriptions
    /// are added/removed by correlation id and deployments are recorded by deployment id.
    /// Other message types are ignored.
    /// </summary>
    private void TrackMessageForCleanup(SiteEnvelope envelope)
    {
        switch (envelope.Message)
        {
            case Commons.Messages.DebugView.SubscribeDebugViewRequest sub:
                _debugSubscriptions[sub.CorrelationId] = (envelope.SiteId, Sender);
                break;

            case Commons.Messages.DebugView.UnsubscribeDebugViewRequest unsub:
                _debugSubscriptions.Remove(unsub.CorrelationId);
                break;

            case Commons.Messages.Deployment.DeployInstanceCommand deploy:
                _inProgressDeployments[deploy.DeploymentId] = envelope.SiteId;
                break;
        }
    }

    /// <summary>
    /// Starts the periodic address refresh; the first refresh fires immediately.
    /// </summary>
    protected override void PreStart()
    {
        _log.Info("CentralCommunicationActor started");

        // Schedule periodic refresh of site addresses from the database
        _refreshSchedule = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
            TimeSpan.Zero,
            RefreshInterval,
            Self,
            new RefreshSiteAddresses(),
            ActorRefs.NoSender);
    }

    /// <summary>
    /// Cancels the refresh schedule, stops the per-site clients this actor created and
    /// clears all tracking state.
    /// </summary>
    protected override void PostStop()
    {
        _log.Info("CentralCommunicationActor stopped. In-progress deployments treated as failed (WP-5).");
        _refreshSchedule?.Cancel();

        // The clients are created through the factory and are not necessarily children of
        // this actor, so they are stopped explicitly; otherwise they would outlive it and
        // keep their actor names reserved.
        foreach (var entry in _siteClients.Values)
        {
            Context.Stop(entry.Client);
        }

        _siteClients.Clear();

        // On central failover, all in-progress deployments are failed
        _inProgressDeployments.Clear();
        _debugSubscriptions.Clear();
    }
}
|
|
|
|
/// <summary>
/// Trigger message: asks the receiving actor to reload the site addresses from the database.
/// Carries no data.
/// </summary>
public record RefreshSiteAddresses;
|
|
|
|
/// <summary>
/// Internal result message holding the site contact data read from the database.
/// The ClusterClients themselves are created later, on the actor thread, in
/// HandleSiteAddressCacheLoaded.
/// </summary>
/// <param name="SiteContacts">Contact address strings keyed by site identifier.</param>
internal record SiteAddressCacheLoaded(Dictionary<string, List<string>> SiteContacts);
|
|
|
|
/// <summary>
/// Tells a debug view subscriber that its stream has ended because the site
/// disconnected (WP-5).
/// </summary>
/// <param name="SiteId">Identifier of the site that disconnected.</param>
/// <param name="CorrelationId">Correlation id of the terminated debug subscription.</param>
public record DebugStreamTerminated(string SiteId, string CorrelationId);
|