fix(health): replicate site health reports between central nodes
CentralHealthAggregator is a per-node hosted singleton, but site health reports flow through ClusterClient which round-robins each report to one central node only. The other node's aggregator never saw those reports and marked sites offline at the 60s threshold — sites constantly flapped between online and offline on the monitoring page. On receive, the active CentralCommunicationActor now republishes a SiteHealthReportReplica wrapper on a DistributedPubSub topic. Both central nodes subscribe to the topic and process replicas through a dedicated path that updates the local aggregator without re-broadcasting (avoids fan-out loops). The aggregator's existing sequence-number idempotency makes self-delivery a cheap no-op. DistributedPubSubExtensionProvider is now listed in the HOCON `akka.extensions` block so the mediator is initialised at cluster start, eliminating a race where the first Subscribe arrived before the extension was loaded.
This commit is contained in:
@@ -21,3 +21,13 @@ public record SiteHealthReport(
|
|||||||
IReadOnlyDictionary<string, TagQualityCounts>? DataConnectionTagQuality = null,
|
IReadOnlyDictionary<string, TagQualityCounts>? DataConnectionTagQuality = null,
|
||||||
int ParkedMessageCount = 0,
|
int ParkedMessageCount = 0,
|
||||||
IReadOnlyList<NodeStatus>? ClusterNodes = null);
|
IReadOnlyList<NodeStatus>? ClusterNodes = null);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Broadcast wrapper used between central nodes to keep per-node
|
||||||
|
/// CentralHealthAggregator state in sync. ClusterClient load-balances each
|
||||||
|
/// incoming SiteHealthReport to one central node; that node re-publishes
|
||||||
|
/// this wrapper on a DistributedPubSub topic so the peer node's aggregator
|
||||||
|
/// also processes the report (idempotently — sequence numbers guard against
|
||||||
|
/// double-counting).
|
||||||
|
/// </summary>
|
||||||
|
public record SiteHealthReportReplica(SiteHealthReport Report);
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
using System.Collections.Immutable;
|
using System.Collections.Immutable;
|
||||||
using Akka.Actor;
|
using Akka.Actor;
|
||||||
using Akka.Cluster.Tools.Client;
|
using Akka.Cluster.Tools.Client;
|
||||||
|
using Akka.Cluster.Tools.PublishSubscribe;
|
||||||
using Akka.Event;
|
using Akka.Event;
|
||||||
using Microsoft.Extensions.DependencyInjection;
|
using Microsoft.Extensions.DependencyInjection;
|
||||||
using ScadaLink.Commons.Interfaces.Repositories;
|
using ScadaLink.Commons.Interfaces.Repositories;
|
||||||
@@ -65,6 +66,13 @@ public class CentralCommunicationActor : ReceiveActor
|
|||||||
|
|
||||||
private ICancelable? _refreshSchedule;
|
private ICancelable? _refreshSchedule;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// DistributedPubSub topic used to fan health reports out to the peer
|
||||||
|
/// central node so both per-node aggregators stay in sync. See
|
||||||
|
/// <see cref="SiteHealthReportReplica"/> for the protocol rationale.
|
||||||
|
/// </summary>
|
||||||
|
private const string HealthReportTopic = "site-health-replica";
|
||||||
|
|
||||||
public CentralCommunicationActor(IServiceProvider serviceProvider, ISiteClientFactory siteClientFactory)
|
public CentralCommunicationActor(IServiceProvider serviceProvider, ISiteClientFactory siteClientFactory)
|
||||||
{
|
{
|
||||||
_serviceProvider = serviceProvider;
|
_serviceProvider = serviceProvider;
|
||||||
@@ -79,6 +87,8 @@ public class CentralCommunicationActor : ReceiveActor
|
|||||||
// Health monitoring: heartbeats and health reports from sites
|
// Health monitoring: heartbeats and health reports from sites
|
||||||
Receive<HeartbeatMessage>(HandleHeartbeat);
|
Receive<HeartbeatMessage>(HandleHeartbeat);
|
||||||
Receive<SiteHealthReport>(HandleSiteHealthReport);
|
Receive<SiteHealthReport>(HandleSiteHealthReport);
|
||||||
|
Receive<SiteHealthReportReplica>(r => ProcessLocally(r.Report));
|
||||||
|
Receive<SubscribeAck>(_ => { /* DistributedPubSub subscribe confirmation */ });
|
||||||
|
|
||||||
// Connection state changes
|
// Connection state changes
|
||||||
Receive<ConnectionStateChanged>(HandleConnectionStateChanged);
|
Receive<ConnectionStateChanged>(HandleConnectionStateChanged);
|
||||||
@@ -94,7 +104,32 @@ public class CentralCommunicationActor : ReceiveActor
|
|||||||
Context.Parent.Tell(heartbeat);
|
Context.Parent.Tell(heartbeat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Handles a report delivered directly from a site (via ClusterClient):
|
||||||
|
/// process locally, then fan out to the peer central node so its
|
||||||
|
/// aggregator stays in sync.
|
||||||
|
/// </summary>
|
||||||
private void HandleSiteHealthReport(SiteHealthReport report)
|
private void HandleSiteHealthReport(SiteHealthReport report)
|
||||||
|
{
|
||||||
|
ProcessLocally(report);
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
DistributedPubSub.Get(Context.System).Mediator.Tell(
|
||||||
|
new Publish(HealthReportTopic, new SiteHealthReportReplica(report)));
|
||||||
|
}
|
||||||
|
catch
|
||||||
|
{
|
||||||
|
// No-op in non-clustered hosts (TestKit).
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Applies a report to the local aggregator without re-broadcasting.
|
||||||
|
/// Used for both site-originated reports and peer-replicated ones — the
|
||||||
|
/// aggregator is idempotent via sequence-number comparison.
|
||||||
|
/// </summary>
|
||||||
|
private void ProcessLocally(SiteHealthReport report)
|
||||||
{
|
{
|
||||||
var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
|
var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
|
||||||
if (aggregator != null)
|
if (aggregator != null)
|
||||||
@@ -265,6 +300,20 @@ public class CentralCommunicationActor : ReceiveActor
|
|||||||
{
|
{
|
||||||
_log.Info("CentralCommunicationActor started");
|
_log.Info("CentralCommunicationActor started");
|
||||||
|
|
||||||
|
// Subscribe to the peer-replication topic so we receive health reports
|
||||||
|
// delivered to the other central node and keep our local aggregator
|
||||||
|
// in sync (ClusterClient load-balances reports across nodes).
|
||||||
|
// Tolerant of non-clustered hosts (TestKit) where the extension is absent.
|
||||||
|
try
|
||||||
|
{
|
||||||
|
DistributedPubSub.Get(Context.System).Mediator.Tell(
|
||||||
|
new Subscribe(HealthReportTopic, Self));
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
_log.Debug("DistributedPubSub not available — peer health replication disabled: {0}", ex.Message);
|
||||||
|
}
|
||||||
|
|
||||||
// Schedule periodic refresh of site addresses from the database
|
// Schedule periodic refresh of site addresses from the database
|
||||||
_refreshSchedule = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
|
_refreshSchedule = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
|
||||||
TimeSpan.Zero,
|
TimeSpan.Zero,
|
||||||
|
|||||||
@@ -69,6 +69,9 @@ public class AkkaHostedService : IHostedService
|
|||||||
|
|
||||||
var hocon = $@"
|
var hocon = $@"
|
||||||
akka {{
|
akka {{
|
||||||
|
extensions = [
|
||||||
|
""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
|
||||||
|
]
|
||||||
actor {{
|
actor {{
|
||||||
provider = cluster
|
provider = cluster
|
||||||
}}
|
}}
|
||||||
|
|||||||
Reference in New Issue
Block a user