From f66dc031a4e8f29f9939d5fb359d5f6c2de69754 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 13 May 2026 08:11:43 -0400 Subject: [PATCH] fix(health): route site heartbeats into the aggregator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CentralCommunicationActor.HandleHeartbeat was forwarding each incoming HeartbeatMessage to Context.Parent, which resolves to the /user guardian — a non-actor. Every site heartbeat went straight to dead letters (~1026 per central node per 30 minutes at the default ~2s interval across three sites). The aggregator now exposes MarkHeartbeat(siteId, receivedAt) which bumps LastReportReceivedAt on already-known sites (and clears IsOnline if it had flipped) without touching LatestReport. Heartbeats from unregistered sites are dropped — first registration still happens on the first full report. CentralCommunicationActor calls this in place of the no-op Tell. The result: heartbeats now serve their stated health-monitoring purpose (per CLAUDE.md) by keeping a site marked online between the 30s full reports if a single report is briefly delayed, and the dead letter noise disappears entirely. --- .../Actors/CentralCommunicationActor.cs | 4 ++-- .../CentralHealthAggregator.cs | 20 +++++++++++++++++++ .../ICentralHealthAggregator.cs | 9 +++++++++ .../CentralCommunicationActorTests.cs | 15 ++++++++------ .../ScadaLink.Communication.Tests.csproj | 1 + 5 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs index 13f12d5..7d027df 100644 --- a/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs +++ b/src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs @@ -100,8 +100,8 @@ public class CentralCommunicationActor : ReceiveActor private void HandleHeartbeat(HeartbeatMessage heartbeat) { - // Forward heartbeat to parent for any interested central actors - Context.Parent.Tell(heartbeat); + var aggregator = _serviceProvider.GetService(); + aggregator?.MarkHeartbeat(heartbeat.SiteId, heartbeat.Timestamp); } /// diff --git a/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs b/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs index 67518e4..25601cb 100644 --- a/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs +++ b/src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs @@ -76,6 +76,26 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat }); } + /// + /// Bumps the last-seen timestamp for a site already known via a prior + /// SiteHealthReport. Heartbeats from sites we have not yet received a + /// full report from are ignored — registration only happens on report. + /// + public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) + { + if (!_siteStates.TryGetValue(siteId, out var state)) + return; + + if (receivedAt > state.LastReportReceivedAt) + state.LastReportReceivedAt = receivedAt; + + if (!state.IsOnline) + { + state.IsOnline = true; + _logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId); + } + } + /// /// Get the current health state for all known sites. /// diff --git a/src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs b/src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs index 4335caa..fcfcacc 100644 --- a/src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs +++ b/src/ScadaLink.HealthMonitoring/ICentralHealthAggregator.cs @@ -9,6 +9,15 @@ namespace ScadaLink.HealthMonitoring; public interface ICentralHealthAggregator { void ProcessReport(SiteHealthReport report); + + /// + /// Bumps the last-seen timestamp for a site already known via a prior + /// SiteHealthReport. Used to keep a site marked online between full + /// 30s reports when ~2s heartbeats are arriving — protects against the + /// 60s offline threshold firing on a transiently delayed report. + /// + void MarkHeartbeat(string siteId, DateTimeOffset receivedAt); + IReadOnlyDictionary GetAllSiteStates(); SiteHealthState? GetSiteState(string siteId); } diff --git a/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs b/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs index 010ee3b..e0301f0 100644 --- a/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs +++ b/tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs @@ -11,6 +11,7 @@ using ScadaLink.Commons.Messages.Deployment; using ScadaLink.Commons.Messages.DebugView; using ScadaLink.Commons.Messages.Health; using ScadaLink.Communication.Actors; +using ScadaLink.HealthMonitoring; using Akka.TestKit; namespace ScadaLink.Communication.Tests; @@ -140,25 +141,27 @@ public class CentralCommunicationActorTests : TestKit } [Fact] - public void Heartbeat_ForwardedToParent() + public void Heartbeat_BumpsAggregatorTimestamp() { var mockRepo = Substitute.For(); mockRepo.GetAllSitesAsync(Arg.Any()) .Returns(new List()); + var aggregator = Substitute.For(); + var services = new ServiceCollection(); services.AddScoped(_ => mockRepo); + services.AddSingleton(aggregator); var sp = services.BuildServiceProvider(); var siteClientFactory = Substitute.For(); - var parentProbe = CreateTestProbe(); - var centralActor = parentProbe.ChildActorOf( + var centralActor = Sys.ActorOf( Props.Create(() => new CentralCommunicationActor(sp, siteClientFactory))); - var heartbeat = new HeartbeatMessage("site1", "host1", true, DateTimeOffset.UtcNow); - centralActor.Tell(heartbeat); + var timestamp = DateTimeOffset.UtcNow; + centralActor.Tell(new HeartbeatMessage("site1", "host1", true, timestamp)); - parentProbe.ExpectMsg(msg => msg.SiteId == "site1"); + AwaitAssert(() => aggregator.Received(1).MarkHeartbeat("site1", timestamp)); } [Fact] diff --git a/tests/ScadaLink.Communication.Tests/ScadaLink.Communication.Tests.csproj b/tests/ScadaLink.Communication.Tests/ScadaLink.Communication.Tests.csproj index 2a75120..abc59aa 100644 --- a/tests/ScadaLink.Communication.Tests/ScadaLink.Communication.Tests.csproj +++ b/tests/ScadaLink.Communication.Tests/ScadaLink.Communication.Tests.csproj @@ -25,6 +25,7 @@ +