fix(health): route site heartbeats into the aggregator

CentralCommunicationActor.HandleHeartbeat was forwarding each incoming
HeartbeatMessage to Context.Parent, which resolves to the /user
guardian — a non-actor. Every site heartbeat went straight to dead
letters (~1026 per central node per 30 minutes at the default ~2s
interval across three sites).

The aggregator now exposes MarkHeartbeat(siteId, receivedAt) which
bumps LastReportReceivedAt on already-known sites (and clears IsOnline
if it had flipped) without touching LatestReport. Heartbeats from
unregistered sites are dropped — first registration still happens on
the first full report. CentralCommunicationActor calls this in place
of the no-op Tell.

The result: heartbeats now serve their stated health-monitoring
purpose (per CLAUDE.md) by keeping a site marked online between the
30s full reports if a single report is briefly delayed, and the dead
letter noise disappears entirely.
This commit is contained in:
Joseph Doherty
2026-05-13 08:11:43 -04:00
parent 7bba48a14a
commit f66dc031a4
5 changed files with 41 additions and 8 deletions

View File

@@ -100,8 +100,8 @@ public class CentralCommunicationActor : ReceiveActor
private void HandleHeartbeat(HeartbeatMessage heartbeat) private void HandleHeartbeat(HeartbeatMessage heartbeat)
{ {
// Forward heartbeat to parent for any interested central actors var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
Context.Parent.Tell(heartbeat); aggregator?.MarkHeartbeat(heartbeat.SiteId, heartbeat.Timestamp);
} }
/// <summary> /// <summary>

View File

@@ -76,6 +76,26 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
}); });
} }
/// <summary>
/// Bumps the last-seen timestamp for a site already known via a prior
/// SiteHealthReport. Heartbeats from sites we have not yet received a
/// full report from are ignored — registration only happens on report.
/// </summary>
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
{
if (!_siteStates.TryGetValue(siteId, out var state))
return;
if (receivedAt > state.LastReportReceivedAt)
state.LastReportReceivedAt = receivedAt;
if (!state.IsOnline)
{
state.IsOnline = true;
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
}
}
/// <summary> /// <summary>
/// Get the current health state for all known sites. /// Get the current health state for all known sites.
/// </summary> /// </summary>

View File

@@ -9,6 +9,15 @@ namespace ScadaLink.HealthMonitoring;
public interface ICentralHealthAggregator public interface ICentralHealthAggregator
{ {
void ProcessReport(SiteHealthReport report); void ProcessReport(SiteHealthReport report);
/// <summary>
/// Bumps the last-seen timestamp for a site already known via a prior
/// SiteHealthReport. Used to keep a site marked online between full
/// 30s reports when ~2s heartbeats are arriving — protects against the
/// 60s offline threshold firing on a transiently delayed report.
/// </summary>
void MarkHeartbeat(string siteId, DateTimeOffset receivedAt);
IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates(); IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates();
SiteHealthState? GetSiteState(string siteId); SiteHealthState? GetSiteState(string siteId);
} }

View File

@@ -11,6 +11,7 @@ using ScadaLink.Commons.Messages.Deployment;
using ScadaLink.Commons.Messages.DebugView; using ScadaLink.Commons.Messages.DebugView;
using ScadaLink.Commons.Messages.Health; using ScadaLink.Commons.Messages.Health;
using ScadaLink.Communication.Actors; using ScadaLink.Communication.Actors;
using ScadaLink.HealthMonitoring;
using Akka.TestKit; using Akka.TestKit;
namespace ScadaLink.Communication.Tests; namespace ScadaLink.Communication.Tests;
@@ -140,25 +141,27 @@ public class CentralCommunicationActorTests : TestKit
} }
[Fact] [Fact]
public void Heartbeat_ForwardedToParent() public void Heartbeat_BumpsAggregatorTimestamp()
{ {
var mockRepo = Substitute.For<ISiteRepository>(); var mockRepo = Substitute.For<ISiteRepository>();
mockRepo.GetAllSitesAsync(Arg.Any<CancellationToken>()) mockRepo.GetAllSitesAsync(Arg.Any<CancellationToken>())
.Returns(new List<Site>()); .Returns(new List<Site>());
var aggregator = Substitute.For<ICentralHealthAggregator>();
var services = new ServiceCollection(); var services = new ServiceCollection();
services.AddScoped(_ => mockRepo); services.AddScoped(_ => mockRepo);
services.AddSingleton(aggregator);
var sp = services.BuildServiceProvider(); var sp = services.BuildServiceProvider();
var siteClientFactory = Substitute.For<ISiteClientFactory>(); var siteClientFactory = Substitute.For<ISiteClientFactory>();
var parentProbe = CreateTestProbe(); var centralActor = Sys.ActorOf(
var centralActor = parentProbe.ChildActorOf(
Props.Create(() => new CentralCommunicationActor(sp, siteClientFactory))); Props.Create(() => new CentralCommunicationActor(sp, siteClientFactory)));
var heartbeat = new HeartbeatMessage("site1", "host1", true, DateTimeOffset.UtcNow); var timestamp = DateTimeOffset.UtcNow;
centralActor.Tell(heartbeat); centralActor.Tell(new HeartbeatMessage("site1", "host1", true, timestamp));
parentProbe.ExpectMsg<HeartbeatMessage>(msg => msg.SiteId == "site1"); AwaitAssert(() => aggregator.Received(1).MarkHeartbeat("site1", timestamp));
} }
[Fact] [Fact]

View File

@@ -25,6 +25,7 @@
<ItemGroup> <ItemGroup>
<ProjectReference Include="../../src/ScadaLink.Communication/ScadaLink.Communication.csproj" /> <ProjectReference Include="../../src/ScadaLink.Communication/ScadaLink.Communication.csproj" />
<ProjectReference Include="../../src/ScadaLink.Commons/ScadaLink.Commons.csproj" /> <ProjectReference Include="../../src/ScadaLink.Commons/ScadaLink.Commons.csproj" />
<ProjectReference Include="../../src/ScadaLink.HealthMonitoring/ScadaLink.HealthMonitoring.csproj" />
</ItemGroup> </ItemGroup>
</Project> </Project>