fix(health): route site heartbeats into the aggregator
CentralCommunicationActor.HandleHeartbeat was forwarding each incoming HeartbeatMessage to Context.Parent, which resolves to the /user guardian — a non-actor. Every site heartbeat went straight to dead letters (~1026 per central node per 30 minutes at the default ~2s interval across three sites). The aggregator now exposes MarkHeartbeat(siteId, receivedAt) which bumps LastReportReceivedAt on already-known sites (and clears IsOnline if it had flipped) without touching LatestReport. Heartbeats from unregistered sites are dropped — first registration still happens on the first full report. CentralCommunicationActor calls this in place of the no-op Tell. The result: heartbeats now serve their stated health-monitoring purpose (per CLAUDE.md) by keeping a site marked online between the 30s full reports if a single report is briefly delayed, and the dead letter noise disappears entirely.
This commit is contained in:
@@ -100,8 +100,8 @@ public class CentralCommunicationActor : ReceiveActor
|
|||||||
|
|
||||||
private void HandleHeartbeat(HeartbeatMessage heartbeat)
|
private void HandleHeartbeat(HeartbeatMessage heartbeat)
|
||||||
{
|
{
|
||||||
// Forward heartbeat to parent for any interested central actors
|
var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
|
||||||
Context.Parent.Tell(heartbeat);
|
aggregator?.MarkHeartbeat(heartbeat.SiteId, heartbeat.Timestamp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
|
|||||||
@@ -76,6 +76,26 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||||
|
/// SiteHealthReport. Heartbeats from sites we have not yet received a
|
||||||
|
/// full report from are ignored — registration only happens on report.
|
||||||
|
/// </summary>
|
||||||
|
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
|
||||||
|
{
|
||||||
|
if (!_siteStates.TryGetValue(siteId, out var state))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (receivedAt > state.LastReportReceivedAt)
|
||||||
|
state.LastReportReceivedAt = receivedAt;
|
||||||
|
|
||||||
|
if (!state.IsOnline)
|
||||||
|
{
|
||||||
|
state.IsOnline = true;
|
||||||
|
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the current health state for all known sites.
|
/// Get the current health state for all known sites.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -9,6 +9,15 @@ namespace ScadaLink.HealthMonitoring;
|
|||||||
public interface ICentralHealthAggregator
|
public interface ICentralHealthAggregator
|
||||||
{
|
{
|
||||||
void ProcessReport(SiteHealthReport report);
|
void ProcessReport(SiteHealthReport report);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||||
|
/// SiteHealthReport. Used to keep a site marked online between full
|
||||||
|
/// 30s reports when ~2s heartbeats are arriving — protects against the
|
||||||
|
/// 60s offline threshold firing on a transiently delayed report.
|
||||||
|
/// </summary>
|
||||||
|
void MarkHeartbeat(string siteId, DateTimeOffset receivedAt);
|
||||||
|
|
||||||
IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates();
|
IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates();
|
||||||
SiteHealthState? GetSiteState(string siteId);
|
SiteHealthState? GetSiteState(string siteId);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ using ScadaLink.Commons.Messages.Deployment;
|
|||||||
using ScadaLink.Commons.Messages.DebugView;
|
using ScadaLink.Commons.Messages.DebugView;
|
||||||
using ScadaLink.Commons.Messages.Health;
|
using ScadaLink.Commons.Messages.Health;
|
||||||
using ScadaLink.Communication.Actors;
|
using ScadaLink.Communication.Actors;
|
||||||
|
using ScadaLink.HealthMonitoring;
|
||||||
using Akka.TestKit;
|
using Akka.TestKit;
|
||||||
|
|
||||||
namespace ScadaLink.Communication.Tests;
|
namespace ScadaLink.Communication.Tests;
|
||||||
@@ -140,25 +141,27 @@ public class CentralCommunicationActorTests : TestKit
|
|||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void Heartbeat_ForwardedToParent()
|
public void Heartbeat_BumpsAggregatorTimestamp()
|
||||||
{
|
{
|
||||||
var mockRepo = Substitute.For<ISiteRepository>();
|
var mockRepo = Substitute.For<ISiteRepository>();
|
||||||
mockRepo.GetAllSitesAsync(Arg.Any<CancellationToken>())
|
mockRepo.GetAllSitesAsync(Arg.Any<CancellationToken>())
|
||||||
.Returns(new List<Site>());
|
.Returns(new List<Site>());
|
||||||
|
|
||||||
|
var aggregator = Substitute.For<ICentralHealthAggregator>();
|
||||||
|
|
||||||
var services = new ServiceCollection();
|
var services = new ServiceCollection();
|
||||||
services.AddScoped(_ => mockRepo);
|
services.AddScoped(_ => mockRepo);
|
||||||
|
services.AddSingleton(aggregator);
|
||||||
var sp = services.BuildServiceProvider();
|
var sp = services.BuildServiceProvider();
|
||||||
|
|
||||||
var siteClientFactory = Substitute.For<ISiteClientFactory>();
|
var siteClientFactory = Substitute.For<ISiteClientFactory>();
|
||||||
var parentProbe = CreateTestProbe();
|
var centralActor = Sys.ActorOf(
|
||||||
var centralActor = parentProbe.ChildActorOf(
|
|
||||||
Props.Create(() => new CentralCommunicationActor(sp, siteClientFactory)));
|
Props.Create(() => new CentralCommunicationActor(sp, siteClientFactory)));
|
||||||
|
|
||||||
var heartbeat = new HeartbeatMessage("site1", "host1", true, DateTimeOffset.UtcNow);
|
var timestamp = DateTimeOffset.UtcNow;
|
||||||
centralActor.Tell(heartbeat);
|
centralActor.Tell(new HeartbeatMessage("site1", "host1", true, timestamp));
|
||||||
|
|
||||||
parentProbe.ExpectMsg<HeartbeatMessage>(msg => msg.SiteId == "site1");
|
AwaitAssert(() => aggregator.Received(1).MarkHeartbeat("site1", timestamp));
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
|
|||||||
@@ -25,6 +25,7 @@
|
|||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ProjectReference Include="../../src/ScadaLink.Communication/ScadaLink.Communication.csproj" />
|
<ProjectReference Include="../../src/ScadaLink.Communication/ScadaLink.Communication.csproj" />
|
||||||
<ProjectReference Include="../../src/ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
<ProjectReference Include="../../src/ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
||||||
|
<ProjectReference Include="../../src/ScadaLink.HealthMonitoring/ScadaLink.HealthMonitoring.csproj" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
Reference in New Issue
Block a user