fix: only active singleton node sends health reports
Both nodes of a site cluster were sending health reports. The standby node (which does not host the DeploymentManager singleton) reported 0 instances and no connections, overwriting the active node's data in the aggregator. Added an IsActiveNode flag to ISiteHealthCollector, set by DeploymentManagerActor on PreStart/PostStop. HealthReportSender now skips sending when the node is not active. Also made sure EnsureDclConnections is called during startup batch creation so that data connections survive container restarts.
This commit is contained in:
@@ -49,9 +49,10 @@ public class HealthReportSender : BackgroundService
|
||||
{
|
||||
try
|
||||
{
|
||||
// TODO: Wire S&F buffer depths when StoreAndForward service is available in DI
|
||||
// e.g., var depths = await _bufferDepthProvider.GetDepthsAsync();
|
||||
// _collector.SetStoreAndForwardDepths(depths);
|
||||
// Only the active node (running the DeploymentManager singleton) sends health reports.
|
||||
// The standby node has no instance/connection data and would overwrite the active's report.
|
||||
if (!_collector.IsActiveNode)
|
||||
continue;
|
||||
|
||||
var seq = Interlocked.Increment(ref _sequenceNumber);
|
||||
var report = _collector.CollectReport(_siteId);
|
||||
|
||||
@@ -17,5 +17,7 @@ public interface ISiteHealthCollector
|
||||
/// <summary>
/// Reports tag-resolution figures for one connection.
/// NOTE(review): meaning inferred from the parameter names only — confirm against the implementation.
/// </summary>
/// <param name="connectionName">Name of the connection the figures belong to.</param>
/// <param name="totalSubscribed">Presumably the total number of subscribed tags — confirm.</param>
/// <param name="successfullyResolved">Presumably how many of those tags resolved successfully — confirm.</param>
void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved);
|
||||
/// <summary>
/// Supplies the current store-and-forward buffer depths to the collector.
/// NOTE(review): what the dictionary keys identify is not visible here — confirm with the caller.
/// </summary>
/// <param name="depths">Buffer depth per key.</param>
void SetStoreAndForwardDepths(IReadOnlyDictionary<string, int> depths);
|
||||
/// <summary>
/// Sets the deployed, enabled and disabled instance counts held by the collector.
/// </summary>
/// <param name="deployed">Number of deployed instances.</param>
/// <param name="enabled">Number of enabled instances.</param>
/// <param name="disabled">Number of disabled instances.</param>
void SetInstanceCounts(int deployed, int enabled, int disabled);
|
||||
/// <summary>
/// Marks whether this node is the active node of the site cluster, i.e. the one
/// running the DeploymentManager singleton.
/// NOTE(review): expected to be set to true when the singleton starts and false when it stops — confirm in DeploymentManagerActor.
/// </summary>
/// <param name="isActive"><c>true</c> if this node is the active node; otherwise <c>false</c>.</param>
void SetActiveNode(bool isActive);
|
||||
/// <summary>
/// Gets whether this node is the active node (the one running the DeploymentManager singleton).
/// HealthReportSender checks this flag and skips sending a report while it is <c>false</c>.
/// </summary>
bool IsActiveNode { get; }
|
||||
/// <summary>
/// Produces the health report for the given site.
/// NOTE(review): the implementation's doc mentions resetting interval counters on collect — confirm this applies here.
/// </summary>
/// <param name="siteId">Identifier of the site the report is collected for.</param>
/// <returns>The collected <see cref="SiteHealthReport"/>.</returns>
SiteHealthReport CollectReport(string siteId);
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ public class SiteHealthCollector : ISiteHealthCollector
|
||||
private readonly ConcurrentDictionary<string, TagResolutionStatus> _tagResolutionCounts = new();
|
||||
private IReadOnlyDictionary<string, int> _sfBufferDepths = new Dictionary<string, int>();
|
||||
private int _deployedInstanceCount, _enabledInstanceCount, _disabledInstanceCount;
|
||||
// Backing flag for IsActiveNode; assigned by SetActiveNode.
// Declared volatile because it is written and read without taking a lock.
private volatile bool _isActiveNode;
|
||||
|
||||
/// <summary>
|
||||
/// Increment the script error counter. Covers unhandled exceptions,
|
||||
@@ -90,6 +91,10 @@ public class SiteHealthCollector : ISiteHealthCollector
|
||||
Interlocked.Exchange(ref _disabledInstanceCount, disabled);
|
||||
}
|
||||
|
||||
/// <summary>
/// Records whether this node is currently the active node of the site cluster.
/// </summary>
/// <param name="isActive"><c>true</c> to mark this node as active; <c>false</c> to mark it as not active.</param>
public void SetActiveNode(bool isActive)
{
    _isActiveNode = isActive;
}
|
||||
|
||||
/// <summary>
/// Gets the value most recently stored by <see cref="SetActiveNode"/>;
/// <c>false</c> until it has been set.
/// </summary>
public bool IsActiveNode
{
    get { return _isActiveNode; }
}
|
||||
|
||||
/// <summary>
|
||||
/// Collect the current health report for the site and reset interval counters.
|
||||
/// Connection statuses and tag resolution counts are NOT reset (they reflect current state).
|
||||
|
||||
Reference in New Issue
Block a user