fix(health-monitoring): resolve HealthMonitoring-001/002 — populate S&F buffer depth, make SiteHealthState immutable

This commit is contained in:
Joseph Doherty
2026-05-16 19:40:40 -04:00
parent 340a70f0e6
commit 7d7214a4ca
7 changed files with 287 additions and 60 deletions

View File

@@ -33,16 +33,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
/// Only replaces stored state if incoming sequence number is greater than last received.
/// Auto-marks previously offline sites as online.
/// </summary>
/// <remarks>
/// <see cref="SiteHealthState"/> is immutable: each transition produces a brand-new
/// instance, and the dictionary entry is replaced atomically. The transition is
/// performed in a compare-and-swap retry loop rather than via the
/// <c>AddOrUpdate</c> update delegate, so the sequence-number guard and the
/// construction of the replacement state are always evaluated against the value
/// actually installed — the <c>AddOrUpdate</c> delegate may be invoked more than
/// once under contention and could otherwise act on a value that is then discarded.
/// </remarks>
public void ProcessReport(SiteHealthReport report)
{
var now = _timeProvider.GetUtcNow();
_siteStates.AddOrUpdate(
report.SiteId,
_ =>
while (true)
{
if (!_siteStates.TryGetValue(report.SiteId, out var existing))
{
_logger.LogInformation("Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
return new SiteHealthState
var registered = new SiteHealthState
{
SiteId = report.SiteId,
LatestReport = report,
@@ -51,50 +59,84 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
LastSequenceNumber = report.SequenceNumber,
IsOnline = true
};
},
(_, existing) =>
if (_siteStates.TryAdd(report.SiteId, registered))
{
_logger.LogInformation(
"Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
return;
}
// Lost the race — another thread registered first; retry as an update.
continue;
}
if (report.SequenceNumber <= existing.LastSequenceNumber)
{
if (report.SequenceNumber <= existing.LastSequenceNumber)
_logger.LogDebug(
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
return;
}
var updated = existing with
{
LatestReport = report,
LastReportReceivedAt = now,
LastHeartbeatAt = now,
LastSequenceNumber = report.SequenceNumber,
IsOnline = true
};
if (_siteStates.TryUpdate(report.SiteId, updated, existing))
{
if (!existing.IsOnline)
{
_logger.LogDebug(
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
return existing;
_logger.LogInformation(
"Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
}
return;
}
var wasOffline = !existing.IsOnline;
existing.LatestReport = report;
existing.LastReportReceivedAt = now;
existing.LastHeartbeatAt = now;
existing.LastSequenceNumber = report.SequenceNumber;
existing.IsOnline = true;
if (wasOffline)
{
_logger.LogInformation("Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
}
return existing;
});
// CAS lost — the entry changed under us; retry with the fresh value.
}
}
/// <summary>
/// Bumps the last-seen timestamp for a site already known via a prior
/// SiteHealthReport. Heartbeats from sites we have not yet received a
/// full report from are ignored — registration only happens on report.
/// The update is an atomic compare-and-swap of the immutable state.
/// </summary>
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
{
if (!_siteStates.TryGetValue(siteId, out var state))
return;
if (receivedAt > state.LastHeartbeatAt)
state.LastHeartbeatAt = receivedAt;
if (!state.IsOnline)
while (true)
{
state.IsOnline = true;
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
if (!_siteStates.TryGetValue(siteId, out var existing))
return;
var newHeartbeat = receivedAt > existing.LastHeartbeatAt
? receivedAt
: existing.LastHeartbeatAt;
// Nothing to change — avoid a needless swap.
if (newHeartbeat == existing.LastHeartbeatAt && existing.IsOnline)
return;
var updated = existing with
{
LastHeartbeatAt = newHeartbeat,
IsOnline = true
};
if (_siteStates.TryUpdate(siteId, updated, existing))
{
if (!existing.IsOnline)
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
return;
}
// CAS lost — retry with the fresh value.
}
}
@@ -143,13 +185,20 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
var state = kvp.Value;
if (!state.IsOnline) continue;
// Use LastHeartbeatAt — heartbeats arrive every ~5s from any
// healthy site node, so OfflineTimeout only fires when no node
// can reach central, not during single-node failovers.
// Use LastHeartbeatAt — heartbeats arrive frequently from any
// healthy site node (cadence owned by Cluster Infrastructure /
// SiteCommunicationActor), so OfflineTimeout only fires when no
// node can reach central, not during single-node failovers.
var elapsed = now - state.LastHeartbeatAt;
if (elapsed > _options.OfflineTimeout)
if (elapsed <= _options.OfflineTimeout)
continue;
// Atomically swap to an offline copy. If the CAS loses to a
// concurrent report/heartbeat, the site was just heard from, so
// leaving it online is the correct outcome — no retry needed.
var offline = state with { IsOnline = false };
if (_siteStates.TryUpdate(kvp.Key, offline, state))
{
state.IsOnline = false;
_logger.LogWarning(
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);