fix(health-monitoring): resolve HealthMonitoring-001/002 — populate S&F buffer depth, make SiteHealthState immutable
This commit is contained in:
@@ -33,16 +33,24 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
/// Only replaces stored state if incoming sequence number is greater than last received.
|
||||
/// Auto-marks previously offline sites as online.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <see cref="SiteHealthState"/> is immutable: each transition produces a brand-new
|
||||
/// instance, and the dictionary entry is replaced atomically. The mutation is
|
||||
/// performed in a compare-and-swap retry loop rather than via the
|
||||
/// <c>AddOrUpdate</c> update delegate so the sequence-number guard and the field
|
||||
/// writes are evaluated as a single atomic step against the value actually
|
||||
/// installed — the <c>AddOrUpdate</c> delegate may be invoked more than once
|
||||
/// under contention and could otherwise act on a value that is then discarded.
|
||||
/// </remarks>
|
||||
public void ProcessReport(SiteHealthReport report)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
_siteStates.AddOrUpdate(
|
||||
report.SiteId,
|
||||
_ =>
|
||||
while (true)
|
||||
{
|
||||
if (!_siteStates.TryGetValue(report.SiteId, out var existing))
|
||||
{
|
||||
_logger.LogInformation("Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
|
||||
return new SiteHealthState
|
||||
var registered = new SiteHealthState
|
||||
{
|
||||
SiteId = report.SiteId,
|
||||
LatestReport = report,
|
||||
@@ -51,50 +59,84 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
LastSequenceNumber = report.SequenceNumber,
|
||||
IsOnline = true
|
||||
};
|
||||
},
|
||||
(_, existing) =>
|
||||
|
||||
if (_siteStates.TryAdd(report.SiteId, registered))
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Site {SiteId} registered with sequence #{Seq}", report.SiteId, report.SequenceNumber);
|
||||
return;
|
||||
}
|
||||
|
||||
// Lost the race — another thread registered first; retry as an update.
|
||||
continue;
|
||||
}
|
||||
|
||||
if (report.SequenceNumber <= existing.LastSequenceNumber)
|
||||
{
|
||||
if (report.SequenceNumber <= existing.LastSequenceNumber)
|
||||
_logger.LogDebug(
|
||||
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
|
||||
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
|
||||
return;
|
||||
}
|
||||
|
||||
var updated = existing with
|
||||
{
|
||||
LatestReport = report,
|
||||
LastReportReceivedAt = now,
|
||||
LastHeartbeatAt = now,
|
||||
LastSequenceNumber = report.SequenceNumber,
|
||||
IsOnline = true
|
||||
};
|
||||
|
||||
if (_siteStates.TryUpdate(report.SiteId, updated, existing))
|
||||
{
|
||||
if (!existing.IsOnline)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Rejecting stale report from site {SiteId}: seq {Incoming} <= {Last}",
|
||||
report.SiteId, report.SequenceNumber, existing.LastSequenceNumber);
|
||||
return existing;
|
||||
_logger.LogInformation(
|
||||
"Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
var wasOffline = !existing.IsOnline;
|
||||
existing.LatestReport = report;
|
||||
existing.LastReportReceivedAt = now;
|
||||
existing.LastHeartbeatAt = now;
|
||||
existing.LastSequenceNumber = report.SequenceNumber;
|
||||
existing.IsOnline = true;
|
||||
|
||||
if (wasOffline)
|
||||
{
|
||||
_logger.LogInformation("Site {SiteId} is back online (seq #{Seq})", report.SiteId, report.SequenceNumber);
|
||||
}
|
||||
|
||||
return existing;
|
||||
});
|
||||
// CAS lost — the entry changed under us; retry with the fresh value.
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||
/// SiteHealthReport. Heartbeats from sites we have not yet received a
|
||||
/// full report from are ignored — registration only happens on report.
|
||||
/// The update is an atomic compare-and-swap of the immutable state.
|
||||
/// </summary>
|
||||
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
|
||||
{
|
||||
if (!_siteStates.TryGetValue(siteId, out var state))
|
||||
return;
|
||||
|
||||
if (receivedAt > state.LastHeartbeatAt)
|
||||
state.LastHeartbeatAt = receivedAt;
|
||||
|
||||
if (!state.IsOnline)
|
||||
while (true)
|
||||
{
|
||||
state.IsOnline = true;
|
||||
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
|
||||
if (!_siteStates.TryGetValue(siteId, out var existing))
|
||||
return;
|
||||
|
||||
var newHeartbeat = receivedAt > existing.LastHeartbeatAt
|
||||
? receivedAt
|
||||
: existing.LastHeartbeatAt;
|
||||
|
||||
// Nothing to change — avoid a needless swap.
|
||||
if (newHeartbeat == existing.LastHeartbeatAt && existing.IsOnline)
|
||||
return;
|
||||
|
||||
var updated = existing with
|
||||
{
|
||||
LastHeartbeatAt = newHeartbeat,
|
||||
IsOnline = true
|
||||
};
|
||||
|
||||
if (_siteStates.TryUpdate(siteId, updated, existing))
|
||||
{
|
||||
if (!existing.IsOnline)
|
||||
_logger.LogInformation("Site {SiteId} is back online (heartbeat)", siteId);
|
||||
return;
|
||||
}
|
||||
|
||||
// CAS lost — retry with the fresh value.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,13 +185,20 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
var state = kvp.Value;
|
||||
if (!state.IsOnline) continue;
|
||||
|
||||
// Use LastHeartbeatAt — heartbeats arrive every ~5s from any
|
||||
// healthy site node, so OfflineTimeout only fires when no node
|
||||
// can reach central, not during single-node failovers.
|
||||
// Use LastHeartbeatAt — heartbeats arrive frequently from any
|
||||
// healthy site node (cadence owned by Cluster Infrastructure /
|
||||
// SiteCommunicationActor), so OfflineTimeout only fires when no
|
||||
// node can reach central, not during single-node failovers.
|
||||
var elapsed = now - state.LastHeartbeatAt;
|
||||
if (elapsed > _options.OfflineTimeout)
|
||||
if (elapsed <= _options.OfflineTimeout)
|
||||
continue;
|
||||
|
||||
// Atomically swap to an offline copy. If the CAS loses to a
|
||||
// concurrent report/heartbeat, the site was just heard from, so
|
||||
// leaving it online is the correct outcome — no retry needed.
|
||||
var offline = state with { IsOnline = false };
|
||||
if (_siteStates.TryUpdate(kvp.Key, offline, state))
|
||||
{
|
||||
state.IsOnline = false;
|
||||
_logger.LogWarning(
|
||||
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
|
||||
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
|
||||
|
||||
@@ -84,6 +84,20 @@ public class HealthReportSender : BackgroundService
|
||||
_collector.SetParkedMessageCount(parkedCount);
|
||||
}
|
||||
catch { /* Non-fatal — parked count will be 0 */ }
|
||||
|
||||
try
|
||||
{
|
||||
// Per-category pending-message buffer depths (the documented
|
||||
// "store-and-forward buffer depth" triage metric). Keyed by
|
||||
// StoreAndForwardCategory name so the central dashboard can
|
||||
// render external/notification/DB-write depths separately.
|
||||
var depthsByCategory = await _sfStorage.GetBufferDepthByCategoryAsync();
|
||||
var depths = depthsByCategory.ToDictionary(
|
||||
kvp => kvp.Key.ToString(),
|
||||
kvp => kvp.Value);
|
||||
_collector.SetStoreAndForwardDepths(depths);
|
||||
}
|
||||
catch { /* Non-fatal — buffer depths will be empty */ }
|
||||
}
|
||||
|
||||
var seq = Interlocked.Increment(ref _sequenceNumber);
|
||||
|
||||
@@ -4,26 +4,37 @@ namespace ScadaLink.HealthMonitoring;
|
||||
|
||||
/// <summary>
|
||||
/// In-memory state for a single site's health, stored by the central aggregator.
|
||||
/// Immutable: every state transition produces a new instance which the aggregator
|
||||
/// installs into its <c>ConcurrentDictionary</c> via an atomic compare-and-swap.
|
||||
/// This makes handing the reference straight to UI callers safe — a consumer can
|
||||
/// never observe a torn or half-applied update.
|
||||
/// </summary>
|
||||
public class SiteHealthState
|
||||
public sealed record SiteHealthState
|
||||
{
|
||||
public required string SiteId { get; init; }
|
||||
public SiteHealthReport LatestReport { get; set; } = null!;
|
||||
|
||||
/// <summary>
|
||||
/// The latest full <see cref="SiteHealthReport"/> received for the site, or
|
||||
/// <c>null</c> if no report has been recorded yet. Heartbeats alone never
|
||||
/// register a site; registration happens only when a report is processed.
|
||||
/// </summary>
|
||||
public SiteHealthReport? LatestReport { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Time the latest full <see cref="SiteHealthReport"/> was processed.
|
||||
/// Used by the UI to surface report staleness during failover.
|
||||
/// </summary>
|
||||
public DateTimeOffset LastReportReceivedAt { get; set; }
|
||||
public DateTimeOffset LastReportReceivedAt { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Time the most recent signal of any kind (full report OR ~5s heartbeat)
|
||||
/// was received. Drives offline detection — heartbeats from the standby
|
||||
/// keep the site marked online even when the active node is unable to
|
||||
/// produce a report (mid-failover, brief stalls).
|
||||
/// Time the most recent signal of any kind (full report OR heartbeat) was
|
||||
/// received. Drives offline detection — heartbeats from the standby keep the
|
||||
/// site marked online even when the active node is unable to produce a report
|
||||
/// (mid-failover, brief stalls). See the heartbeat scheduler owned by the
|
||||
/// Cluster Infrastructure / SiteCommunicationActor for the actual cadence.
|
||||
/// </summary>
|
||||
public DateTimeOffset LastHeartbeatAt { get; set; }
|
||||
public DateTimeOffset LastHeartbeatAt { get; init; }
|
||||
|
||||
public long LastSequenceNumber { get; set; }
|
||||
public bool IsOnline { get; set; }
|
||||
public long LastSequenceNumber { get; init; }
|
||||
public bool IsOnline { get; init; }
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user