fix(siteeventlog): suppress snapshot-resync alarm re-emit + coverage + hardening (review)

This commit is contained in:
Joseph Doherty
2026-06-15 12:45:00 -04:00
parent e74c3aef23
commit e5534fddca
6 changed files with 166 additions and 23 deletions
@@ -41,6 +41,17 @@ public class AlarmActor : ReceiveActor
private readonly ISiteHealthCollector? _healthCollector;
private readonly IServiceProvider? _serviceProvider;
/// <summary>
/// M1.5: the optional site operational-event log, resolved once from
/// <see cref="_serviceProvider"/> at construction and cached. The
/// registration is process-lifetime (a singleton), so resolving once on
/// the actor's own thread is both correct and cheaper than a per-event
/// <c>GetService</c> on the hot path. <c>null</c> when no provider was
/// supplied (the test/no-logging path) — <see cref="LogAlarmEvent"/> then
/// no-ops.
/// </summary>
private readonly ISiteEventLogger? _siteEventLogger;
/// <summary>
/// M1.5: priority at or above which a computed-alarm raise is logged as
/// <c>Error</c> to the site event log; below it, raises log as <c>Warning</c>.
@@ -118,6 +129,9 @@ public class AlarmActor : ReceiveActor
_logger = logger;
_healthCollector = healthCollector;
_serviceProvider = serviceProvider;
// M1.5: resolve the optional site event logger once and cache it,
// rather than calling GetService on every alarm transition.
_siteEventLogger = serviceProvider?.GetService<ISiteEventLogger>();
_priority = alarmConfig.PriorityLevel;
_onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName;
_onTriggerCompiledScript = onTriggerCompiledScript;
@@ -323,13 +337,14 @@ public class AlarmActor : ReceiveActor
/// <summary>
/// M1.5: fire-and-forget an <c>alarm</c> operational event to the optional
/// <see cref="ISiteEventLogger"/> (resolved once at construction and cached
/// in <see cref="_siteEventLogger"/>). Never awaited so a logging failure
/// cannot affect alarm evaluation (matching the established
/// ScriptActor/ScriptExecutionActor pattern). No-ops when no logger was
/// resolved (the cached field is <c>null</c>).
/// </summary>
/// <param name="severity">Severity label passed through to the event log.</param>
/// <param name="message">Event message text passed through to the event log.</param>
private void LogAlarmEvent(string severity, string message)
{
    // The discard drops the returned task on purpose: the actor never awaits
    // or inspects the logging outcome.
    // NOTE(review): only an asynchronous (faulted-task) failure is isolated
    // this way; a synchronous throw from LogEventAsync would still surface
    // here — confirm implementations never throw synchronously.
    _ = _siteEventLogger?.LogEventAsync(
        "alarm", severity, _instanceName, $"AlarmActor:{_alarmName}", message);
}
@@ -643,9 +643,18 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
/// <see cref="ISiteEventLogger"/> on a deploy/enable/disable/delete outcome.
/// Resolved optionally and never awaited so a logging failure cannot affect the
/// deployment pipeline (matching the established ScriptActor/ScriptExecutionActor
/// pattern).
/// <para>
/// <b>Thread-safety:</b> the disable (<see cref="HandleDisable"/>) and delete
/// (<see cref="HandleDelete"/>) paths call this from a
/// <see cref="System.Threading.Tasks.Task.ContinueWith(System.Action{System.Threading.Tasks.Task})"/>
/// continuation that runs on a thread-pool thread, NOT on the actor thread —
/// so it must touch only immutable, thread-safe state. It does: the only
/// field it reads is the <c>readonly _serviceProvider</c> captured at
/// construction, and the <see cref="ISiteEventLogger"/> it resolves is a
/// process singleton, which is relied on to accept concurrent calls. No
/// actor-private mutable state is referenced, which is what
/// makes calling it off the actor thread safe.
/// </para>
/// </summary>
private void LogDeploymentEvent(string severity, string instanceName, string message, string? details = null)
{
@@ -212,7 +212,14 @@ public class NativeAlarmActor : ReceiveActor
{
_alarms[sourceRef] = t;
PersistUpsert(t);
Emit(t, t.Condition);
// M1.5: a snapshot replay is a re-sync of the source's current
// active set on (re)subscribe, NOT a live transition — surface it
// upward for the DebugView but do NOT re-log an `alarm` operational
// event. Otherwise every DCL reconnect would re-emit an `alarm`
// event for every already-active native condition (the
// synthesised return-to-normal above IS a real state change and
// keeps logSiteEvent: true).
Emit(t, t.Condition, logSiteEvent: false);
}
_snapshotBuffer.Clear();