fix(siteeventlog): suppress snapshot-resync alarm re-emit + coverage + hardening (review)
This commit is contained in:
@@ -41,6 +41,17 @@ public class AlarmActor : ReceiveActor
|
|||||||
private readonly ISiteHealthCollector? _healthCollector;
|
private readonly ISiteHealthCollector? _healthCollector;
|
||||||
private readonly IServiceProvider? _serviceProvider;
|
private readonly IServiceProvider? _serviceProvider;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// M1.5: the optional site operational-event log, resolved once from
|
||||||
|
/// <see cref="_serviceProvider"/> at construction and cached. The
|
||||||
|
/// registration is process-lifetime (a singleton), so resolving once on
|
||||||
|
/// the actor's own thread is both correct and cheaper than a per-event
|
||||||
|
/// <c>GetService</c> on the hot path. <c>null</c> when no provider was
|
||||||
|
/// supplied (the test/no-logging path) — <see cref="LogAlarmEvent"/> then
|
||||||
|
/// no-ops.
|
||||||
|
/// </summary>
|
||||||
|
private readonly ISiteEventLogger? _siteEventLogger;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// M1.5: priority at or above which a computed-alarm raise is logged as
|
/// M1.5: priority at or above which a computed-alarm raise is logged as
|
||||||
/// <c>Error</c> to the site event log; below it, raises log as <c>Warning</c>.
|
/// <c>Error</c> to the site event log; below it, raises log as <c>Warning</c>.
|
||||||
@@ -118,6 +129,9 @@ public class AlarmActor : ReceiveActor
|
|||||||
_logger = logger;
|
_logger = logger;
|
||||||
_healthCollector = healthCollector;
|
_healthCollector = healthCollector;
|
||||||
_serviceProvider = serviceProvider;
|
_serviceProvider = serviceProvider;
|
||||||
|
// M1.5: resolve the optional site event logger once and cache it,
|
||||||
|
// rather than calling GetService on every alarm transition.
|
||||||
|
_siteEventLogger = serviceProvider?.GetService<ISiteEventLogger>();
|
||||||
_priority = alarmConfig.PriorityLevel;
|
_priority = alarmConfig.PriorityLevel;
|
||||||
_onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName;
|
_onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName;
|
||||||
_onTriggerCompiledScript = onTriggerCompiledScript;
|
_onTriggerCompiledScript = onTriggerCompiledScript;
|
||||||
@@ -323,13 +337,14 @@ public class AlarmActor : ReceiveActor
|
|||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// M1.5: fire-and-forget an <c>alarm</c> operational event to the optional
|
/// M1.5: fire-and-forget an <c>alarm</c> operational event to the optional
|
||||||
/// <see cref="ISiteEventLogger"/>. Resolved optionally and never awaited so a
|
/// <see cref="ISiteEventLogger"/> (resolved once at construction and cached
|
||||||
/// logging failure cannot affect alarm evaluation (matching the established
|
/// in <see cref="_siteEventLogger"/>). Never awaited so a logging failure
|
||||||
|
/// cannot affect alarm evaluation (matching the established
|
||||||
/// ScriptActor/ScriptExecutionActor pattern).
|
/// ScriptActor/ScriptExecutionActor pattern).
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private void LogAlarmEvent(string severity, string message)
|
private void LogAlarmEvent(string severity, string message)
|
||||||
{
|
{
|
||||||
_ = _serviceProvider?.GetService<ISiteEventLogger>()?.LogEventAsync(
|
_ = _siteEventLogger?.LogEventAsync(
|
||||||
"alarm", severity, _instanceName, $"AlarmActor:{_alarmName}", message);
|
"alarm", severity, _instanceName, $"AlarmActor:{_alarmName}", message);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -643,9 +643,18 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
|||||||
/// <see cref="ISiteEventLogger"/> on a deploy/enable/disable/delete outcome.
|
/// <see cref="ISiteEventLogger"/> on a deploy/enable/disable/delete outcome.
|
||||||
/// Resolved optionally and never awaited so a logging failure cannot affect the
|
/// Resolved optionally and never awaited so a logging failure cannot affect the
|
||||||
/// deployment pipeline (matching the established ScriptActor/ScriptExecutionActor
|
/// deployment pipeline (matching the established ScriptActor/ScriptExecutionActor
|
||||||
/// pattern). Only reads the immutable <c>_serviceProvider</c> field, so it is
|
/// pattern).
|
||||||
/// safe to call from the PipeTo continuations that report disable/delete
|
/// <para>
|
||||||
/// outcomes off the actor thread.
|
/// <b>Thread-safety:</b> the disable (<see cref="HandleDisable"/>) and delete
|
||||||
|
/// (<see cref="HandleDelete"/>) paths call this from a
|
||||||
|
/// <see cref="System.Threading.Tasks.Task.ContinueWith(System.Action{System.Threading.Tasks.Task})"/>
|
||||||
|
/// continuation that runs on a thread-pool thread, NOT on the actor thread —
|
||||||
|
/// so it must touch only immutable, thread-safe state. It does: the only
|
||||||
|
/// field it reads is the <c>readonly _serviceProvider</c> captured at
|
||||||
|
/// construction (the resolved <see cref="ISiteEventLogger"/> is a process
|
||||||
|
/// singleton). No actor-private mutable state is referenced, which is what
|
||||||
|
/// makes calling it off the actor thread safe.
|
||||||
|
/// </para>
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private void LogDeploymentEvent(string severity, string instanceName, string message, string? details = null)
|
private void LogDeploymentEvent(string severity, string instanceName, string message, string? details = null)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -212,7 +212,14 @@ public class NativeAlarmActor : ReceiveActor
|
|||||||
{
|
{
|
||||||
_alarms[sourceRef] = t;
|
_alarms[sourceRef] = t;
|
||||||
PersistUpsert(t);
|
PersistUpsert(t);
|
||||||
Emit(t, t.Condition);
|
// M1.5: a snapshot replay is a re-sync of the source's current
|
||||||
|
// active set on (re)subscribe, NOT a live transition — surface it
|
||||||
|
// upward for the DebugView but do NOT re-log an `alarm` operational
|
||||||
|
// event. Otherwise every DCL reconnect would re-emit an `alarm`
|
||||||
|
// event for every already-active native condition (the
|
||||||
|
// synthesised return-to-normal above IS a real state change and
|
||||||
|
// keeps logSiteEvent: true).
|
||||||
|
Emit(t, t.Condition, logSiteEvent: false);
|
||||||
}
|
}
|
||||||
|
|
||||||
_snapshotBuffer.Clear();
|
_snapshotBuffer.Clear();
|
||||||
|
|||||||
@@ -82,6 +82,18 @@ public class StoreAndForwardService
|
|||||||
/// recognisable instead of an unattributable empty string.
|
/// recognisable instead of an unattributable empty string.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public const string UnknownSiteSentinel = "$unknown-site";
|
public const string UnknownSiteSentinel = "$unknown-site";
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// M1.7: the detail-string prefix written by <see cref="EnqueueAsync"/>
|
||||||
|
/// when an immediate forward attempt throws and the message is buffered for
|
||||||
|
/// the retry sweep. <see cref="EmitSiteEvent"/> matches on this same prefix
|
||||||
|
/// to distinguish a forward <i>failure</i> (logged) from a routine
|
||||||
|
/// no-handler enqueue (not logged), so both the construction site and the
|
||||||
|
/// check reference this single constant rather than duplicating the
|
||||||
|
/// literal — keeping the two ends from drifting apart.
|
||||||
|
/// </summary>
|
||||||
|
private const string BufferedForRetryDetailPrefix = "Buffered for retry";
|
||||||
|
|
||||||
private Timer? _retryTimer;
|
private Timer? _retryTimer;
|
||||||
private int _retryInProgress;
|
private int _retryInProgress;
|
||||||
|
|
||||||
@@ -261,11 +273,12 @@ public class StoreAndForwardService
|
|||||||
if (category == StoreAndForwardCategory.Notification)
|
if (category == StoreAndForwardCategory.Notification)
|
||||||
{
|
{
|
||||||
// Spec: log only forward-failure (the immediate forward threw and the
|
// Spec: log only forward-failure (the immediate forward threw and the
|
||||||
// notification was buffered for retry — detail "Buffered for retry:")
|
// notification was buffered for retry — detail prefixed
|
||||||
// and park. A routine "No handler registered, buffered" enqueue and a
|
// BufferedForRetryDetailPrefix) and park. A routine "No handler
|
||||||
// forward-success "Delivered" are deliberately NOT logged.
|
// registered, buffered" enqueue and a forward-success "Delivered"
|
||||||
|
// are deliberately NOT logged.
|
||||||
var isForwardFailure = action == "Queued"
|
var isForwardFailure = action == "Queued"
|
||||||
&& detail.StartsWith("Buffered for retry", StringComparison.Ordinal);
|
&& detail.StartsWith(BufferedForRetryDetailPrefix, StringComparison.Ordinal);
|
||||||
if (!isForwardFailure && action != "Parked")
|
if (!isForwardFailure && action != "Parked")
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
@@ -536,7 +549,7 @@ public class StoreAndForwardService
|
|||||||
message.LastError = ex.Message;
|
message.LastError = ex.Message;
|
||||||
await BufferAsync(message);
|
await BufferAsync(message);
|
||||||
|
|
||||||
RaiseActivity("Queued", category, $"Buffered for retry: {target} ({ex.Message})");
|
RaiseActivity("Queued", category, $"{BufferedForRetryDetailPrefix}: {target} ({ex.Message})");
|
||||||
return new StoreAndForwardResult(true, message.Id, true);
|
return new StoreAndForwardResult(true, message.Id, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -553,7 +566,7 @@ public class StoreAndForwardService
|
|||||||
await BufferAsync(message);
|
await BufferAsync(message);
|
||||||
RaiseActivity("Queued", category, attemptImmediateDelivery
|
RaiseActivity("Queued", category, attemptImmediateDelivery
|
||||||
? $"No handler registered, buffered: {target}"
|
? $"No handler registered, buffered: {target}"
|
||||||
: $"Buffered for retry: {target}");
|
: $"{BufferedForRetryDetailPrefix}: {target}");
|
||||||
return new StoreAndForwardResult(true, message.Id, true);
|
return new StoreAndForwardResult(true, message.Id, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+13
-10
@@ -219,24 +219,27 @@ public class DeploymentManagerActorTests : TestKit, IDisposable
|
|||||||
ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(5));
|
ExpectMsg<DeploymentStatusResponse>(TimeSpan.FromSeconds(5));
|
||||||
await Task.Delay(1000);
|
await Task.Delay(1000);
|
||||||
|
|
||||||
|
// The deployment site events are emitted fire-and-forget off the actor
|
||||||
|
// thread (LogDeploymentEvent runs in a ContinueWith continuation), so
|
||||||
|
// poll for each event with AwaitAssert rather than a bare Task.Delay —
|
||||||
|
// a fixed sleep is racy under CI load.
|
||||||
actor.Tell(new DisableInstanceCommand("cmd-de1", "EvtPump", DateTimeOffset.UtcNow));
|
actor.Tell(new DisableInstanceCommand("cmd-de1", "EvtPump", DateTimeOffset.UtcNow));
|
||||||
Assert.True(ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5)).Success);
|
Assert.True(ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5)).Success);
|
||||||
await Task.Delay(300);
|
AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"),
|
||||||
|
r => r.Message.Contains("disabled", StringComparison.OrdinalIgnoreCase)),
|
||||||
|
TimeSpan.FromSeconds(2));
|
||||||
|
|
||||||
actor.Tell(new EnableInstanceCommand("cmd-en1", "EvtPump", DateTimeOffset.UtcNow));
|
actor.Tell(new EnableInstanceCommand("cmd-en1", "EvtPump", DateTimeOffset.UtcNow));
|
||||||
Assert.True(ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5)).Success);
|
Assert.True(ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5)).Success);
|
||||||
await Task.Delay(300);
|
AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"),
|
||||||
|
r => r.Message.Contains("enabled", StringComparison.OrdinalIgnoreCase)),
|
||||||
|
TimeSpan.FromSeconds(2));
|
||||||
|
|
||||||
actor.Tell(new DeleteInstanceCommand("cmd-del-evt", "EvtPump", DateTimeOffset.UtcNow));
|
actor.Tell(new DeleteInstanceCommand("cmd-del-evt", "EvtPump", DateTimeOffset.UtcNow));
|
||||||
Assert.True(ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5)).Success);
|
Assert.True(ExpectMsg<InstanceLifecycleResponse>(TimeSpan.FromSeconds(5)).Success);
|
||||||
|
AwaitAssert(() => Assert.Contains(siteLog.OfType("deployment"),
|
||||||
AwaitAssert(() =>
|
r => r.Message.Contains("deleted", StringComparison.OrdinalIgnoreCase)),
|
||||||
{
|
TimeSpan.FromSeconds(2));
|
||||||
var rows = siteLog.OfType("deployment");
|
|
||||||
Assert.Contains(rows, r => r.Message.Contains("disabled", StringComparison.OrdinalIgnoreCase));
|
|
||||||
Assert.Contains(rows, r => r.Message.Contains("enabled", StringComparison.OrdinalIgnoreCase));
|
|
||||||
Assert.Contains(rows, r => r.Message.Contains("deleted", StringComparison.OrdinalIgnoreCase));
|
|
||||||
}, TimeSpan.FromSeconds(2));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
using System.Text.Json;
|
||||||
using Akka.Actor;
|
using Akka.Actor;
|
||||||
using Akka.TestKit.Xunit2;
|
using Akka.TestKit.Xunit2;
|
||||||
using Microsoft.Extensions.Logging.Abstractions;
|
using Microsoft.Extensions.Logging.Abstractions;
|
||||||
@@ -180,6 +181,101 @@ public class NativeAlarmActorTests : TestKit, IDisposable
|
|||||||
}, TimeSpan.FromSeconds(2));
|
}, TimeSpan.FromSeconds(2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Rehydration_DoesNotEmitSiteEvent()
|
||||||
|
{
|
||||||
|
// Pre-populate SQLite with an active condition so the actor rehydrates
|
||||||
|
// it on PreStart. Rehydration replays last-known state — it is NOT a
|
||||||
|
// live transition, so it must surface upward (for the DebugView) but
|
||||||
|
// must NOT re-log an `alarm` operational event.
|
||||||
|
var condition = new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800);
|
||||||
|
await _storage.UpsertNativeAlarmAsync(
|
||||||
|
"inst", "Pressure", "T01.Hi",
|
||||||
|
JsonSerializer.Serialize(condition), DateTimeOffset.UtcNow);
|
||||||
|
|
||||||
|
var siteLog = new FakeSiteEventLogger();
|
||||||
|
var instance = CreateTestProbe();
|
||||||
|
var dcl = CreateTestProbe();
|
||||||
|
Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog));
|
||||||
|
|
||||||
|
// The rehydrated condition is surfaced upward...
|
||||||
|
var emitted = instance.ExpectMsg<AlarmStateChanged>(TimeSpan.FromSeconds(2));
|
||||||
|
Assert.Equal("T01.Hi", emitted.SourceReference);
|
||||||
|
Assert.Equal(AlarmState.Active, emitted.State);
|
||||||
|
dcl.ExpectMsg<SubscribeAlarmsRequest>();
|
||||||
|
|
||||||
|
// ...but no `alarm` operational event is logged for it.
|
||||||
|
AwaitAssert(
|
||||||
|
() => Assert.Empty(siteLog.OfType("alarm")),
|
||||||
|
TimeSpan.FromSeconds(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void SnapshotSwap_ExistingActiveCondition_DoesNotReEmit()
|
||||||
|
{
|
||||||
|
var siteLog = new FakeSiteEventLogger();
|
||||||
|
var instance = CreateTestProbe();
|
||||||
|
var dcl = CreateTestProbe();
|
||||||
|
var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog));
|
||||||
|
dcl.ExpectMsg<SubscribeAlarmsRequest>();
|
||||||
|
|
||||||
|
// Live raise — the one and only `alarm` event we expect.
|
||||||
|
actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition(
|
||||||
|
"T01.Hi", AlarmTransitionKind.Raise,
|
||||||
|
new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800))));
|
||||||
|
instance.ExpectMsg<AlarmStateChanged>(m => m.State == AlarmState.Active);
|
||||||
|
AwaitAssert(() => Assert.Single(siteLog.OfType("alarm")), TimeSpan.FromSeconds(2));
|
||||||
|
|
||||||
|
// A reconnect snapshot that RE-INCLUDES the same still-active condition is
|
||||||
|
// a re-sync, not a live transition. It must NOT re-log a second `alarm`
|
||||||
|
// event (regression for the spurious-reconnect-event bug).
|
||||||
|
actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition(
|
||||||
|
"T01.Hi", AlarmTransitionKind.Snapshot,
|
||||||
|
new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800))));
|
||||||
|
actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition(
|
||||||
|
"T01.Hi", AlarmTransitionKind.SnapshotComplete,
|
||||||
|
new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800))));
|
||||||
|
|
||||||
|
// The snapshot still surfaces the condition upward (DebugView re-sync)...
|
||||||
|
instance.ExpectMsg<AlarmStateChanged>(m => m.SourceReference == "T01.Hi" && m.State == AlarmState.Active);
|
||||||
|
|
||||||
|
// ...but the `alarm` event count stays at exactly 1 — no re-emit.
|
||||||
|
Thread.Sleep(200); // give any spurious fire-and-forget log time to land
|
||||||
|
Assert.Single(siteLog.OfType("alarm"));
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Acknowledge_EmitsInfoAlarmSiteEventMentioningAcknowledged()
|
||||||
|
{
|
||||||
|
var siteLog = new FakeSiteEventLogger();
|
||||||
|
var instance = CreateTestProbe();
|
||||||
|
var dcl = CreateTestProbe();
|
||||||
|
var actor = Spawn(instance.Ref, dcl.Ref, new SingleServiceProvider(siteLog));
|
||||||
|
dcl.ExpectMsg<SubscribeAlarmsRequest>();
|
||||||
|
|
||||||
|
var t0 = DateTimeOffset.UtcNow;
|
||||||
|
actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition(
|
||||||
|
"T01.Hi", AlarmTransitionKind.Raise,
|
||||||
|
new AlarmConditionState(true, false, null, AlarmShelveState.Unshelved, false, 800), t0)));
|
||||||
|
instance.ExpectMsg<AlarmStateChanged>(m => m.State == AlarmState.Active);
|
||||||
|
|
||||||
|
// Operator acknowledges the still-active condition. The Acknowledge
|
||||||
|
// branch of LogAlarmEvent logs Info and mentions "acknowledged".
|
||||||
|
actor.Tell(new NativeAlarmTransitionUpdate("Opc", Transition(
|
||||||
|
"T01.Hi", AlarmTransitionKind.Acknowledge,
|
||||||
|
new AlarmConditionState(true, true, null, AlarmShelveState.Unshelved, false, 800), t0.AddSeconds(5))));
|
||||||
|
instance.ExpectMsg<AlarmStateChanged>();
|
||||||
|
|
||||||
|
AwaitAssert(() =>
|
||||||
|
{
|
||||||
|
var rows = siteLog.OfType("alarm");
|
||||||
|
Assert.Equal(2, rows.Count); // raise + acknowledge
|
||||||
|
var ack = rows[1];
|
||||||
|
Assert.Equal("Info", ack.Severity);
|
||||||
|
Assert.Contains("acknowledged", ack.Message, StringComparison.OrdinalIgnoreCase);
|
||||||
|
}, TimeSpan.FromSeconds(2));
|
||||||
|
}
|
||||||
|
|
||||||
void IDisposable.Dispose()
|
void IDisposable.Dispose()
|
||||||
{
|
{
|
||||||
Shutdown();
|
Shutdown();
|
||||||
|
|||||||
Reference in New Issue
Block a user