fix(core-scripted-alarms): resolve Low code-review findings (Core.ScriptedAlarms-003,006,008,010,011; -009 documented)
- Core.ScriptedAlarms-003: emit OnEvent OUTSIDE _evalGate by collecting
pending emissions during the gate-held section and flushing them after
release; eliminates the re-entrancy deadlock, as the docs already promised.
- Core.ScriptedAlarms-006: track every fire-and-forget Reevaluate /
ShelvingCheck task in _inFlight; Dispose drains the set so the engine
no longer races store writes against teardown.
- Core.ScriptedAlarms-008: store comments as ImmutableList<AlarmComment>
so AppendComment is O(log n) instead of O(n).
- Core.ScriptedAlarms-010: document the deliberate input-quality
asymmetry (Uncertain drives the predicate, renders {?} in the message)
in docs/ScriptedAlarms.md and on MessageTemplate.Resolve remarks.
- Core.ScriptedAlarms-011: propagate the no-op reason through
TransitionResult.NoOp(state, reason) and log it from
ScriptedAlarmEngine.ApplyAsync.
- Core.ScriptedAlarms-009 (Won't Fix per recommendation): documented the
per-evaluation dictionary allocation in docs/v2/Galaxy.Performance.md
with a mitigation path if a future soak surfaces pressure.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -606,6 +606,253 @@ public sealed class ScriptedAlarmEngineTests
|
||||
"Uncertain-quality inputs are treated as ready — predicate evaluates");
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Core.ScriptedAlarms-003: OnEvent emission must not block under _evalGate.
|
||||
// (1) A slow subscriber must not block the gate for other alarms.
|
||||
// (2) A subscriber that re-enters the engine (e.g. AcknowledgeAsync) must
|
||||
// not deadlock against _evalGate. Both regressions are covered here.
|
||||
// -------------------------------------------------------------------------
|
||||
[Fact]
public async Task OnEvent_subscriber_can_call_back_into_engine_without_deadlock(/* -003 */)
{
    // Re-entrancy regression for -003: an OnEvent subscriber calls
    // AcknowledgeAsync on the same engine in response to the Activated
    // emission. The acknowledge must finish within the timeout; if it does
    // not, the subscriber's call is stuck waiting on the engine.
    var upstream = new FakeUpstream();
    upstream.Set("Temp", 50);

    // Disposed when the test method exits, on success or failure.
    using var engine = Build(upstream, out _);

    await engine.LoadAsync(
        [Alarm("HighTemp", """return (int)ctx.GetTag("Temp").Value > 100;""")],
        TestContext.Current.CancellationToken);

    // Completed (or faulted) by the subscriber once its re-entrant call returns.
    var ackCompleted = new TaskCompletionSource();

    engine.OnEvent += (_, evt) =>
    {
        if (evt.Emission == EmissionKind.Activated)
        {
            // Hop to the thread pool so the dispatching thread is never
            // blocked while the acknowledge is in progress.
            _ = Task.Run(async () =>
            {
                try
                {
                    await engine.AcknowledgeAsync(evt.AlarmId, "sub", null, CancellationToken.None);
                    ackCompleted.TrySetResult();
                }
                catch (Exception failure)
                {
                    ackCompleted.TrySetException(failure);
                }
            });
        }
    };

    // Cross the predicate threshold (> 100) to trigger the Activated emission.
    upstream.Push("Temp", 150);

    var timeout = Task.Delay(TimeSpan.FromSeconds(3));
    var firstToFinish = await Task.WhenAny(ackCompleted.Task, timeout);
    firstToFinish.ShouldBe(ackCompleted.Task,
        "subscriber re-entering the engine must not deadlock against _evalGate");

    // Awaiting the task rethrows anything the subscriber captured.
    await ackCompleted.Task;

    engine.GetState("HighTemp")!.Acked.ShouldBe(AlarmAckedState.Acknowledged);
}
|
||||
|
||||
[Fact]
public async Task OnEvent_emission_happens_outside_evalGate(/* -003 */)
{
    // White-box check on gate-release ordering: while the Acknowledged
    // emission is being delivered, the subscriber probes the engine's
    // private _evalGate semaphore. SemaphoreSlim.Wait(0) is a non-blocking
    // try-take that succeeds only when the gate is currently free, so a
    // successful probe shows the emission fired after the gate was released.
    //
    // The test is fully async: it awaits engine calls and polls via
    // WaitForAsync instead of blocking a thread with GetResult()/Thread.Sleep.
    var up = new FakeUpstream();
    up.Set("Temp", 50);
    var eng = Build(up, out _);
    try
    {
        await eng.LoadAsync(
            [Alarm("HighTemp", """return (int)ctx.GetTag("Temp").Value > 100;""")],
            TestContext.Current.CancellationToken);

        // Drive to Active so Acknowledge has something to ack.
        up.Push("Temp", 150);
        await WaitForAsync(
            () => eng.GetState("HighTemp")!.Active == AlarmActiveState.Active,
            timeoutMs: 2000);
        eng.GetState("HighTemp")!.Active.ShouldBe(AlarmActiveState.Active);

        // Use reflection to reach _evalGate so the subscriber can probe it.
        var gateField = typeof(ScriptedAlarmEngine).GetField(
            "_evalGate",
            System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance);
        gateField.ShouldNotBeNull();
        var gate = (SemaphoreSlim)gateField.GetValue(eng)!;

        var gateFreeInsideEmission = false;
        eng.OnEvent += (_, e) =>
        {
            if (e.Emission != EmissionKind.Acknowledged)
            {
                return;
            }

            // Non-blocking try-take; hand the gate straight back so the
            // probe itself never holds it.
            if (gate.Wait(0))
            {
                gateFreeInsideEmission = true;
                gate.Release();
            }
        };

        await eng.AcknowledgeAsync("HighTemp", "alice", null, TestContext.Current.CancellationToken);

        gateFreeInsideEmission.ShouldBeTrue(
            "_evalGate must be released before OnEvent fires so subscribers " +
            "can call back into the engine without deadlocking");
    }
    finally
    {
        eng.Dispose();
    }
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Core.ScriptedAlarms-006: Dispose must drain in-flight background tasks
|
||||
// launched by OnUpstreamChange / RunShelvingCheck. Otherwise a re-evaluation
|
||||
// or shelving check started just before Dispose can keep running and write
|
||||
// to a (possibly disposed) store after the engine has returned.
|
||||
// -------------------------------------------------------------------------
|
||||
[Fact]
public async Task Dispose_drains_in_flight_reevaluation_tasks(/* -006 */)
{
    // Verifies that Dispose does not return while a save triggered by an
    // upstream push is still in progress. A store whose next SaveAsync
    // blocks on a test-controlled gate holds the background work open.
    var up = new FakeUpstream();
    up.Set("Temp", 50);
    var logger = new LoggerConfiguration().CreateLogger();
    var slowStore = new BlockingSaveAlarmStateStore();
    var eng = new ScriptedAlarmEngine(up, slowStore, new ScriptLoggerFactory(logger), logger);

    // RunContinuationsAsynchronously keeps SetResult() from running the
    // blocked save's continuation inline on the test thread.
    var saveGate = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    Task? disposeTask = null;
    try
    {
        await eng.LoadAsync(
            [Alarm("A", """return (int)ctx.GetTag("Temp").Value > 100;""")],
            TestContext.Current.CancellationToken);

        // Block the NEXT save (the one triggered by the push below).
        slowStore.BlockNextSave = saveGate;

        // Trigger a re-evaluation that ends up calling SaveAsync.
        up.Push("Temp", 150);

        // Wait until the store's SaveAsync is actually blocked.
        await WaitForAsync(() => slowStore.SaveInProgress, timeoutMs: 1000);

        // Dispose on a worker so the test can observe whether it blocks.
        disposeTask = Task.Run(() => eng.Dispose());

        // Dispose must NOT complete while the save is still held open.
        var prematureFinish = await Task.WhenAny(disposeTask, Task.Delay(200));
        prematureFinish.ShouldNotBe(disposeTask,
            "Dispose must block until in-flight background tasks complete");

        // Let the save complete and verify Dispose then returns.
        saveGate.SetResult();
        await disposeTask.WaitAsync(TimeSpan.FromSeconds(3), TestContext.Current.CancellationToken);
        slowStore.SaveInProgress.ShouldBeFalse("background task drained before Dispose returned");
    }
    finally
    {
        // Cleanup for failure paths: never leave the save blocked, and make
        // sure the engine is disposed even if the test failed before the
        // Dispose worker was started.
        saveGate.TrySetResult();
        if (disposeTask is null)
        {
            eng.Dispose();
        }
    }
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Core.ScriptedAlarms-010: predicate evaluation and message-template
|
||||
// resolution apply different quality bars on purpose. Predicate evaluation
|
||||
// accepts Uncertain (the predicate can still inspect the value); message
|
||||
// resolution renders Uncertain as "{?}" so the operator sees the doubt
|
||||
// explicitly. The two policies are documented in docs/ScriptedAlarms.md.
|
||||
// -------------------------------------------------------------------------
|
||||
[Fact]
public async Task Uncertain_quality_drives_predicate_but_renders_question_mark_in_message(/* -010 */)
{
    // Checks the two quality policies side by side: an Uncertain input
    // still drives the predicate (alarm goes Active), while the resolved
    // emission message shows "{?}" in place of the Uncertain tag value.
    var up = new FakeUpstream();
    // Seed with Uncertain quality (severity bit 30 set, bit 31 clear).
    up.Set("Temp", 150, statusCode: 0x40000000u);
    using var eng = Build(up, out _);
    await eng.LoadAsync([
        new ScriptedAlarmDefinition(
            "HighTemp", "Plant/Line1", "HighTemp",
            AlarmKind.LimitAlarm, AlarmSeverity.High,
            "Temp {Temp} exceeded limit",
            """return (int)ctx.GetTag("Temp").Value > 100;"""),
    ], TestContext.Current.CancellationToken);

    // Predicate evaluated (Uncertain treated as ready) → alarm Active.
    eng.GetState("HighTemp")!.Active.ShouldBe(AlarmActiveState.Active,
        "AreInputsReady accepts Uncertain so the predicate runs");

    // Events are appended from the engine's OnEvent callback while this
    // test polls the collection, so it must tolerate concurrent add and
    // enumerate. ConcurrentQueue enumerates over a snapshot; a plain List
    // can throw or corrupt under that access pattern.
    var events = new System.Collections.Concurrent.ConcurrentQueue<ScriptedAlarmEvent>();
    eng.OnEvent += (_, e) => events.Enqueue(e);

    // The alarm is already Active, so an Activated emission needs a
    // clear → re-activate cycle.
    up.Push("Temp", 200, statusCode: 0x40000000u); // still Uncertain, still Active
    up.Push("Temp", 50, statusCode: 0u); // Good, predicate becomes false
    await WaitForAsync(() => events.Any(e => e.Emission == EmissionKind.Cleared));
    events.Clear();
    up.Push("Temp", 200, statusCode: 0x40000000u); // Uncertain, predicate true
    await WaitForAsync(() => events.Any(e => e.Emission == EmissionKind.Activated));

    // The Activated message must show {?} for the Uncertain input.
    events.Single(e => e.Emission == EmissionKind.Activated).Message
        .ShouldBe("Temp {?} exceeded limit",
            "MessageTemplate.Resolve renders non-Good StatusCode as {?} " +
            "even though predicate evaluation accepted the Uncertain value");
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Core.ScriptedAlarms-008: switch Comments to ImmutableList for O(log n)
|
||||
// append. The persisted runtime type must be ImmutableList<AlarmComment>
|
||||
// (which still satisfies IReadOnlyList<AlarmComment> for existing
|
||||
// consumers).
|
||||
// -------------------------------------------------------------------------
|
||||
[Fact]
public async Task Comments_collection_uses_ImmutableList_for_efficient_append(/* -008 */)
{
    // Pins the runtime type of the Comments collection after a comment has
    // been added through the engine.
    var upstream = new FakeUpstream();
    upstream.Set("Temp", 50);
    using var engine = Build(upstream, out _);

    var token = TestContext.Current.CancellationToken;
    await engine.LoadAsync([Alarm("A", "return false;")], token);

    // One comment is enough to exercise the append path.
    await engine.AddCommentAsync("A", "alice", "note", token);

    var state = engine.GetState("A")!;
    state.Comments.ShouldBeOfType<System.Collections.Immutable.ImmutableList<AlarmComment>>(
        "Comments should be an ImmutableList so append is O(log n), not O(n)");
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Core.ScriptedAlarms-011: TransitionResult.NoOp's reason parameter must be
|
||||
// propagated, not silently discarded. The class-level remarks promise a
|
||||
// diagnostic log line for no-op disabled-alarm evaluations.
|
||||
// -------------------------------------------------------------------------
|
||||
[Fact]
public void TransitionResult_NoOp_propagates_reason(/* -011 */)
{
    // The reason handed to NoOp() must be readable back from NoOpReason.
    const string reason = "disabled — predicate result ignored";
    var initialState = AlarmConditionState.Fresh("a-1", DateTime.UtcNow);

    var result = TransitionResult.NoOp(initialState, reason);

    result.NoOpReason.ShouldBe(
        reason,
        "NoOp reason must be preserved on the TransitionResult so callers can log it");
}
|
||||
|
||||
[Fact]
public void TransitionResult_None_carries_no_reason(/* -011 */)
{
    // Counterpart to the NoOp() test: a result built via None() must report
    // a null NoOpReason.
    var initialState = AlarmConditionState.Fresh("a-1", DateTime.UtcNow);

    var result = TransitionResult.None(initialState);

    result.NoOpReason.ShouldBeNull("None() factory has no reason — only NoOp() carries one");
}
|
||||
|
||||
private static async Task WaitForAsync(Func<bool> cond, int timeoutMs = 2000)
|
||||
{
|
||||
var deadline = DateTime.UtcNow.AddMilliseconds(timeoutMs);
|
||||
@@ -645,4 +892,37 @@ public sealed class ScriptedAlarmEngineTests
|
||||
public Task RemoveAsync(string alarmId, CancellationToken ct)
|
||||
=> _inner.RemoveAsync(alarmId, ct);
|
||||
}
|
||||
|
||||
/// <summary>
/// A store whose SaveAsync can be made to block until the test signals it.
/// Used to verify Dispose drains in-flight background tasks (finding -006).
/// </summary>
/// <remarks>
/// All operations delegate to an in-memory store. <see cref="BlockNextSave"/>
/// and <see cref="SaveInProgress"/> are set and polled by the test on one
/// thread while SaveAsync may run on another, so both are backed by fields
/// with explicit cross-thread visibility.
/// </remarks>
private sealed class BlockingSaveAlarmStateStore : IAlarmStateStore
{
    private readonly InMemoryAlarmStateStore _inner = new();
    private TaskCompletionSource? _blockNextSave;
    private volatile bool _saveInProgress;

    /// <summary>
    /// One-shot gate: when non-null, the next SaveAsync call claims it and
    /// waits for it to complete before saving. Later saves are not blocked
    /// unless the test sets a new gate.
    /// </summary>
    public TaskCompletionSource? BlockNextSave
    {
        get => Volatile.Read(ref _blockNextSave);
        set => Volatile.Write(ref _blockNextSave, value);
    }

    /// <summary>True while a SaveAsync call is waiting on its claimed gate.</summary>
    public bool SaveInProgress => _saveInProgress;

    /// <summary>Delegates to the inner in-memory store.</summary>
    public Task<AlarmConditionState?> LoadAsync(string alarmId, CancellationToken ct)
        => _inner.LoadAsync(alarmId, ct);

    /// <summary>Delegates to the inner in-memory store.</summary>
    public Task<IReadOnlyList<AlarmConditionState>> LoadAllAsync(CancellationToken ct)
        => _inner.LoadAllAsync(ct);

    /// <summary>
    /// Saves through the inner store, first waiting on the gate if one was
    /// armed via <see cref="BlockNextSave"/>.
    /// </summary>
    public async Task SaveAsync(AlarmConditionState state, CancellationToken ct)
    {
        // Claim the gate atomically so exactly one save ever waits on it.
        var gate = Interlocked.Exchange(ref _blockNextSave, null);
        if (gate is not null)
        {
            _saveInProgress = true;
            try
            {
                await gate.Task.WaitAsync(ct).ConfigureAwait(false);
            }
            finally
            {
                // Cleared on both completion and cancellation.
                _saveInProgress = false;
            }
        }

        await _inner.SaveAsync(state, ct).ConfigureAwait(false);
    }

    /// <summary>Delegates to the inner in-memory store.</summary>
    public Task RemoveAsync(string alarmId, CancellationToken ct)
        => _inner.RemoveAsync(alarmId, ct);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user