fix(core-scripted-alarms): resolve Low code-review findings (Core.ScriptedAlarms-003,006,008,010,011; -009 documented)

- Core.ScriptedAlarms-003: emit OnEvent OUTSIDE _evalGate by collecting
  pending emissions during the gate-held section and flushing them after
  release; eliminates the re-entrancy deadlock, so subscribers can call
  back into the engine as the docs already promised.
- Core.ScriptedAlarms-006: track every fire-and-forget Reevaluate /
  ShelvingCheck task in _inFlight; Dispose drains the set so the engine
  no longer races store writes against teardown.
- Core.ScriptedAlarms-008: store comments as ImmutableList<AlarmComment>
  so AppendComment is O(log n) instead of O(n).
- Core.ScriptedAlarms-010: document the deliberate input-quality
  asymmetry (Uncertain drives the predicate, renders {?} in the message)
  in docs/ScriptedAlarms.md and on MessageTemplate.Resolve remarks.
- Core.ScriptedAlarms-011: propagate the no-op reason through
  TransitionResult.NoOp(state, reason) and log it from
  ScriptedAlarmEngine.ApplyAsync.
- Core.ScriptedAlarms-009 (Won't Fix per recommendation): documented the
  per-evaluation dictionary allocation in docs/v2/Galaxy.Performance.md
  with a mitigation path if a future soak surfaces pressure.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 07:23:31 -04:00
parent e74e8f7b31
commit 99354bfaf2
8 changed files with 491 additions and 42 deletions

View File

@@ -606,6 +606,253 @@ public sealed class ScriptedAlarmEngineTests
"Uncertain-quality inputs are treated as ready — predicate evaluates");
}
// -------------------------------------------------------------------------
// Core.ScriptedAlarms-003: OnEvent emission must not block under _evalGate.
// (1) A slow subscriber must not block the gate for other alarms.
// (2) A subscriber that re-enters the engine (e.g. AcknowledgeAsync) must
// not deadlock against _evalGate. Both regressions are covered here.
// -------------------------------------------------------------------------
[Fact]
public async Task OnEvent_subscriber_can_call_back_into_engine_without_deadlock(/* -003 */)
{
    // Re-entrancy regression. When OnEvent emission was inside _evalGate, a
    // subscriber that called an engine method (e.g. AcknowledgeAsync) hung
    // forever because the non-reentrant SemaphoreSlim refused to re-grant
    // the gate the dispatch path was still holding. After the fix, emission
    // happens AFTER Release() so the subscriber's call acquires the gate
    // cleanly and the operator-driven action completes.
    var up = new FakeUpstream();
    up.Set("Temp", 50);
    var eng = Build(up, out _);
    try
    {
        await eng.LoadAsync([Alarm("HighTemp", """return (int)ctx.GetTag("Temp").Value > 100;""")],
            TestContext.Current.CancellationToken);

        // RunContinuationsAsynchronously keeps the awaiting test body from
        // resuming inline inside TrySetResult on the subscriber's worker
        // thread; otherwise the rest of this test (including Dispose in the
        // finally block) could execute from within the subscriber callback.
        var ackDone = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);

        // The subscriber re-enters the engine via Task.Run so the OnEvent
        // dispatch thread is not blocked while the acknowledge runs. With
        // the fix in place AcknowledgeAsync must acquire _evalGate (the
        // dispatch path released it before invoking the subscriber) and
        // complete in well under the timeout.
        eng.OnEvent += (_, e) =>
        {
            if (e.Emission != EmissionKind.Activated) return;
            _ = Task.Run(async () =>
            {
                try
                {
                    await eng.AcknowledgeAsync(e.AlarmId, "sub", null, CancellationToken.None);
                    ackDone.TrySetResult();
                }
                catch (Exception ex) { ackDone.TrySetException(ex); }
            });
        };

        up.Push("Temp", 150);

        var winner = await Task.WhenAny(ackDone.Task, Task.Delay(TimeSpan.FromSeconds(3)));
        winner.ShouldBe(ackDone.Task,
            "subscriber re-entering the engine must not deadlock against _evalGate");
        await ackDone.Task; // surface any inner exception
        eng.GetState("HighTemp")!.Acked.ShouldBe(AlarmAckedState.Acknowledged);
    }
    finally
    {
        eng.Dispose();
    }
}
[Fact]
public async Task OnEvent_emission_happens_outside_evalGate(/* -003 */)
{
    // Direct white-box check on the gate-release ordering: AcknowledgeAsync
    // emits the Acknowledged event AFTER releasing the gate. We assert that
    // by observing the gate is acquirable from inside the subscriber.
    // SemaphoreSlim.Wait(0) returns true only if the count > 0 (gate free).
    //
    // The test is async so it never blocks a thread on engine tasks
    // (no GetAwaiter().GetResult(), no Thread.Sleep polling).
    var up = new FakeUpstream();
    up.Set("Temp", 50);
    var eng = Build(up, out _);
    try
    {
        await eng.LoadAsync([Alarm("HighTemp", """return (int)ctx.GetTag("Temp").Value > 100;""")],
            TestContext.Current.CancellationToken);

        // Drive to Active so Acknowledge has something to ack. Poll with the
        // same WaitForAsync helper the other tests use, then assert
        // explicitly so a never-activated alarm fails with a clear message.
        up.Push("Temp", 150);
        await WaitForAsync(() => eng.GetState("HighTemp")!.Active == AlarmActiveState.Active);
        eng.GetState("HighTemp")!.Active.ShouldBe(AlarmActiveState.Active);

        // Use reflection to peek at _evalGate so the subscriber can probe it.
        var gateField = typeof(ScriptedAlarmEngine).GetField(
            "_evalGate", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance);
        gateField.ShouldNotBeNull();
        var gate = (SemaphoreSlim)gateField.GetValue(eng)!;

        var gateFreeInsideEmission = false;
        eng.OnEvent += (_, e) =>
        {
            if (e.Emission != EmissionKind.Acknowledged) return;
            // SemaphoreSlim.Wait(0) — non-blocking try-take. If the gate is
            // free we acquire it (count back to 0); release immediately.
            if (gate.Wait(0))
            {
                gateFreeInsideEmission = true;
                gate.Release();
            }
        };

        await eng.AcknowledgeAsync("HighTemp", "alice", null, CancellationToken.None);

        gateFreeInsideEmission.ShouldBeTrue(
            "_evalGate must be released before OnEvent fires so subscribers " +
            "can call back into the engine without deadlocking");
    }
    finally
    {
        eng.Dispose();
    }
}
// -------------------------------------------------------------------------
// Core.ScriptedAlarms-006: Dispose must drain in-flight background tasks
// launched by OnUpstreamChange / RunShelvingCheck. Otherwise a re-evaluation
// or shelving check started just before Dispose can keep running and write
// to a (possibly disposed) store after the engine has returned.
// -------------------------------------------------------------------------
[Fact]
public async Task Dispose_drains_in_flight_reevaluation_tasks(/* -006 */)
{
    var up = new FakeUpstream();
    up.Set("Temp", 50);
    var logger = new LoggerConfiguration().CreateLogger();
    var slowStore = new BlockingSaveAlarmStateStore();
    var eng = new ScriptedAlarmEngine(up, slowStore, new ScriptLoggerFactory(logger), logger);

    // RunContinuationsAsynchronously: releasing the gate must not run the
    // parked save's continuation inline on the test thread.
    var saveGate = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
    Task? disposeTask = null;
    try
    {
        await eng.LoadAsync([Alarm("A", """return (int)ctx.GetTag("Temp").Value > 100;""")],
            TestContext.Current.CancellationToken);

        // Block the NEXT save (the one triggered by the push below).
        slowStore.BlockNextSave = saveGate;

        // Trigger a re-evaluation that will go inside _evalGate and call SaveAsync.
        up.Push("Temp", 150);

        // Wait until the store's SaveAsync is actually blocked.
        await WaitForAsync(() => slowStore.SaveInProgress, timeoutMs: 1000);

        // Dispose must wait for the in-flight reevaluation to complete rather
        // than returning while a background task still runs.
        disposeTask = Task.Run(() => eng.Dispose());

        // Verify Dispose does NOT complete immediately — it should block waiting
        // for the in-flight task. Without the -006 fix Dispose returns straight
        // away and the background reevaluation can outlive the engine.
        var prematureFinish = await Task.WhenAny(disposeTask, Task.Delay(200));
        prematureFinish.ShouldNotBe(disposeTask,
            "Dispose must block until in-flight background tasks complete");

        // Let the save complete and verify Dispose then returns.
        saveGate.SetResult();
        await disposeTask.WaitAsync(TimeSpan.FromSeconds(3), TestContext.Current.CancellationToken);
        slowStore.SaveInProgress.ShouldBeFalse("background task drained before Dispose returned");
    }
    finally
    {
        // Cleanup for the failure paths: never leave the background save
        // parked on the gate, and dispose the engine if the test bailed out
        // before the Dispose under test was started.
        saveGate.TrySetResult();
        if (disposeTask is null)
        {
            eng.Dispose();
        }
    }
}
// -------------------------------------------------------------------------
// Core.ScriptedAlarms-010: predicate evaluation and message-template
// resolution apply different quality bars on purpose. Predicate evaluation
// accepts Uncertain (the predicate can still inspect the value); message
// resolution renders Uncertain as "{?}" so the operator sees the doubt
// explicitly. The two policies are documented in docs/ScriptedAlarms.md.
// -------------------------------------------------------------------------
[Fact]
public async Task Uncertain_quality_drives_predicate_but_renders_question_mark_in_message(/* -010 */)
{
    // Uncertain quality: severity bit 30 set, bit 31 clear.
    const uint uncertainStatus = 0x40000000u;
    const uint goodStatus = 0u;

    var up = new FakeUpstream();
    up.Set("Temp", 150, statusCode: uncertainStatus);
    using var eng = Build(up, out _);
    await eng.LoadAsync([
        new ScriptedAlarmDefinition(
            "HighTemp", "Plant/Line1", "HighTemp",
            AlarmKind.LimitAlarm, AlarmSeverity.High,
            "Temp {Temp} exceeded limit",
            """return (int)ctx.GetTag("Temp").Value > 100;"""),
    ], TestContext.Current.CancellationToken);

    // Predicate evaluated (Uncertain treated as ready) → alarm Active.
    eng.GetState("HighTemp")!.Active.ShouldBe(AlarmActiveState.Active,
        "AreInputsReady accepts Uncertain so the predicate runs");

    // The subscriber appends while this test polls and clears the list, so
    // every access to the list goes through one lock.
    var events = new List<ScriptedAlarmEvent>();
    var eventsLock = new object();
    eng.OnEvent += (_, e) =>
    {
        lock (eventsLock) { events.Add(e); }
    };

    bool Saw(EmissionKind kind)
    {
        lock (eventsLock) { return events.Any(e => e.Emission == kind); }
    }

    // The alarm is already Active, so getting a fresh Activated emission
    // (the one that carries the resolved message) needs a clear →
    // re-activate cycle. Comments would emit too, but they do not run the
    // message template.
    up.Push("Temp", 50, statusCode: goodStatus); // Good, predicate becomes false
    await WaitForAsync(() => Saw(EmissionKind.Cleared));
    lock (eventsLock) { events.Clear(); }

    up.Push("Temp", 200, statusCode: uncertainStatus); // Uncertain, predicate true
    await WaitForAsync(() => Saw(EmissionKind.Activated));

    // Assert on a snapshot so the engine cannot mutate the list mid-query.
    ScriptedAlarmEvent[] snapshot;
    lock (eventsLock) { snapshot = events.ToArray(); }

    // The resolved emission message must show "{?}" for the Uncertain
    // tag — only Good substitutes into the operator-facing message.
    snapshot.Single(e => e.Emission == EmissionKind.Activated).Message
        .ShouldBe("Temp {?} exceeded limit",
            "MessageTemplate.Resolve renders non-Good StatusCode as {?} " +
            "even though predicate evaluation accepted the Uncertain value");
}
// -------------------------------------------------------------------------
// Core.ScriptedAlarms-008: switch Comments to ImmutableList for O(log n)
// append. The persisted runtime type must be ImmutableList<AlarmComment>
// (which still satisfies IReadOnlyList<AlarmComment> for existing
// consumers).
// -------------------------------------------------------------------------
[Fact]
public async Task Comments_collection_uses_ImmutableList_for_efficient_append(/* -008 */)
{
    var cancellation = TestContext.Current.CancellationToken;

    var upstream = new FakeUpstream();
    upstream.Set("Temp", 50);
    using var engine = Build(upstream, out _);
    await engine.LoadAsync([Alarm("A", "return false;")], cancellation);

    // Appending one comment is enough to exercise the AppendComment path.
    await engine.AddCommentAsync("A", "alice", "note", cancellation);

    // The runtime type of the stored collection is what this test pins.
    var comments = engine.GetState("A")!.Comments;
    comments.ShouldBeOfType<System.Collections.Immutable.ImmutableList<AlarmComment>>(
        "Comments should be an ImmutableList so append is O(log n), not O(n)");
}
// -------------------------------------------------------------------------
// Core.ScriptedAlarms-011: TransitionResult.NoOp's reason parameter must be
// propagated, not silently discarded. The class-level remarks promise a
// diagnostic log line for no-op disabled-alarm evaluations.
// -------------------------------------------------------------------------
[Fact]
public void TransitionResult_NoOp_propagates_reason(/* -011 */)
{
    // The reason handed to NoOp must come back unchanged on the result.
    const string reason = "disabled — predicate result ignored";
    var state = AlarmConditionState.Fresh("a-1", DateTime.UtcNow);

    var result = TransitionResult.NoOp(state, reason);

    result.NoOpReason.ShouldBe(reason,
        "NoOp reason must be preserved on the TransitionResult so callers can log it");
}
[Fact]
public void TransitionResult_None_carries_no_reason(/* -011 */)
{
    // Counterpart to the NoOp test: the None() factory leaves the reason unset.
    var state = AlarmConditionState.Fresh("a-1", DateTime.UtcNow);

    var result = TransitionResult.None(state);

    result.NoOpReason.ShouldBeNull("None() factory has no reason — only NoOp() carries one");
}
private static async Task WaitForAsync(Func<bool> cond, int timeoutMs = 2000)
{
var deadline = DateTime.UtcNow.AddMilliseconds(timeoutMs);
@@ -645,4 +892,37 @@ public sealed class ScriptedAlarmEngineTests
public Task RemoveAsync(string alarmId, CancellationToken ct)
=> _inner.RemoveAsync(alarmId, ct);
}
/// <summary>
/// A store whose SaveAsync can be made to block until the test signals it.
/// Used to verify Dispose drains in-flight background tasks (finding -006).
/// All other operations delegate straight to an in-memory store.
/// </summary>
/// <remarks>
/// The test thread arms the gate and polls <see cref="SaveInProgress"/> while
/// SaveAsync runs on whatever thread the caller uses, so both members are
/// accessed with volatile/interlocked semantics.
/// </remarks>
private sealed class BlockingSaveAlarmStateStore : IAlarmStateStore
{
    private readonly InMemoryAlarmStateStore _inner = new();
    private TaskCompletionSource? _blockNextSave;
    private volatile bool _saveInProgress;

    /// <summary>
    /// When non-null, the next SaveAsync call claims this gate and waits on
    /// it before saving. The gate is one-shot: claiming it resets this to null.
    /// </summary>
    public TaskCompletionSource? BlockNextSave
    {
        get => Volatile.Read(ref _blockNextSave);
        set => Volatile.Write(ref _blockNextSave, value);
    }

    /// <summary>
    /// True from the moment a gated save starts waiting until that save has
    /// finished (or failed). Saves that found no gate never set it.
    /// </summary>
    public bool SaveInProgress => _saveInProgress;

    public Task<AlarmConditionState?> LoadAsync(string alarmId, CancellationToken ct)
        => _inner.LoadAsync(alarmId, ct);

    public Task<IReadOnlyList<AlarmConditionState>> LoadAllAsync(CancellationToken ct)
        => _inner.LoadAllAsync(ct);

    /// <summary>
    /// Saves via the inner store; if a gate is armed, first waits for the
    /// test to complete it (or for <paramref name="ct"/> to cancel the wait).
    /// </summary>
    public async Task SaveAsync(AlarmConditionState state, CancellationToken ct)
    {
        // Atomically claim the gate so exactly one save consumes it.
        var gate = Interlocked.Exchange(ref _blockNextSave, null);
        if (gate is null)
        {
            await _inner.SaveAsync(state, ct).ConfigureAwait(false);
            return;
        }

        _saveInProgress = true;
        try
        {
            await gate.Task.WaitAsync(ct).ConfigureAwait(false);
            // Keep the flag raised across the real save so that
            // "SaveInProgress == false" means the gated save has ended.
            await _inner.SaveAsync(state, ct).ConfigureAwait(false);
        }
        finally
        {
            _saveInProgress = false;
        }
    }

    public Task RemoveAsync(string alarmId, CancellationToken ct)
        => _inner.RemoveAsync(alarmId, ct);
}
}