merge: integrate WaitAsync/M5-audit (parallel session) with galaxy array-write + inbound-timeout fixes

This commit is contained in:
Joseph Doherty
2026-06-17 09:28:15 -04:00
88 changed files with 7714 additions and 169 deletions
@@ -571,7 +571,20 @@ public class AlarmActor : ReceiveActor
/// Passes the firing alarm's level/priority/message so the script can
/// branch on severity via the <c>Alarm</c> global.
/// </summary>
private void SpawnAlarmExecution(AlarmLevel level, int priority, string message)
/// <param name="level">The firing alarm severity level.</param>
/// <param name="priority">The firing alarm priority.</param>
/// <param name="message">The firing alarm message.</param>
/// <param name="parentExecutionId">
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
/// the context that fired this alarm, recorded as the on-trigger script run's
/// <c>ParentExecutionId</c> so the alarm-triggered run chains under its firing
/// context in the audit tree. The alarm subsystem currently has no Guid-typed
/// firing id, so the only call sites pass <c>null</c> (the on-trigger run is a
/// root). The parameter exists so a future firing-id can flow without
/// touching the actor wiring.
/// </param>
private void SpawnAlarmExecution(
AlarmLevel level, int priority, string message, Guid? parentExecutionId = null)
{
if (_onTriggerCompiledScript == null) return;
@@ -591,7 +604,9 @@ public class AlarmActor : ReceiveActor
_options,
_logger,
// M2.5 (#9): per-script timeout from the on-trigger script (null = global).
_onTriggerExecutionTimeoutSeconds));
_onTriggerExecutionTimeoutSeconds,
// Audit Log #23 (M5.4): the firing context's execution id (null today).
parentExecutionId));
Context.ActorOf(props, executionId);
}
@@ -29,6 +29,14 @@ public class AlarmExecutionActor : ReceiveActor
/// <param name="options">Site runtime configuration options, including the execution timeout.</param>
/// <param name="logger">Logger for execution diagnostics.</param>
/// <param name="executionTimeoutSeconds">M2.5 (#9): the on-trigger script's per-script execution timeout in seconds. Null or non-positive falls back to the global <see cref="SiteRuntimeOptions.ScriptExecutionTimeoutSeconds"/>.</param>
/// <param name="parentExecutionId">
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
/// the context that fired this alarm, threaded into the on-trigger script's
/// <see cref="ScriptRuntimeContext"/> as its <c>ParentExecutionId</c> so the
/// alarm-triggered run chains under its firing context. Null today (no
/// Guid-typed firing id exists yet) — the run is a root, but the plumbing
/// is in place for a future firing id.
/// </param>
public AlarmExecutionActor(
string alarmName,
string instanceName,
@@ -42,7 +50,9 @@ public class AlarmExecutionActor : ReceiveActor
ILogger logger,
// M2.5 (#9): per-script execution timeout override (seconds) for the
// alarm on-trigger script. Null or non-positive falls back to the global.
int? executionTimeoutSeconds = null)
int? executionTimeoutSeconds = null,
// Audit Log #23 (M5.4): the firing context's execution id (null today).
Guid? parentExecutionId = null)
{
var self = Self;
var parent = Context.Parent;
@@ -51,7 +61,7 @@ public class AlarmExecutionActor : ReceiveActor
alarmName, instanceName, level, priority, message,
compiledScript, instanceActor,
sharedScriptLibrary, options, self, parent, logger,
executionTimeoutSeconds);
executionTimeoutSeconds, parentExecutionId);
}
private static void ExecuteAlarmScript(
@@ -67,7 +77,8 @@ public class AlarmExecutionActor : ReceiveActor
IActorRef self,
IActorRef parent,
ILogger logger,
int? executionTimeoutSeconds)
int? executionTimeoutSeconds,
Guid? parentExecutionId)
{
// M2.5 (#9): per-script timeout overrides the global default. A null or
// non-positive per-script value (≤ 0) falls back to the global.
@@ -95,7 +106,19 @@ public class AlarmExecutionActor : ReceiveActor
options.MaxScriptCallDepth,
timeout,
instanceName,
logger);
logger,
// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the
// alarm on-trigger run mints its own fresh ExecutionId (the
// ctor's `?? NewGuid()` fallback) and records the firing
// context's id as its ParentExecutionId — null today, so the
// run is a root, but the plumbing exists for a future
// firing id.
parentExecutionId: parentExecutionId,
// WaitForAttribute (spec §4.4): thread the alarm on-trigger
// script's per-script execution-timeout token so a
// Attributes.WaitAsync inside an on-trigger script is bounded
// by the same script deadline.
scriptTimeoutToken: cts.Token);
var globals = new ScriptGlobals
{
@@ -149,6 +149,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
Receive<RouteToCallRequest>(RouteInboundApiCall);
Receive<RouteToGetAttributesRequest>(RouteInboundApiGetAttributes);
Receive<RouteToSetAttributesRequest>(RouteInboundApiSetAttributes);
Receive<RouteToWaitForAttributeRequest>(RouteInboundApiWaitForAttribute);
// OPC UA Tag Browser — singleton-only re-forward to local /user/dcl-manager.
// BrowseNodeCommand is routed to this singleton (active node) by
@@ -1078,6 +1079,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
}).PipeTo(sender);
}
/// <summary>
/// Spec §6 (WD-2b): unpacks a routed <see cref="RouteToWaitForAttributeRequest"/>
/// (inbound-API <c>Route.To().WaitForAttribute()</c>) into the deployed
/// Instance Actor's site-local <see cref="WaitForAttributeRequest"/> and relays
/// the result back to the original sender. Value-equality only across the wire —
/// the predicate is null and <c>RequireGoodQuality</c> is left at its default.
/// The Ask is bounded by the wait timeout plus 5 seconds of slack (NOT a fixed
/// 30s), since the wait legitimately blocks for up to
/// <see cref="RouteToWaitForAttributeRequest.Timeout"/>.
///
/// <para>
/// Three outcomes are relayed:
/// unknown instance → immediate failure reply; Ask faulted/cancelled → failure
/// reply carrying the base exception message (or a generic timeout text); Ask
/// completed → the instance's reply, INCLUDING its <c>ErrorMessage</c>. A wait the
/// instance refused (for example because its waiter cap was reached) is therefore
/// reported as a failure with that message instead of being flattened into a
/// "succeeded, not matched, not timed out" reply.
/// </para>
/// </summary>
/// <param name="request">The routed wait request naming the target instance, attribute, encoded target value and timeout.</param>
private void RouteInboundApiWaitForAttribute(RouteToWaitForAttributeRequest request)
{
    if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var instanceActor))
    {
        Sender.Tell(new RouteToWaitForAttributeResponse(
            request.CorrelationId, false, null, null, false,
            false, $"Instance '{request.InstanceUniqueName}' not found on this site.",
            DateTimeOffset.UtcNow));
        return;
    }

    // Capture Sender before going async — it is not valid inside the continuation.
    var sender = Sender;

    // Routed waits are value-equality only (predicate null); RequireGoodQuality left at default.
    var inner = new WaitForAttributeRequest(
        request.CorrelationId, request.InstanceUniqueName, request.AttributeName,
        request.TargetValueEncoded, null, request.Timeout, DateTimeOffset.UtcNow);

    // Ask bounded by the WAIT timeout + slack — NOT a fixed 30s (the wait legitimately blocks up to request.Timeout).
    instanceActor.Ask<WaitForAttributeResponse>(inner, request.Timeout + TimeSpan.FromSeconds(5))
        .ContinueWith(t =>
        {
            if (!t.IsCompletedSuccessfully)
            {
                // Faulted or cancelled Ask: surface the root cause when there is one.
                return new RouteToWaitForAttributeResponse(
                    request.CorrelationId, false, null, null, false,
                    false, t.Exception?.GetBaseException().Message ?? "Attribute wait timed out",
                    DateTimeOffset.UtcNow);
            }

            var result = t.Result;

            // Propagate an instance-side refusal: a non-null ErrorMessage means the
            // wait was never honoured, so the routed reply must not claim success.
            // NOTE(review): the 6th/7th positional arguments are taken to be the
            // success flag and error text, mirroring the not-found reply above — confirm.
            return new RouteToWaitForAttributeResponse(
                request.CorrelationId, result.Matched, result.Value, result.Quality, result.TimedOut,
                result.ErrorMessage is null, result.ErrorMessage, DateTimeOffset.UtcNow);
        })
        .PipeTo(sender);
}
/// <summary>
/// Writes attribute values on a deployed instance for a Route.To().SetAttribute(s)
/// call (or a central Test Run bound to the instance). Each write is Ask'd to the
@@ -68,6 +68,18 @@ public class InstanceActor : ReceiveActor
// mirroring the rest of the actor's by-name dictionaries).
private readonly Dictionary<string, ResolvedAttribute> _resolvedAttributeByName = new();
// WaitForAttribute (spec §4.2): one-shot waiter registry keyed by the
// request CorrelationId. Each entry holds the watched attribute name, the
// match test (codec-encoded target equality, a site-local predicate, or
// "any change"), the original Sender to reply to, the scheduled-timeout
// handle so a match can cancel it, and the quality-gate flag.
// Single-threaded actor access — no locking needed.
private readonly Dictionary<string, PendingWait> _attributeWaiters = new();
// WaitForAttribute: defensive per-instance cap so a script leaking waiters
// in a loop cannot grow the registry without bound. Exceeding it refuses the
// wait with an error reply rather than registering.
private const int MaxAttributeWaiters = 100;
// DCL manager actor reference for subscribing to tag values
private readonly IActorRef? _dclManager;
// Maps each tag path to every attribute canonical name that references it.
@@ -170,6 +182,12 @@ public class InstanceActor : ReceiveActor
// WP-22/23: Handle attribute value changes from DCL (Tell pattern)
Receive<AttributeValueChanged>(HandleAttributeValueChanged);
// WaitForAttribute (spec §4.2): event-driven "wait for value" waiter
// registration + its scheduled-timeout self-message. Both flow only
// site-locally (the predicate variant carries a non-serializable delegate).
Receive<WaitForAttributeRequest>(HandleWaitForAttribute);
Receive<WaitForAttributeTimeout>(HandleWaitForAttributeTimeout);
// Handle tag value updates from DCL — convert to AttributeValueChanged
Receive<TagValueUpdate>(HandleTagValueUpdate);
Receive<SubscribeTagsResponse>(_ => { }); // Ack from DCL subscribe — no action needed
@@ -519,6 +537,114 @@ public class InstanceActor : ReceiveActor
PublishAndNotifyChildren(changed);
}
/// <summary>
/// WaitForAttribute (spec §4.2): registers a one-shot event-driven waiter for
/// an attribute to reach a value (encoded-equality), satisfy a site-local
/// predicate, or change at all. The current-value fast-path and the
/// change-handling in <see cref="HandleAttributeValueChanged"/> both run on
/// this single-threaded actor, so a value that flips between "read current"
/// and "register" cannot be missed (spec §5).
///
/// <para>
/// Replies sent from here: an immediate Matched reply when the current value
/// already satisfies a value/predicate test; an error reply when the predicate
/// throws, the timeout is negative, or the per-instance waiter cap is reached.
/// Otherwise the waiter is registered and a timeout self-message is scheduled.
/// </para>
///
/// <para>
/// An "any change" waiter (no predicate, no encoded target) never takes the
/// fast path: its test accepts every value, so evaluating it against the
/// current value would reply Matched without any change having happened.
/// </para>
/// </summary>
/// <param name="req">The wait request: attribute name, optional predicate or encoded target, timeout and quality-gate flag.</param>
private void HandleWaitForAttribute(WaitForAttributeRequest req)
{
    // Capture the sender immediately — Sender is invalid once we schedule /
    // return and a later message arrives.
    var replyer = Sender;

    // Build the match test: explicit predicate wins; else null encoded target
    // means "any change"; else compare the codec-encoded current value to the
    // encoded target (avoids needing the attribute's DataType to decode).
    var isAnyChange = req.Predicate is null && req.TargetValueEncoded is null;
    Func<object?, bool> test;
    if (req.Predicate is not null)
    {
        test = req.Predicate;
    }
    else if (isAnyChange)
    {
        test = _ => true;
    }
    else
    {
        var target = req.TargetValueEncoded;
        test = v => string.Equals(
            AttributeValueCodec.Encode(v), target, StringComparison.Ordinal);
    }

    // Fast path: the current value already satisfies the test → reply now.
    // Skipped for "any change" waiters, which must observe a real change.
    // A script-supplied predicate (or the codec-equality lambda) runs on the
    // actor thread; guard it so a throwing predicate cannot crash the actor or
    // leak a never-resolved waiter. On throw: reply non-matched + ErrorMessage
    // and return WITHOUT registering (no timeout scheduled).
    if (!isAnyChange && _attributes.TryGetValue(req.AttributeName, out var current))
    {
        // Effective quality used for BOTH the §4.2 quality gate and the match
        // reply: a missing quality entry is treated as "Good".
        _attributeQualities.TryGetValue(req.AttributeName, out var fastQuality);
        var effectiveQuality = fastQuality ?? "Good";

        bool fastMatch;
        try
        {
            // §4.2 quality gate ANDed with the value test, both INSIDE the guard:
            // in quality-gated mode a value already at target but at Bad/Uncertain
            // quality is NOT a fast match — it falls through to register + schedule
            // the timeout like any other pending waiter (do NOT fast-reply matched).
            fastMatch =
                (!req.RequireGoodQuality
                 || string.Equals(effectiveQuality, "Good", StringComparison.Ordinal))
                && test(current);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "WaitForAttribute predicate threw on the fast-path for {Instance}.{Attribute}; refusing the wait",
                _instanceUniqueName, req.AttributeName);
            replyer.Tell(new WaitForAttributeResponse(
                req.CorrelationId, Matched: false, null, null, TimedOut: false,
                ErrorMessage: "Wait predicate threw: " + ex.Message));
            return;
        }

        if (fastMatch)
        {
            replyer.Tell(new WaitForAttributeResponse(
                req.CorrelationId, Matched: true, current, effectiveQuality, TimedOut: false));
            return;
        }
    }

    // Refuse a negative timeout with an error reply instead of handing it to the
    // scheduler. NOTE(review): Akka's scheduler is expected to reject negative
    // delays with an exception, which would fault this actor — confirm.
    if (req.Timeout < TimeSpan.Zero)
    {
        replyer.Tell(new WaitForAttributeResponse(
            req.CorrelationId, Matched: false, null, null, TimedOut: false,
            ErrorMessage: "Wait timeout must not be negative"));
        return;
    }

    // A waiter already registered under this CorrelationId is replaced. Cancel
    // its timeout first: left live, it would later evict the NEW waiter (the
    // timeout handler looks waiters up by CorrelationId only) and report a
    // premature TimedOut. The displaced waiter's sender is told why.
    if (_attributeWaiters.Remove(req.CorrelationId, out var superseded))
    {
        superseded.Timeout.Cancel();
        superseded.Replyer.Tell(new WaitForAttributeResponse(
            req.CorrelationId, Matched: false, null, null, TimedOut: false,
            ErrorMessage: "Wait superseded by a newer request with the same correlation id"));
    }

    // Defensive cap: refuse rather than register if the instance already has
    // too many concurrent waiters (guards against a script leaking waiters).
    if (_attributeWaiters.Count >= MaxAttributeWaiters)
    {
        replyer.Tell(new WaitForAttributeResponse(
            req.CorrelationId, Matched: false, null, null, TimedOut: false,
            ErrorMessage: "Too many concurrent attribute waiters on this instance"));
        return;
    }

    // Register and schedule the self-evicting timeout: the self-message removes
    // the waiter and replies TimedOut unless a match cancels the handle first.
    var handle = Context.System.Scheduler.ScheduleTellOnceCancelable(
        req.Timeout, Self, new WaitForAttributeTimeout(req.CorrelationId), Self);
    _attributeWaiters[req.CorrelationId] =
        new PendingWait(req.AttributeName, test, replyer, handle, req.RequireGoodQuality);
}
/// <summary>
/// WaitForAttribute (spec §4.2): handles the scheduled timeout self-message of a
/// waiter. When the waiter is still in the registry it is evicted and its
/// original sender receives a TimedOut reply; when it is no longer registered
/// (already resolved and removed) the message is ignored.
/// </summary>
/// <param name="msg">The timeout self-message carrying the waiter's correlation id.</param>
private void HandleWaitForAttributeTimeout(WaitForAttributeTimeout msg)
{
    // Remove-and-fetch in one step; nothing to do if the waiter is already gone.
    if (!_attributeWaiters.Remove(msg.CorrelationId, out var expired))
    {
        return;
    }

    var timedOutReply = new WaitForAttributeResponse(
        msg.CorrelationId, Matched: false, null, null, TimedOut: true);
    expired.Replyer.Tell(timedOutReply);
}
/// <summary>
/// Handles tag value updates from DCL. Maps the tag path back to the attribute
/// canonical name and converts to an AttributeValueChanged for unified processing.
@@ -556,9 +682,14 @@ public class InstanceActor : ReceiveActor
_attributeQualities[attrName] = "Bad";
_attributeTimestamps[attrName] = update.Timestamp;
var currentValue = _attributes.GetValueOrDefault(attrName);
// WaitForAttribute (spec §4.2): quality-only republish — the
// stored value is UNCHANGED (we publish the OLD currentValue, only
// the quality flips to Bad). Do NOT evaluate waiters, or an
// "any-change" / unchanged-value-equality waiter would fire on a
// non-change.
PublishAndNotifyChildren(new AttributeValueChanged(
_instanceUniqueName, update.TagPath, attrName,
currentValue, "Bad", update.Timestamp));
currentValue, "Bad", update.Timestamp), evaluateWaiters: false);
}
continue;
}
@@ -908,7 +1039,17 @@ public class InstanceActor : ReceiveActor
/// Publishes attribute change to stream and notifies child Script/Alarm actors.
/// WP-22: Tell for attribute notifications (fire-and-forget, never blocks).
/// </summary>
private void PublishAndNotifyChildren(AttributeValueChanged changed)
/// <param name="changed">The attribute change to publish.</param>
/// <param name="evaluateWaiters">
/// WaitForAttribute (spec §4.2): when <c>true</c> (the default), registered
/// <c>Attributes.WaitAsync</c> waiters on this attribute are re-evaluated against
/// <paramref name="changed"/>'s value. Pass <c>false</c> on republish/quality-only
/// paths that do NOT assign a new value to <c>_attributes[name]</c> (e.g. the
/// List-coerce-failure Bad-quality republish, which publishes the OLD value) —
/// otherwise an "any-change" waiter (or a waiter whose target equals the unchanged
/// value) would spuriously fire even though nothing actually changed.
/// </param>
private void PublishAndNotifyChildren(AttributeValueChanged changed, bool evaluateWaiters = true)
{
// WP-23: Publish to site-wide stream
_streamManager?.PublishAttributeValueChanged(changed);
@@ -924,6 +1065,83 @@ public class InstanceActor : ReceiveActor
{
alarmActor.Tell(changed);
}
// WaitForAttribute (spec §4.2): re-evaluate any waiters on THIS attribute —
// but ONLY when this publish reflects a real value change (evaluateWaiters).
// The genuine value-change paths (HandleAttributeValueChanged, the scalar
// DCL update path, HandleSetStaticAttributeCore) call it AFTER assigning
// _attributes[name], so changed.Value is the just-applied current value.
// Republish/quality-only paths (List-coerce-failure Bad-quality, which
// publishes the OLD value) pass evaluateWaiters:false so an "any-change" or
// unchanged-value-equality waiter does not spuriously fire (spec §4.2).
// Iterate a snapshot so satisfied waiters can be removed during the loop;
// each match cancels its scheduled timeout (so no stray WaitForAttributeTimeout
// follows) and replies Matched=true.
if (evaluateWaiters)
ResolveMatchedWaiters(changed);
}
/// <summary>
/// WaitForAttribute (spec §4.2): re-evaluates every waiter registered on the
/// attribute named by <paramref name="changed"/> against the published value.
/// A waiter whose test passes (and, in quality-gated mode, whose change carries
/// "Good" quality) has its timeout cancelled, is sent a Matched reply and is
/// removed from the registry. Returns immediately when no waiters exist.
///
/// <para>
/// The test of each waiter is evaluated under its own try/catch so that one
/// throwing predicate neither stops the loop for its siblings nor stays
/// registered with a live timeout: the throwing waiter is logged, its timeout
/// cancelled, an error reply sent, and the waiter removed.
/// </para>
/// </summary>
/// <param name="changed">The attribute change whose value and quality the waiters are tested against.</param>
private void ResolveMatchedWaiters(AttributeValueChanged changed)
{
    if (_attributeWaiters.Count == 0)
    {
        return;
    }

    // Copy out the waiters watching THIS attribute before touching the registry:
    // entries are removed below, and no test runs while the copy is being built.
    var snapshot = new List<KeyValuePair<string, PendingWait>>();
    foreach (var entry in _attributeWaiters)
    {
        if (entry.Value.AttributeName == changed.AttributeName)
        {
            snapshot.Add(entry);
        }
    }

    foreach (var entry in snapshot)
    {
        var correlationId = entry.Key;
        var waiter = entry.Value;

        // Non-null only when the waiter's test threw.
        string? failure = null;
        var satisfied = false;
        try
        {
            // §4.2 quality gate first, then the value test — both inside the guard.
            // In quality-gated mode a non-"Good" change leaves the waiter pending
            // without running its test.
            var qualityAccepted =
                !waiter.RequireGoodQuality
                || string.Equals(changed.Quality, "Good", StringComparison.Ordinal);
            satisfied = qualityAccepted && waiter.Test(changed.Value);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "WaitForAttribute predicate threw while resolving waiter {CorrelationId} on {Instance}.{Attribute}; evicting it",
                correlationId, _instanceUniqueName, changed.AttributeName);
            failure = "Wait predicate threw: " + ex.Message;
        }

        // Still pending: neither matched nor failed.
        if (failure is null && !satisfied)
        {
            continue;
        }

        // Resolve the waiter (match or failure): cancel, reply, evict.
        waiter.Timeout.Cancel();
        if (failure is not null)
        {
            waiter.Replyer.Tell(new WaitForAttributeResponse(
                correlationId, Matched: false, null, null, TimedOut: false,
                ErrorMessage: failure));
        }
        else
        {
            waiter.Replyer.Tell(new WaitForAttributeResponse(
                correlationId, Matched: true, changed.Value, changed.Quality, TimedOut: false));
        }

        _attributeWaiters.Remove(correlationId);
    }
}
/// <summary>
@@ -1202,4 +1420,23 @@ public class InstanceActor : ReceiveActor
/// Internal message for async override loading result.
/// </summary>
internal record LoadOverridesResult(Dictionary<string, string> Overrides, string? Error);
/// <summary>
/// WaitForAttribute (spec §4.2): one registered, not-yet-satisfied waiter, stored
/// in the waiter registry under its request correlation id.
/// </summary>
/// <param name="AttributeName">The attribute this waiter watches; compared against the changed attribute's name when resolving.</param>
/// <param name="Test">The match test: codec-encoded target equality, a site-local predicate, or an always-true "any change" test.</param>
/// <param name="Replyer">The original sender to reply to on match, predicate failure, or timeout.</param>
/// <param name="Timeout">The scheduled timeout handle; canceled when the waiter is resolved before the timeout fires.</param>
/// <param name="RequireGoodQuality">
/// Quality-gated ("Good"-only) mode (spec §4.2): when <c>true</c>, the resolve
/// loop additionally requires <c>changed.Quality == "Good"</c> before the test
/// can match.
/// </param>
private sealed record PendingWait(
string AttributeName,
Func<object?, bool> Test,
IActorRef Replyer,
ICancelable Timeout,
bool RequireGoodQuality);
}
@@ -221,7 +221,12 @@ public class ScriptExecutionActor : ReceiveActor
// M2.12 (#25): thread the singleton site event logger so
// recursion-limit violations at CallScript/CallShared emit a
// script Error site event in addition to ILogger.LogError.
siteEventLogger: siteEventLogger);
siteEventLogger: siteEventLogger,
// WaitForAttribute (spec §4.3/§4.4): thread the per-script
// execution-timeout token so Attributes.WaitAsync's Ask is
// bounded by the script's own ExecutionTimeoutSeconds — a
// shorter script deadline wins over the wait's own timeout.
scriptTimeoutToken: cts.Token);
var globals = new ScriptGlobals
{