From 0ae715cca4714b8d2aae0b42b5d87246104a3cfc Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 20 Apr 2026 16:38:43 -0400 Subject: [PATCH] =?UTF-8?q?Phase=207=20Stream=20A.2=20=E2=80=94=20compile?= =?UTF-8?q?=20cache=20+=20per-evaluation=20timeout=20wrapper.=20Second=20o?= =?UTF-8?q?f=203=20increments=20within=20Stream=20A.=20Adds=20two=20indepe?= =?UTF-8?q?ndent=20resilience=20primitives=20that=20the=20virtual-tag=20en?= =?UTF-8?q?gine=20(Stream=20B)=20and=20scripted-alarm=20engine=20(Stream?= =?UTF-8?q?=20C)=20will=20compose=20with=20the=20base=20ScriptEvaluator.?= =?UTF-8?q?=20Both=20are=20generic=20on=20(TContext,=20TResult)=20so=20dif?= =?UTF-8?q?ferent=20engines=20get=20their=20own=20instances=20without=20cr?= =?UTF-8?q?oss-contamination.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CompiledScriptCache — source-hash-keyed cache of compiled evaluators. Roslyn compilation is the most expensive step in the evaluator pipeline (5-20ms per script depending on size); re-compiling on every value-change event would starve the engine. ConcurrentDictionary of Lazy with ExecutionAndPublication mode ensures concurrent callers never double-compile even on a cold cache race. Failed compiles evict the cache entry so an Admin UI retry with corrected source actually recompiles (otherwise the cached exception would persist). Whitespace-sensitive hash — reformatting a script misses the cache on purpose, simpler than AST-canonicalize and happens rarely. No capacity bound because virtual-tag + alarm scripts are config-DB bounded (thousands, not millions); if scale pushes past that in v3 an LRU eviction slots in behind the same API. TimedScriptEvaluator — wraps a ScriptEvaluator with a per-evaluation wall-clock timeout (default 250ms per Phase 7 plan Stream A.4, configurable per tag so slower backends can widen). Critical implementation detail: the underlying Roslyn ScriptRunner executes synchronously on the calling thread for CPU-bound user scripts, returning an already-completed Task before the caller can register a timeout. Naive `Task.WaitAsync(timeout)` would see the completed task and never fire. Fix: push evaluation to a thread-pool thread via Task.Run, so the caller's thread is free to wait and the timeout reliably fires after the configured budget. Known trade-off (documented in the class summary): when a script times out, the underlying evaluation task continues running on the thread-pool thread until Roslyn returns; in the CPU-bound-infinite-loop case it's effectively leaked until the runtime decides to unwind. Tighter CPU budgeting would require an out-of-process script runner (v3 concern). In practice the timeout + structured warning log surfaces the offending script so the operator fixes it, and the orphan thread is rare. Caller-supplied CancellationToken is honored and takes precedence over the timeout, so driver-shutdown paths see a clean OperationCanceledException rather than a misclassified ScriptTimeoutException. ScriptTimeoutException carries the configured Timeout and a diagnostic message pointing the operator at ctx.Logger output around the failure plus suggesting widening the timeout, simplifying the script, or moving heavy work out of the evaluation path. The virtual-tag engine (Stream B) will catch this and map the owning tag's quality to BadInternalError per Phase 7 decision #11, logging a structured warning with the offending script name. Tests: CompiledScriptCacheTests (10) — first-call compile, identical-source dedupe to same instance, different-source produces different evaluator, whitespace-sensitivity documented, cached evaluator still runs correctly, failed compile evicted for retry, Clear drops entries, concurrent GetOrCompile of the same source deduplicates to one instance, different TContext/TResult use separate cache instances, null source rejected. TimedScriptEvaluatorTests (9) — fast script completes under timeout, CPU-bound script throws ScriptTimeoutException, caller cancellation takes precedence over timeout (shutdown path correctness), default 250ms per plan, zero/negative timeout rejected at construction, null inner rejected, null context rejected, user-thrown exceptions propagate unwrapped (not conflated with timeout), timeout exception message contains diagnostic guidance. Full suite: 48/48 green (29 from A.1 + 19 new). Next: Stream A.3 wires the dedicated scripts-*.log Serilog rolling sink + structured-property filtering + companion-WARN enricher to the main log, closing out Stream A. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../CompiledScriptCache.cs | 83 ++++++++++ .../TimedScriptEvaluator.cs | 102 ++++++++++++ .../CompiledScriptCacheTests.cs | 151 ++++++++++++++++++ .../TimedScriptEvaluatorTests.cs | 134 ++++++++++++++++ 4 files changed, 470 insertions(+) create mode 100644 src/ZB.MOM.WW.OtOpcUa.Core.Scripting/CompiledScriptCache.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Core.Scripting/TimedScriptEvaluator.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/CompiledScriptCacheTests.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/TimedScriptEvaluatorTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Core.Scripting/CompiledScriptCache.cs b/src/ZB.MOM.WW.OtOpcUa.Core.Scripting/CompiledScriptCache.cs new file mode 100644 index 0000000..4b59278 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Core.Scripting/CompiledScriptCache.cs @@ -0,0 +1,83 @@ +using System.Collections.Concurrent; +using System.Security.Cryptography; +using System.Text; + +namespace ZB.MOM.WW.OtOpcUa.Core.Scripting; + +/// +/// Source-hash-keyed compile cache for user scripts. Roslyn compilation is the most +/// expensive step in the evaluator pipeline (5-20ms per script depending on size); +/// re-compiling on every value-change event would starve the virtual-tag engine. +/// The cache is generic on the subclass + result type so +/// different engines (virtual-tag / alarm-predicate / future alarm-action) each get +/// their own cache instance — there's no cross-type pollution. +/// +/// +/// +/// Concurrent-safe: of +/// means a miss on two threads compiles exactly once. +/// guarantees other +/// threads block on the in-flight compile rather than racing to duplicate work. +/// +/// +/// Cache is keyed on SHA-256 of the UTF-8 bytes of the source — collision-free in +/// practice. Whitespace changes therefore miss the cache on purpose; operators +/// see re-compile time on their first evaluation after a format-only edit which +/// is rare and benign. +/// +/// +/// No capacity bound. Virtual-tag + alarm scripts are operator-authored and +/// bounded by config DB (typically low thousands). If that changes in v3, add an +/// LRU eviction policy — the API stays the same. +/// +/// +public sealed class CompiledScriptCache + where TContext : ScriptContext +{ + private readonly ConcurrentDictionary>> _cache = new(); + + /// + /// Return the compiled evaluator for , compiling + /// on first sight + reusing thereafter. If the source fails to compile, the + /// original Roslyn / sandbox exception propagates; the cache entry is removed so + /// the next call retries (useful during Admin UI authoring when the operator is + /// still fixing syntax). + /// + public ScriptEvaluator GetOrCompile(string scriptSource) + { + if (scriptSource is null) throw new ArgumentNullException(nameof(scriptSource)); + + var key = HashSource(scriptSource); + var lazy = _cache.GetOrAdd(key, _ => new Lazy>( + () => ScriptEvaluator.Compile(scriptSource), + LazyThreadSafetyMode.ExecutionAndPublication)); + + try + { + return lazy.Value; + } + catch + { + // Failed compile — evict so a retry with corrected source can succeed. + _cache.TryRemove(key, out _); + throw; + } + } + + /// Current entry count. Exposed for Admin UI diagnostics / tests. + public int Count => _cache.Count; + + /// Drop every cached compile. Used on config generation publish + tests. + public void Clear() => _cache.Clear(); + + /// True when the exact source has been compiled at least once + is still cached. + public bool Contains(string scriptSource) + => _cache.ContainsKey(HashSource(scriptSource)); + + private static string HashSource(string source) + { + var bytes = Encoding.UTF8.GetBytes(source); + var hash = SHA256.HashData(bytes); + return Convert.ToHexString(hash); + } +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Core.Scripting/TimedScriptEvaluator.cs b/src/ZB.MOM.WW.OtOpcUa.Core.Scripting/TimedScriptEvaluator.cs new file mode 100644 index 0000000..237435f --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Core.Scripting/TimedScriptEvaluator.cs @@ -0,0 +1,102 @@ +namespace ZB.MOM.WW.OtOpcUa.Core.Scripting; + +/// +/// Wraps a with a per-evaluation +/// wall-clock timeout. Default is 250ms per Phase 7 plan Stream A.4; configurable +/// per tag so deployments with slower backends can widen it. +/// +/// +/// +/// Implemented with +/// rather than a cancellation-token-only approach because Roslyn-compiled +/// scripts don't internally poll the cancellation token unless the user code +/// does async work. A CPU-bound infinite loop in a script won't honor a +/// cooperative cancel — WaitAsync returns control when the timeout fires +/// regardless of whether the inner task completes. +/// +/// +/// Known limitation: when a script times out, the underlying ScriptRunner +/// task continues running on a thread-pool thread until the Roslyn runtime +/// returns. In the CPU-bound-infinite-loop case that's effectively "leaked" — +/// the thread is tied up until the runtime decides to return, which it may +/// never do. Phase 7 plan Stream A.4 accepts this as a known trade-off; tighter +/// CPU budgeting would require an out-of-process script runner, which is a v3 +/// concern. In practice, the timeout + structured warning log surfaces the +/// offending script so the operator can fix it; the orphan thread is rare. +/// +/// +/// Caller-supplied is honored — if the caller +/// cancels before the timeout fires, the caller's cancel wins and the +/// propagates (not wrapped as +/// ). That distinction matters: the +/// virtual-tag engine's shutdown path cancels scripts on dispose; it shouldn't +/// see those as timeouts. +/// +/// +public sealed class TimedScriptEvaluator + where TContext : ScriptContext +{ + /// Default timeout per Phase 7 plan Stream A.4 — 250ms. + public static readonly TimeSpan DefaultTimeout = TimeSpan.FromMilliseconds(250); + + private readonly ScriptEvaluator _inner; + + /// Wall-clock budget per evaluation. Script exceeding this throws . + public TimeSpan Timeout { get; } + + public TimedScriptEvaluator(ScriptEvaluator inner) + : this(inner, DefaultTimeout) + { + } + + public TimedScriptEvaluator(ScriptEvaluator inner, TimeSpan timeout) + { + _inner = inner ?? throw new ArgumentNullException(nameof(inner)); + if (timeout <= TimeSpan.Zero) + throw new ArgumentOutOfRangeException(nameof(timeout), "Timeout must be positive."); + Timeout = timeout; + } + + public async Task RunAsync(TContext context, CancellationToken ct = default) + { + if (context is null) throw new ArgumentNullException(nameof(context)); + + // Push evaluation to a thread-pool thread so a CPU-bound script (e.g. a tight + // loop with no async work) doesn't hog the caller's thread before WaitAsync + // gets to register its timeout. Without this, Roslyn's ScriptRunner executes + // synchronously on the calling thread and returns an already-completed Task, + // so WaitAsync sees a completed task and never fires the timeout. + var runTask = Task.Run(() => _inner.RunAsync(context, ct), ct); + try + { + return await runTask.WaitAsync(Timeout, ct).ConfigureAwait(false); + } + catch (TimeoutException) + { + // WaitAsync's synthesized timeout — the inner task may still be running + // on its thread-pool thread (known leak documented in the class summary). + // Wrap so callers can distinguish from user-written timeout logic. + throw new ScriptTimeoutException(Timeout); + } + } +} + +/// +/// Thrown when a script evaluation exceeds its configured timeout. The virtual-tag +/// engine (Stream B) catches this + maps the owning tag's quality to +/// BadInternalError per Phase 7 plan decision #11, logging a structured +/// warning with the offending script name so operators can locate + fix it. +/// +public sealed class ScriptTimeoutException : Exception +{ + public TimeSpan Timeout { get; } + + public ScriptTimeoutException(TimeSpan timeout) + : base($"Script evaluation exceeded the configured timeout of {timeout.TotalMilliseconds:F1} ms. " + + "The script was either CPU-bound or blocked on a slow operation; check ctx.Logger output " + + "around the timeout and consider widening the timeout per tag, simplifying the script, or " + + "moving heavy work out of the evaluation path.") + { + Timeout = timeout; + } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/CompiledScriptCacheTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/CompiledScriptCacheTests.cs new file mode 100644 index 0000000..3fa9ce7 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/CompiledScriptCacheTests.cs @@ -0,0 +1,151 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Scripting; + +namespace ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests; + +/// +/// Exercises the source-hash keyed compile cache. Roslyn compilation is the most +/// expensive step in the evaluator pipeline; this cache collapses redundant +/// compiles of unchanged scripts to zero-cost lookups + makes sure concurrent +/// callers never double-compile. +/// +[Trait("Category", "Unit")] +public sealed class CompiledScriptCacheTests +{ + private sealed class CompileCountingGate + { + public int Count; + } + + [Fact] + public void First_call_compiles_and_caches() + { + var cache = new CompiledScriptCache(); + cache.Count.ShouldBe(0); + + var e = cache.GetOrCompile("""return 42;"""); + e.ShouldNotBeNull(); + cache.Count.ShouldBe(1); + cache.Contains("""return 42;""").ShouldBeTrue(); + } + + [Fact] + public void Identical_source_returns_the_same_compiled_evaluator() + { + var cache = new CompiledScriptCache(); + var first = cache.GetOrCompile("""return 1;"""); + var second = cache.GetOrCompile("""return 1;"""); + ReferenceEquals(first, second).ShouldBeTrue(); + cache.Count.ShouldBe(1); + } + + [Fact] + public void Different_source_produces_different_evaluator() + { + var cache = new CompiledScriptCache(); + var a = cache.GetOrCompile("""return 1;"""); + var b = cache.GetOrCompile("""return 2;"""); + ReferenceEquals(a, b).ShouldBeFalse(); + cache.Count.ShouldBe(2); + } + + [Fact] + public void Whitespace_difference_misses_cache() + { + // Documented behavior: reformatting a script recompiles. Simpler + cheaper + // than the alternative (AST-canonicalize then hash) and doesn't happen often. + var cache = new CompiledScriptCache(); + cache.GetOrCompile("""return 1;"""); + cache.GetOrCompile("return 1; "); // trailing whitespace — different hash + cache.Count.ShouldBe(2); + } + + [Fact] + public async Task Cached_evaluator_still_runs_correctly() + { + var cache = new CompiledScriptCache(); + var e = cache.GetOrCompile("""return (double)ctx.GetTag("In").Value * 3.0;"""); + var ctx = new FakeScriptContext().Seed("In", 7.0); + + // Run twice through the cache — both must return the same correct value. + var first = await e.RunAsync(ctx, TestContext.Current.CancellationToken); + var second = await cache.GetOrCompile("""return (double)ctx.GetTag("In").Value * 3.0;""") + .RunAsync(ctx, TestContext.Current.CancellationToken); + first.ShouldBe(21.0); + second.ShouldBe(21.0); + } + + [Fact] + public void Failed_compile_is_evicted_so_retry_with_corrected_source_works() + { + var cache = new CompiledScriptCache(); + + // First attempt — undefined identifier, compile throws. + Should.Throw(() => cache.GetOrCompile("""return unknownIdentifier + 1;""")); + cache.Count.ShouldBe(0, "failed compile must be evicted so retry can re-attempt"); + + // Retry with corrected source succeeds + caches. + cache.GetOrCompile("""return 42;""").ShouldNotBeNull(); + cache.Count.ShouldBe(1); + } + + [Fact] + public void Clear_drops_every_entry() + { + var cache = new CompiledScriptCache(); + cache.GetOrCompile("""return 1;"""); + cache.GetOrCompile("""return 2;"""); + cache.Count.ShouldBe(2); + + cache.Clear(); + cache.Count.ShouldBe(0); + cache.Contains("""return 1;""").ShouldBeFalse(); + } + + [Fact] + public void Concurrent_compiles_of_the_same_source_deduplicate() + { + // LazyThreadSafetyMode.ExecutionAndPublication guarantees only one compile + // even when multiple threads race GetOrCompile against an empty cache. + // We can't directly count Roslyn compilations — but we can assert all + // concurrent callers see the same evaluator instance. + var cache = new CompiledScriptCache(); + const string src = """return 99;"""; + + var tasks = Enumerable.Range(0, 20) + .Select(_ => Task.Run(() => cache.GetOrCompile(src))) + .ToArray(); + Task.WhenAll(tasks).GetAwaiter().GetResult(); + + var firstInstance = tasks[0].Result; + foreach (var t in tasks) + ReferenceEquals(t.Result, firstInstance).ShouldBeTrue(); + cache.Count.ShouldBe(1); + } + + [Fact] + public void Different_TContext_TResult_pairs_use_separate_cache_instances() + { + // Documented: each engine (virtual-tag / alarm-predicate / alarm-action) owns + // its own cache. The type-parametric design makes this the default without + // cross-contamination at the dictionary level. + var intCache = new CompiledScriptCache(); + var boolCache = new CompiledScriptCache(); + + intCache.GetOrCompile("""return 1;"""); + boolCache.GetOrCompile("""return true;"""); + + intCache.Count.ShouldBe(1); + boolCache.Count.ShouldBe(1); + intCache.Contains("""return true;""").ShouldBeFalse(); + boolCache.Contains("""return 1;""").ShouldBeFalse(); + } + + [Fact] + public void Null_source_throws_ArgumentNullException() + { + var cache = new CompiledScriptCache(); + Should.Throw(() => cache.GetOrCompile(null!)); + } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/TimedScriptEvaluatorTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/TimedScriptEvaluatorTests.cs new file mode 100644 index 0000000..765f0f1 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests/TimedScriptEvaluatorTests.cs @@ -0,0 +1,134 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Scripting; + +namespace ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests; + +/// +/// Verifies the per-evaluation timeout wrapper. Fast scripts complete normally; +/// CPU-bound or hung scripts throw instead of +/// starving the engine. Caller-supplied cancellation tokens take precedence over the +/// timeout so driver-shutdown paths see a clean cancel rather than a timeout. +/// +[Trait("Category", "Unit")] +public sealed class TimedScriptEvaluatorTests +{ + [Fact] + public async Task Fast_script_completes_under_timeout_and_returns_value() + { + var inner = ScriptEvaluator.Compile( + """return (double)ctx.GetTag("In").Value + 1.0;"""); + var timed = new TimedScriptEvaluator( + inner, TimeSpan.FromSeconds(1)); + + var ctx = new FakeScriptContext().Seed("In", 41.0); + var result = await timed.RunAsync(ctx, TestContext.Current.CancellationToken); + result.ShouldBe(42.0); + } + + [Fact] + public async Task Script_longer_than_timeout_throws_ScriptTimeoutException() + { + // Scripts can't easily do Thread.Sleep in the sandbox (System.Threading.Thread + // is denied). But a tight CPU loop exceeds any short timeout. + var inner = ScriptEvaluator.Compile( + """ + var end = Environment.TickCount64 + 5000; + while (Environment.TickCount64 < end) { } + return 1; + """); + var timed = new TimedScriptEvaluator( + inner, TimeSpan.FromMilliseconds(50)); + + var ex = await Should.ThrowAsync(async () => + await timed.RunAsync(new FakeScriptContext(), TestContext.Current.CancellationToken)); + ex.Timeout.ShouldBe(TimeSpan.FromMilliseconds(50)); + ex.Message.ShouldContain("50.0"); + } + + [Fact] + public async Task Caller_cancellation_takes_precedence_over_timeout() + { + // A CPU-bound script that would otherwise timeout; external ct fires first. + // Expected: OperationCanceledException (not ScriptTimeoutException) so shutdown + // paths aren't misclassified as timeouts. + var inner = ScriptEvaluator.Compile( + """ + var end = Environment.TickCount64 + 10000; + while (Environment.TickCount64 < end) { } + return 1; + """); + var timed = new TimedScriptEvaluator( + inner, TimeSpan.FromSeconds(5)); + + using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(80)); + await Should.ThrowAsync(async () => + await timed.RunAsync(new FakeScriptContext(), cts.Token)); + } + + [Fact] + public void Default_timeout_is_250ms_per_plan() + { + TimedScriptEvaluator.DefaultTimeout + .ShouldBe(TimeSpan.FromMilliseconds(250)); + } + + [Fact] + public void Zero_or_negative_timeout_is_rejected_at_construction() + { + var inner = ScriptEvaluator.Compile("""return 1;"""); + Should.Throw(() => + new TimedScriptEvaluator(inner, TimeSpan.Zero)); + Should.Throw(() => + new TimedScriptEvaluator(inner, TimeSpan.FromMilliseconds(-1))); + } + + [Fact] + public void Null_inner_is_rejected() + { + Should.Throw(() => + new TimedScriptEvaluator(null!)); + } + + [Fact] + public void Null_context_is_rejected() + { + var inner = ScriptEvaluator.Compile("""return 1;"""); + var timed = new TimedScriptEvaluator(inner); + Should.ThrowAsync(async () => + await timed.RunAsync(null!, TestContext.Current.CancellationToken)); + } + + [Fact] + public async Task Script_exception_propagates_unwrapped() + { + // User-thrown exceptions must come through as-is — NOT wrapped in + // ScriptTimeoutException. The virtual-tag engine catches them per-tag and + // maps to BadInternalError; conflating with timeout would lose that info. + var inner = ScriptEvaluator.Compile( + """throw new InvalidOperationException("script boom");"""); + var timed = new TimedScriptEvaluator(inner, TimeSpan.FromSeconds(1)); + + var ex = await Should.ThrowAsync(async () => + await timed.RunAsync(new FakeScriptContext(), TestContext.Current.CancellationToken)); + ex.Message.ShouldBe("script boom"); + } + + [Fact] + public async Task ScriptTimeoutException_message_points_at_diagnostic_path() + { + var inner = ScriptEvaluator.Compile( + """ + var end = Environment.TickCount64 + 5000; + while (Environment.TickCount64 < end) { } + return 1; + """); + var timed = new TimedScriptEvaluator( + inner, TimeSpan.FromMilliseconds(30)); + + var ex = await Should.ThrowAsync(async () => + await timed.RunAsync(new FakeScriptContext(), TestContext.Current.CancellationToken)); + ex.Message.ShouldContain("ctx.Logger"); + ex.Message.ShouldContain("widening the timeout"); + } +}