Merge pull request 'Phase 7 Stream A.2 — compile cache + per-evaluation timeout wrapper' (#178) from phase-7-stream-a2-cache-timeout into v2

This commit was merged in pull request #178.
This commit is contained in:
2026-04-20 16:41:07 -04:00
4 changed files with 470 additions and 0 deletions

View File

@@ -0,0 +1,83 @@
using System.Collections.Concurrent;
using System.Security.Cryptography;
using System.Text;
namespace ZB.MOM.WW.OtOpcUa.Core.Scripting;
/// <summary>
/// Source-hash-keyed compile cache for user scripts. Roslyn compilation is the most
/// expensive step in the evaluator pipeline (5-20ms per script depending on size);
/// re-compiling on every value-change event would starve the virtual-tag engine.
/// The cache is generic on the <see cref="ScriptContext"/> subclass + result type so
/// different engines (virtual-tag / alarm-predicate / future alarm-action) each get
/// their own cache instance — there's no cross-type pollution.
/// </summary>
/// <remarks>
/// <para>
/// Concurrent-safe: <see cref="ConcurrentDictionary{TKey, TValue}"/> of
/// <see cref="Lazy{T}"/> means a miss on two threads compiles exactly once.
/// <see cref="LazyThreadSafetyMode.ExecutionAndPublication"/> guarantees other
/// threads block on the in-flight compile rather than racing to duplicate work.
/// </para>
/// <para>
/// Cache is keyed on SHA-256 of the UTF-8 bytes of the source — collision-free in
/// practice. Whitespace changes therefore miss the cache on purpose; operators
/// see re-compile time on their first evaluation after a format-only edit which
/// is rare and benign.
/// </para>
/// <para>
/// No capacity bound. Virtual-tag + alarm scripts are operator-authored and
/// bounded by config DB (typically low thousands). If that changes in v3, add an
/// LRU eviction policy — the API stays the same.
/// </para>
/// </remarks>
public sealed class CompiledScriptCache<TContext, TResult>
where TContext : ScriptContext
{
private readonly ConcurrentDictionary<string, Lazy<ScriptEvaluator<TContext, TResult>>> _cache = new();
/// <summary>
/// Return the compiled evaluator for <paramref name="scriptSource"/>, compiling
/// on first sight + reusing thereafter. If the source fails to compile, the
/// original Roslyn / sandbox exception propagates; the cache entry is removed so
/// the next call retries (useful during Admin UI authoring when the operator is
/// still fixing syntax).
/// </summary>
public ScriptEvaluator<TContext, TResult> GetOrCompile(string scriptSource)
{
if (scriptSource is null) throw new ArgumentNullException(nameof(scriptSource));
var key = HashSource(scriptSource);
var lazy = _cache.GetOrAdd(key, _ => new Lazy<ScriptEvaluator<TContext, TResult>>(
() => ScriptEvaluator<TContext, TResult>.Compile(scriptSource),
LazyThreadSafetyMode.ExecutionAndPublication));
try
{
return lazy.Value;
}
catch
{
// Failed compile — evict so a retry with corrected source can succeed.
_cache.TryRemove(key, out _);
throw;
}
}
/// <summary>Current entry count. Exposed for Admin UI diagnostics / tests.</summary>
public int Count => _cache.Count;
/// <summary>Drop every cached compile. Used on config generation publish + tests.</summary>
public void Clear() => _cache.Clear();
/// <summary>True when the exact source has been compiled at least once + is still cached.</summary>
public bool Contains(string scriptSource)
=> _cache.ContainsKey(HashSource(scriptSource));
private static string HashSource(string source)
{
var bytes = Encoding.UTF8.GetBytes(source);
var hash = SHA256.HashData(bytes);
return Convert.ToHexString(hash);
}
}

View File

@@ -0,0 +1,102 @@
namespace ZB.MOM.WW.OtOpcUa.Core.Scripting;
/// <summary>
/// Wraps a <see cref="ScriptEvaluator{TContext, TResult}"/> with a per-evaluation
/// wall-clock timeout. Default is 250ms per Phase 7 plan Stream A.4; configurable
/// per tag so deployments with slower backends can widen it.
/// </summary>
/// <remarks>
/// <para>
/// Implemented with <see cref="Task.WaitAsync(TimeSpan, CancellationToken)"/>
/// rather than a cancellation-token-only approach because Roslyn-compiled
/// scripts don't internally poll the cancellation token unless the user code
/// does async work. A CPU-bound infinite loop in a script won't honor a
/// cooperative cancel — <c>WaitAsync</c> returns control when the timeout fires
/// regardless of whether the inner task completes.
/// </para>
/// <para>
/// <b>Known limitation:</b> when a script times out, the underlying ScriptRunner
/// task continues running on a thread-pool thread until the Roslyn runtime
/// returns. In the CPU-bound-infinite-loop case that's effectively "leaked" —
/// the thread is tied up until the runtime decides to return, which it may
/// never do. Phase 7 plan Stream A.4 accepts this as a known trade-off; tighter
/// CPU budgeting would require an out-of-process script runner, which is a v3
/// concern. In practice, the timeout + structured warning log surfaces the
/// offending script so the operator can fix it; the orphan thread is rare.
/// </para>
/// <para>
/// Caller-supplied <see cref="CancellationToken"/> is honored — if the caller
/// cancels before the timeout fires, the caller's cancel wins and the
/// <see cref="OperationCanceledException"/> propagates (not wrapped as
/// <see cref="ScriptTimeoutException"/>). That distinction matters: the
/// virtual-tag engine's shutdown path cancels scripts on dispose; it shouldn't
/// see those as timeouts.
/// </para>
/// </remarks>
public sealed class TimedScriptEvaluator<TContext, TResult>
where TContext : ScriptContext
{
/// <summary>Default timeout per Phase 7 plan Stream A.4 — 250ms.</summary>
public static readonly TimeSpan DefaultTimeout = TimeSpan.FromMilliseconds(250);
private readonly ScriptEvaluator<TContext, TResult> _inner;
/// <summary>Wall-clock budget per evaluation. Script exceeding this throws <see cref="ScriptTimeoutException"/>.</summary>
public TimeSpan Timeout { get; }
public TimedScriptEvaluator(ScriptEvaluator<TContext, TResult> inner)
: this(inner, DefaultTimeout)
{
}
public TimedScriptEvaluator(ScriptEvaluator<TContext, TResult> inner, TimeSpan timeout)
{
_inner = inner ?? throw new ArgumentNullException(nameof(inner));
if (timeout <= TimeSpan.Zero)
throw new ArgumentOutOfRangeException(nameof(timeout), "Timeout must be positive.");
Timeout = timeout;
}
public async Task<TResult> RunAsync(TContext context, CancellationToken ct = default)
{
if (context is null) throw new ArgumentNullException(nameof(context));
// Push evaluation to a thread-pool thread so a CPU-bound script (e.g. a tight
// loop with no async work) doesn't hog the caller's thread before WaitAsync
// gets to register its timeout. Without this, Roslyn's ScriptRunner executes
// synchronously on the calling thread and returns an already-completed Task,
// so WaitAsync sees a completed task and never fires the timeout.
var runTask = Task.Run(() => _inner.RunAsync(context, ct), ct);
try
{
return await runTask.WaitAsync(Timeout, ct).ConfigureAwait(false);
}
catch (TimeoutException)
{
// WaitAsync's synthesized timeout — the inner task may still be running
// on its thread-pool thread (known leak documented in the class summary).
// Wrap so callers can distinguish from user-written timeout logic.
throw new ScriptTimeoutException(Timeout);
}
}
}
/// <summary>
/// Thrown when a script evaluation exceeds its configured timeout. The virtual-tag
/// engine (Stream B) catches this + maps the owning tag's quality to
/// <c>BadInternalError</c> per Phase 7 plan decision #11, logging a structured
/// warning with the offending script name so operators can locate + fix it.
/// </summary>
public sealed class ScriptTimeoutException : Exception
{
public TimeSpan Timeout { get; }
public ScriptTimeoutException(TimeSpan timeout)
: base($"Script evaluation exceeded the configured timeout of {timeout.TotalMilliseconds:F1} ms. " +
"The script was either CPU-bound or blocked on a slow operation; check ctx.Logger output " +
"around the timeout and consider widening the timeout per tag, simplifying the script, or " +
"moving heavy work out of the evaluation path.")
{
Timeout = timeout;
}
}

View File

@@ -0,0 +1,151 @@
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Core.Scripting;
namespace ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests;
/// <summary>
/// Exercises the source-hash keyed compile cache. Roslyn compilation is the most
/// expensive step in the evaluator pipeline; this cache collapses redundant
/// compiles of unchanged scripts to zero-cost lookups + makes sure concurrent
/// callers never double-compile.
/// </summary>
[Trait("Category", "Unit")]
public sealed class CompiledScriptCacheTests
{
private sealed class CompileCountingGate
{
public int Count;
}
[Fact]
public void First_call_compiles_and_caches()
{
var cache = new CompiledScriptCache<FakeScriptContext, int>();
cache.Count.ShouldBe(0);
var e = cache.GetOrCompile("""return 42;""");
e.ShouldNotBeNull();
cache.Count.ShouldBe(1);
cache.Contains("""return 42;""").ShouldBeTrue();
}
[Fact]
public void Identical_source_returns_the_same_compiled_evaluator()
{
var cache = new CompiledScriptCache<FakeScriptContext, int>();
var first = cache.GetOrCompile("""return 1;""");
var second = cache.GetOrCompile("""return 1;""");
ReferenceEquals(first, second).ShouldBeTrue();
cache.Count.ShouldBe(1);
}
[Fact]
public void Different_source_produces_different_evaluator()
{
var cache = new CompiledScriptCache<FakeScriptContext, int>();
var a = cache.GetOrCompile("""return 1;""");
var b = cache.GetOrCompile("""return 2;""");
ReferenceEquals(a, b).ShouldBeFalse();
cache.Count.ShouldBe(2);
}
[Fact]
public void Whitespace_difference_misses_cache()
{
// Documented behavior: reformatting a script recompiles. Simpler + cheaper
// than the alternative (AST-canonicalize then hash) and doesn't happen often.
var cache = new CompiledScriptCache<FakeScriptContext, int>();
cache.GetOrCompile("""return 1;""");
cache.GetOrCompile("return 1; "); // trailing whitespace — different hash
cache.Count.ShouldBe(2);
}
[Fact]
public async Task Cached_evaluator_still_runs_correctly()
{
var cache = new CompiledScriptCache<FakeScriptContext, double>();
var e = cache.GetOrCompile("""return (double)ctx.GetTag("In").Value * 3.0;""");
var ctx = new FakeScriptContext().Seed("In", 7.0);
// Run twice through the cache — both must return the same correct value.
var first = await e.RunAsync(ctx, TestContext.Current.CancellationToken);
var second = await cache.GetOrCompile("""return (double)ctx.GetTag("In").Value * 3.0;""")
.RunAsync(ctx, TestContext.Current.CancellationToken);
first.ShouldBe(21.0);
second.ShouldBe(21.0);
}
[Fact]
public void Failed_compile_is_evicted_so_retry_with_corrected_source_works()
{
var cache = new CompiledScriptCache<FakeScriptContext, int>();
// First attempt — undefined identifier, compile throws.
Should.Throw<Exception>(() => cache.GetOrCompile("""return unknownIdentifier + 1;"""));
cache.Count.ShouldBe(0, "failed compile must be evicted so retry can re-attempt");
// Retry with corrected source succeeds + caches.
cache.GetOrCompile("""return 42;""").ShouldNotBeNull();
cache.Count.ShouldBe(1);
}
[Fact]
public void Clear_drops_every_entry()
{
var cache = new CompiledScriptCache<FakeScriptContext, int>();
cache.GetOrCompile("""return 1;""");
cache.GetOrCompile("""return 2;""");
cache.Count.ShouldBe(2);
cache.Clear();
cache.Count.ShouldBe(0);
cache.Contains("""return 1;""").ShouldBeFalse();
}
[Fact]
public void Concurrent_compiles_of_the_same_source_deduplicate()
{
// LazyThreadSafetyMode.ExecutionAndPublication guarantees only one compile
// even when multiple threads race GetOrCompile against an empty cache.
// We can't directly count Roslyn compilations — but we can assert all
// concurrent callers see the same evaluator instance.
var cache = new CompiledScriptCache<FakeScriptContext, int>();
const string src = """return 99;""";
var tasks = Enumerable.Range(0, 20)
.Select(_ => Task.Run(() => cache.GetOrCompile(src)))
.ToArray();
Task.WhenAll(tasks).GetAwaiter().GetResult();
var firstInstance = tasks[0].Result;
foreach (var t in tasks)
ReferenceEquals(t.Result, firstInstance).ShouldBeTrue();
cache.Count.ShouldBe(1);
}
[Fact]
public void Different_TContext_TResult_pairs_use_separate_cache_instances()
{
// Documented: each engine (virtual-tag / alarm-predicate / alarm-action) owns
// its own cache. The type-parametric design makes this the default without
// cross-contamination at the dictionary level.
var intCache = new CompiledScriptCache<FakeScriptContext, int>();
var boolCache = new CompiledScriptCache<FakeScriptContext, bool>();
intCache.GetOrCompile("""return 1;""");
boolCache.GetOrCompile("""return true;""");
intCache.Count.ShouldBe(1);
boolCache.Count.ShouldBe(1);
intCache.Contains("""return true;""").ShouldBeFalse();
boolCache.Contains("""return 1;""").ShouldBeFalse();
}
[Fact]
public void Null_source_throws_ArgumentNullException()
{
var cache = new CompiledScriptCache<FakeScriptContext, int>();
Should.Throw<ArgumentNullException>(() => cache.GetOrCompile(null!));
}
}

View File

@@ -0,0 +1,134 @@
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Core.Scripting;
namespace ZB.MOM.WW.OtOpcUa.Core.Scripting.Tests;
/// <summary>
/// Verifies the per-evaluation timeout wrapper. Fast scripts complete normally;
/// CPU-bound or hung scripts throw <see cref="ScriptTimeoutException"/> instead of
/// starving the engine. Caller-supplied cancellation tokens take precedence over the
/// timeout so driver-shutdown paths see a clean cancel rather than a timeout.
/// </summary>
[Trait("Category", "Unit")]
public sealed class TimedScriptEvaluatorTests
{
[Fact]
public async Task Fast_script_completes_under_timeout_and_returns_value()
{
var inner = ScriptEvaluator<FakeScriptContext, double>.Compile(
"""return (double)ctx.GetTag("In").Value + 1.0;""");
var timed = new TimedScriptEvaluator<FakeScriptContext, double>(
inner, TimeSpan.FromSeconds(1));
var ctx = new FakeScriptContext().Seed("In", 41.0);
var result = await timed.RunAsync(ctx, TestContext.Current.CancellationToken);
result.ShouldBe(42.0);
}
[Fact]
public async Task Script_longer_than_timeout_throws_ScriptTimeoutException()
{
// Scripts can't easily do Thread.Sleep in the sandbox (System.Threading.Thread
// is denied). But a tight CPU loop exceeds any short timeout.
var inner = ScriptEvaluator<FakeScriptContext, int>.Compile(
"""
var end = Environment.TickCount64 + 5000;
while (Environment.TickCount64 < end) { }
return 1;
""");
var timed = new TimedScriptEvaluator<FakeScriptContext, int>(
inner, TimeSpan.FromMilliseconds(50));
var ex = await Should.ThrowAsync<ScriptTimeoutException>(async () =>
await timed.RunAsync(new FakeScriptContext(), TestContext.Current.CancellationToken));
ex.Timeout.ShouldBe(TimeSpan.FromMilliseconds(50));
ex.Message.ShouldContain("50.0");
}
[Fact]
public async Task Caller_cancellation_takes_precedence_over_timeout()
{
// A CPU-bound script that would otherwise timeout; external ct fires first.
// Expected: OperationCanceledException (not ScriptTimeoutException) so shutdown
// paths aren't misclassified as timeouts.
var inner = ScriptEvaluator<FakeScriptContext, int>.Compile(
"""
var end = Environment.TickCount64 + 10000;
while (Environment.TickCount64 < end) { }
return 1;
""");
var timed = new TimedScriptEvaluator<FakeScriptContext, int>(
inner, TimeSpan.FromSeconds(5));
using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(80));
await Should.ThrowAsync<OperationCanceledException>(async () =>
await timed.RunAsync(new FakeScriptContext(), cts.Token));
}
[Fact]
public void Default_timeout_is_250ms_per_plan()
{
TimedScriptEvaluator<FakeScriptContext, int>.DefaultTimeout
.ShouldBe(TimeSpan.FromMilliseconds(250));
}
[Fact]
public void Zero_or_negative_timeout_is_rejected_at_construction()
{
var inner = ScriptEvaluator<FakeScriptContext, int>.Compile("""return 1;""");
Should.Throw<ArgumentOutOfRangeException>(() =>
new TimedScriptEvaluator<FakeScriptContext, int>(inner, TimeSpan.Zero));
Should.Throw<ArgumentOutOfRangeException>(() =>
new TimedScriptEvaluator<FakeScriptContext, int>(inner, TimeSpan.FromMilliseconds(-1)));
}
[Fact]
public void Null_inner_is_rejected()
{
Should.Throw<ArgumentNullException>(() =>
new TimedScriptEvaluator<FakeScriptContext, int>(null!));
}
[Fact]
public void Null_context_is_rejected()
{
var inner = ScriptEvaluator<FakeScriptContext, int>.Compile("""return 1;""");
var timed = new TimedScriptEvaluator<FakeScriptContext, int>(inner);
Should.ThrowAsync<ArgumentNullException>(async () =>
await timed.RunAsync(null!, TestContext.Current.CancellationToken));
}
[Fact]
public async Task Script_exception_propagates_unwrapped()
{
// User-thrown exceptions must come through as-is — NOT wrapped in
// ScriptTimeoutException. The virtual-tag engine catches them per-tag and
// maps to BadInternalError; conflating with timeout would lose that info.
var inner = ScriptEvaluator<FakeScriptContext, int>.Compile(
"""throw new InvalidOperationException("script boom");""");
var timed = new TimedScriptEvaluator<FakeScriptContext, int>(inner, TimeSpan.FromSeconds(1));
var ex = await Should.ThrowAsync<InvalidOperationException>(async () =>
await timed.RunAsync(new FakeScriptContext(), TestContext.Current.CancellationToken));
ex.Message.ShouldBe("script boom");
}
[Fact]
public async Task ScriptTimeoutException_message_points_at_diagnostic_path()
{
var inner = ScriptEvaluator<FakeScriptContext, int>.Compile(
"""
var end = Environment.TickCount64 + 5000;
while (Environment.TickCount64 < end) { }
return 1;
""");
var timed = new TimedScriptEvaluator<FakeScriptContext, int>(
inner, TimeSpan.FromMilliseconds(30));
var ex = await Should.ThrowAsync<ScriptTimeoutException>(async () =>
await timed.RunAsync(new FakeScriptContext(), TestContext.Current.CancellationToken));
ex.Message.ShouldContain("ctx.Logger");
ex.Message.ShouldContain("widening the timeout");
}
}