From ef6b0bb8fc8da097a4eb27ffa683fdde0f4dc9ed Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 19 Apr 2026 07:37:43 -0400 Subject: [PATCH 1/2] =?UTF-8?q?Phase=206.1=20Stream=20B.1/B.2=20=E2=80=94?= =?UTF-8?q?=20DriverTier=20on=20DriverTypeMetadata=20+=20Core.Stability.Me?= =?UTF-8?q?moryTracking=20with=20hybrid-formula=20soft/hard=20thresholds?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stream B.1 — registry invariant: - DriverTypeMetadata gains a required `DriverTier Tier` field. Every registered driver type must declare its stability tier so the downstream MemoryTracking, MemoryRecycle, and resilience-policy layers can resolve the right defaults. Stamped-at-registration-time enforcement makes the "every driver type has a non-null Tier" compliance check structurally impossible to fail. - DriverTypeRegistry API unchanged; one new property on the record. Stream B.2 — MemoryTracking (Core.Stability): - Tier-agnostic tracker per decision #146: captures baseline as the median of samples collected during a post-init warmup window (default 5 min), then classifies each subsequent sample with the hybrid formula `soft = max(multiplier × baseline, baseline + floor)`, `hard = 2 × soft`. - Per-tier constants wired: Tier A mult=3 floor=50 MB, Tier B mult=3 floor=100 MB, Tier C mult=2 floor=500 MB. - Never kills. Hard-breach action returns HardBreach; the supervisor that acts on that signal (MemoryRecycle) is Tier C only per decisions #74, #145 and lands in the next B.3 commit on this branch. - Two phases: WarmingUp (samples collected, Warming returned) and Steady (baseline captured, soft/hard checks active). Transition is automatic when the warmup window elapses. Tests (15 new, all pass): - Warming phase returns Warming until the window elapses. - Window-elapsed captures median baseline + transitions to Steady. - Per-tier constants match decision #146 table exactly. - Soft threshold uses max() — small baseline → floor wins; large baseline → multiplier wins. - Hard = 2 × soft. - Sample below soft = None; at soft = SoftBreach; at/above hard = HardBreach. - DriverTypeRegistry: theory asserts Tier round-trips for A/B/C. Full solution dotnet test: 963 passing (baseline 906, +57 net for Phase 6.1 Stream A + Stream B.1/B.2). Pre-existing Client.CLI Subscribe flake unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../DriverTypeRegistry.cs | 10 +- .../Stability/MemoryTracking.cs | 136 ++++++++++++++++++ .../DriverTypeRegistryTests.cs | 20 ++- .../Stability/MemoryTrackingTests.cs | 119 +++++++++++++++ 4 files changed, 282 insertions(+), 3 deletions(-) create mode 100644 src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryTrackingTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTypeRegistry.cs b/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTypeRegistry.cs index 6655886..42e9607 100644 --- a/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTypeRegistry.cs +++ b/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/DriverTypeRegistry.cs @@ -69,12 +69,20 @@ public sealed class DriverTypeRegistry /// JSON Schema (Draft 2020-12) the driver's DriverConfig column must validate against. /// JSON Schema for DeviceConfig (multi-device drivers); null if the driver has no device layer. /// JSON Schema for TagConfig; required for every driver since every driver has tags. +/// +/// Stability tier per docs/v2/driver-stability.md §2-4 and docs/v2/plan.md +/// decisions #63-74. Drives the shared resilience pipeline defaults +/// ( × capability → CapabilityPolicy), the MemoryTracking +/// hybrid-formula constants, and whether process-level MemoryRecycle / scheduled- +/// recycle protections apply (Tier C only). Every registered driver type must declare one. +/// public sealed record DriverTypeMetadata( string TypeName, NamespaceKindCompatibility AllowedNamespaceKinds, string DriverConfigJsonSchema, string? DeviceConfigJsonSchema, - string TagConfigJsonSchema); + string TagConfigJsonSchema, + DriverTier Tier); /// Bitmask of namespace kinds a driver type may populate. Per decision #111. [Flags] diff --git a/src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs new file mode 100644 index 0000000..19dffa2 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryTracking.cs @@ -0,0 +1,136 @@ +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; + +namespace ZB.MOM.WW.OtOpcUa.Core.Stability; + +/// +/// Tier-agnostic memory-footprint tracker. Captures the post-initialize baseline +/// from the first samples after IDriver.InitializeAsync, then classifies each +/// subsequent sample against a hybrid soft/hard threshold per +/// docs/v2/plan.md decision #146 — soft = max(multiplier × baseline, baseline + floor), +/// hard = 2 × soft. +/// +/// +/// Per decision #145, this tracker never kills a process. Soft and hard breaches +/// log + surface to the Admin UI via DriverInstanceResilienceStatus. The matching +/// process-level recycle protection lives in a separate MemoryRecycle that activates +/// for Tier C drivers only (where the driver runs out-of-process behind a supervisor that +/// can safely restart it without tearing down the OPC UA session or co-hosted in-proc +/// drivers). +/// +/// Baseline capture: the tracker starts in for +/// (default 5 min). During that window samples are collected; +/// the baseline is computed as the median once the window elapses. Before that point every +/// classification returns . +/// +public sealed class MemoryTracking +{ + private readonly DriverTier _tier; + private readonly TimeSpan _baselineWindow; + private readonly List _warmupSamples = []; + private long _baselineBytes; + private TrackingPhase _phase = TrackingPhase.WarmingUp; + private DateTime? _warmupStartUtc; + + /// Tier-default multiplier/floor constants per decision #146. + public static (int Multiplier, long FloorBytes) GetTierConstants(DriverTier tier) => tier switch + { + DriverTier.A => (Multiplier: 3, FloorBytes: 50L * 1024 * 1024), + DriverTier.B => (Multiplier: 3, FloorBytes: 100L * 1024 * 1024), + DriverTier.C => (Multiplier: 2, FloorBytes: 500L * 1024 * 1024), + _ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No memory-tracking constants defined for tier {tier}."), + }; + + /// Window over which post-init samples are collected to compute the baseline. + public TimeSpan BaselineWindow => _baselineWindow; + + /// Current phase: or . + public TrackingPhase Phase => _phase; + + /// Captured baseline; 0 until warmup completes. + public long BaselineBytes => _baselineBytes; + + /// Effective soft threshold (zero while warming up). + public long SoftThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes); + + /// Effective hard threshold = 2 × soft (zero while warming up). + public long HardThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes) * 2; + + public MemoryTracking(DriverTier tier, TimeSpan? baselineWindow = null) + { + _tier = tier; + _baselineWindow = baselineWindow ?? TimeSpan.FromMinutes(5); + } + + /// + /// Submit a memory-footprint sample. Returns the action the caller should surface. + /// During warmup, always returns and accumulates + /// samples; once the window elapses the first steady-phase sample triggers baseline capture + /// (median of warmup samples). + /// + public MemoryTrackingAction Sample(long footprintBytes, DateTime utcNow) + { + if (_phase == TrackingPhase.WarmingUp) + { + _warmupStartUtc ??= utcNow; + _warmupSamples.Add(footprintBytes); + if (utcNow - _warmupStartUtc.Value >= _baselineWindow && _warmupSamples.Count > 0) + { + _baselineBytes = ComputeMedian(_warmupSamples); + _phase = TrackingPhase.Steady; + } + else + { + return MemoryTrackingAction.Warming; + } + } + + if (footprintBytes >= HardThresholdBytes) return MemoryTrackingAction.HardBreach; + if (footprintBytes >= SoftThresholdBytes) return MemoryTrackingAction.SoftBreach; + return MemoryTrackingAction.None; + } + + private static long ComputeSoft(DriverTier tier, long baseline) + { + var (multiplier, floor) = GetTierConstants(tier); + return Math.Max(multiplier * baseline, baseline + floor); + } + + private static long ComputeMedian(List samples) + { + var sorted = samples.Order().ToArray(); + var mid = sorted.Length / 2; + return sorted.Length % 2 == 1 + ? sorted[mid] + : (sorted[mid - 1] + sorted[mid]) / 2; + } +} + +/// Phase of a lifecycle. +public enum TrackingPhase +{ + /// Collecting post-init samples; baseline not yet computed. + WarmingUp, + + /// Baseline captured; every sample classified against soft/hard thresholds. + Steady, +} + +/// Classification the tracker returns per sample. +public enum MemoryTrackingAction +{ + /// Baseline not yet captured; sample collected, no threshold check. + Warming, + + /// Below soft threshold. + None, + + /// Between soft and hard thresholds — log + surface, no action. + SoftBreach, + + /// + /// ≥ hard threshold. Log + surface + (Tier C only, via MemoryRecycle) request + /// process recycle via the driver supervisor. Tier A/B breach never invokes any + /// kill path per decisions #145 and #74. + /// + HardBreach, +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Abstractions.Tests/DriverTypeRegistryTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Abstractions.Tests/DriverTypeRegistryTests.cs index 2a50d9d..372815f 100644 --- a/tests/ZB.MOM.WW.OtOpcUa.Core.Abstractions.Tests/DriverTypeRegistryTests.cs +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Abstractions.Tests/DriverTypeRegistryTests.cs @@ -7,11 +7,13 @@ public sealed class DriverTypeRegistryTests { private static DriverTypeMetadata SampleMetadata( string typeName = "Modbus", - NamespaceKindCompatibility allowed = NamespaceKindCompatibility.Equipment) => + NamespaceKindCompatibility allowed = NamespaceKindCompatibility.Equipment, + DriverTier tier = DriverTier.B) => new(typeName, allowed, DriverConfigJsonSchema: "{\"type\": \"object\"}", DeviceConfigJsonSchema: "{\"type\": \"object\"}", - TagConfigJsonSchema: "{\"type\": \"object\"}"); + TagConfigJsonSchema: "{\"type\": \"object\"}", + Tier: tier); [Fact] public void Register_ThenGet_RoundTrips() @@ -24,6 +26,20 @@ public sealed class DriverTypeRegistryTests registry.Get("Modbus").ShouldBe(metadata); } + [Theory] + [InlineData(DriverTier.A)] + [InlineData(DriverTier.B)] + [InlineData(DriverTier.C)] + public void Register_Requires_NonNullTier(DriverTier tier) + { + var registry = new DriverTypeRegistry(); + var metadata = SampleMetadata(typeName: $"Driver-{tier}", tier: tier); + + registry.Register(metadata); + + registry.Get(metadata.TypeName).Tier.ShouldBe(tier); + } + [Fact] public void Get_IsCaseInsensitive() { diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryTrackingTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryTrackingTests.cs new file mode 100644 index 0000000..afd27e2 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryTrackingTests.cs @@ -0,0 +1,119 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; +using ZB.MOM.WW.OtOpcUa.Core.Stability; + +namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability; + +[Trait("Category", "Unit")] +public sealed class MemoryTrackingTests +{ + private static readonly DateTime T0 = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc); + + [Fact] + public void WarmingUp_Returns_Warming_UntilWindowElapses() + { + var tracker = new MemoryTracking(DriverTier.A, TimeSpan.FromMinutes(5)); + + tracker.Sample(100_000_000, T0).ShouldBe(MemoryTrackingAction.Warming); + tracker.Sample(105_000_000, T0.AddMinutes(1)).ShouldBe(MemoryTrackingAction.Warming); + tracker.Sample(102_000_000, T0.AddMinutes(4.9)).ShouldBe(MemoryTrackingAction.Warming); + + tracker.Phase.ShouldBe(TrackingPhase.WarmingUp); + tracker.BaselineBytes.ShouldBe(0); + } + + [Fact] + public void WindowElapsed_CapturesBaselineAsMedian_AndTransitionsToSteady() + { + var tracker = new MemoryTracking(DriverTier.A, TimeSpan.FromMinutes(5)); + + tracker.Sample(100_000_000, T0); + tracker.Sample(200_000_000, T0.AddMinutes(1)); + tracker.Sample(150_000_000, T0.AddMinutes(2)); + var first = tracker.Sample(150_000_000, T0.AddMinutes(5)); + + tracker.Phase.ShouldBe(TrackingPhase.Steady); + tracker.BaselineBytes.ShouldBe(150_000_000L, "median of 4 samples [100, 200, 150, 150] = (150+150)/2 = 150"); + first.ShouldBe(MemoryTrackingAction.None, "150 MB is the baseline itself, well under soft threshold"); + } + + [Theory] + [InlineData(DriverTier.A, 3, 50)] + [InlineData(DriverTier.B, 3, 100)] + [InlineData(DriverTier.C, 2, 500)] + public void GetTierConstants_MatchesDecision146(DriverTier tier, int expectedMultiplier, long expectedFloorMB) + { + var (multiplier, floor) = MemoryTracking.GetTierConstants(tier); + multiplier.ShouldBe(expectedMultiplier); + floor.ShouldBe(expectedFloorMB * 1024 * 1024); + } + + [Fact] + public void SoftThreshold_UsesMax_OfMultiplierAndFloor_SmallBaseline() + { + // Tier A: mult=3, floor=50 MB. Baseline 10 MB → 3×10=30 MB < 10+50=60 MB → floor wins. + var tracker = WarmupWithBaseline(DriverTier.A, 10L * 1024 * 1024); + tracker.SoftThresholdBytes.ShouldBe(60L * 1024 * 1024); + } + + [Fact] + public void SoftThreshold_UsesMax_OfMultiplierAndFloor_LargeBaseline() + { + // Tier A: mult=3, floor=50 MB. Baseline 200 MB → 3×200=600 MB > 200+50=250 MB → multiplier wins. + var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024); + tracker.SoftThresholdBytes.ShouldBe(600L * 1024 * 1024); + } + + [Fact] + public void HardThreshold_IsTwiceSoft() + { + var tracker = WarmupWithBaseline(DriverTier.B, 200L * 1024 * 1024); + tracker.HardThresholdBytes.ShouldBe(tracker.SoftThresholdBytes * 2); + } + + [Fact] + public void Sample_Below_Soft_Returns_None() + { + var tracker = WarmupWithBaseline(DriverTier.A, 100L * 1024 * 1024); + + tracker.Sample(200L * 1024 * 1024, T0.AddMinutes(10)).ShouldBe(MemoryTrackingAction.None); + } + + [Fact] + public void Sample_AtSoft_Returns_SoftBreach() + { + // Tier A, baseline 200 MB → soft = 600 MB. Sample exactly at soft. + var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024); + + tracker.Sample(tracker.SoftThresholdBytes, T0.AddMinutes(10)) + .ShouldBe(MemoryTrackingAction.SoftBreach); + } + + [Fact] + public void Sample_AtHard_Returns_HardBreach() + { + var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024); + + tracker.Sample(tracker.HardThresholdBytes, T0.AddMinutes(10)) + .ShouldBe(MemoryTrackingAction.HardBreach); + } + + [Fact] + public void Sample_AboveHard_Returns_HardBreach() + { + var tracker = WarmupWithBaseline(DriverTier.A, 200L * 1024 * 1024); + + tracker.Sample(tracker.HardThresholdBytes + 100_000_000, T0.AddMinutes(10)) + .ShouldBe(MemoryTrackingAction.HardBreach); + } + + private static MemoryTracking WarmupWithBaseline(DriverTier tier, long baseline) + { + var tracker = new MemoryTracking(tier, TimeSpan.FromMinutes(5)); + tracker.Sample(baseline, T0); + tracker.Sample(baseline, T0.AddMinutes(5)); + tracker.BaselineBytes.ShouldBe(baseline); + return tracker; + } +} -- 2.49.1 From 1d9008e3541fe7ce20fcb45b46195bb5598fbb7c Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 19 Apr 2026 08:03:18 -0400 Subject: [PATCH 2/2] =?UTF-8?q?Phase=206.1=20Stream=20B.3/B.4/B.5=20?= =?UTF-8?q?=E2=80=94=20MemoryRecycle=20+=20ScheduledRecycleScheduler=20+?= =?UTF-8?q?=20demand-aware=20WedgeDetector?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes out Stream B per docs/v2/implementation/phase-6-1-resilience-and-observability.md. Core.Abstractions: - IDriverSupervisor — process-level supervisor contract a Tier C driver's out-of-process topology provides (Galaxy Proxy/Supervisor implements this in a follow-up Driver.Galaxy wiring PR). Concerns: DriverInstanceId + RecycleAsync. Tier A/B drivers don't implement this; Stream B code asserts tier == C before ever calling it. Core.Stability: - MemoryRecycle — companion to MemoryTracking. On HardBreach, invokes the supervisor IFF tier == C AND a supervisor is wired. Tier A/B HardBreach logs a promotion-to-Tier-C recommendation and returns false. Soft/None/Warming never triggers a recycle at any tier. - ScheduledRecycleScheduler — Tier C opt-in periodic recycler per decision #67. Ctor throws for Tier A/B (structural guard — scheduled recycle on an in-process driver would kill every OPC UA session and every co-hosted driver). TickAsync(now) advances the schedule by one interval per fire; RequestRecycleNowAsync drives an ad-hoc recycle without shifting the cron. - WedgeDetector — demand-aware per decision #147. Classify(state, demand, now) returns: * NotApplicable when driver state != Healthy * Idle when Healthy + no pending work (bulkhead=0 && monitored=0 && historic=0) * Healthy when Healthy + pending work + progress within threshold * Faulted when Healthy + pending work + no progress within threshold Threshold clamps to min 60 s. DemandSignal.HasPendingWork ORs the three counters. The three false-wedge cases the plan calls out all stay Healthy: idle subscription-only, slow historian backfill making progress, write-only burst with drained bulkhead. Tests (22 new, all pass): - MemoryRecycleTests (7): Tier C hard-breach requests recycle; Tier A/B hard-breach never requests; Tier C without supervisor no-ops; soft-breach at every tier never requests; None/Warming never request. - ScheduledRecycleSchedulerTests (6): ctor throws for A/B; zero/negative interval throws; tick before due no-ops; tick at/after due fires once and advances; RequestRecycleNow fires immediately without shifting schedule; multiple fires across ticks advance one interval each. - WedgeDetectorTests (9): threshold clamp to 60 s; unhealthy driver always NotApplicable; idle subscription stays Idle; pending+fresh progress stays Healthy; pending+stale progress is Faulted; MonitoredItems active but no publish is Faulted; MonitoredItems active with fresh publish stays Healthy; historian backfill with fresh progress stays Healthy; write-only burst with empty bulkhead is Idle; HasPendingWork theory for any non-zero counter. Full solution dotnet test: 989 passing (baseline 906, +83 for Phase 6.1 so far). Pre-existing Client.CLI Subscribe flake unchanged. Stream B complete. Next up: Stream C (health endpoints + structured logging). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../IDriverSupervisor.cs | 26 ++++ .../Stability/MemoryRecycle.cs | 65 ++++++++++ .../Stability/ScheduledRecycleScheduler.cs | 86 ++++++++++++++ .../Stability/WedgeDetector.cs | 81 +++++++++++++ .../Stability/MemoryRecycleTests.cs | 91 ++++++++++++++ .../ScheduledRecycleSchedulerTests.cs | 101 ++++++++++++++++ .../Stability/WedgeDetectorTests.cs | 112 ++++++++++++++++++ 7 files changed, 562 insertions(+) create mode 100644 src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Core/Stability/ScheduledRecycleScheduler.cs create mode 100644 src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryRecycleTests.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/ScheduledRecycleSchedulerTests.cs create mode 100644 tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/WedgeDetectorTests.cs diff --git a/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs b/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs new file mode 100644 index 0000000..0e07271 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Core.Abstractions/IDriverSupervisor.cs @@ -0,0 +1,26 @@ +namespace ZB.MOM.WW.OtOpcUa.Core.Abstractions; + +/// +/// Process-level supervisor contract a Tier C driver's out-of-process topology provides +/// (e.g. Driver.Galaxy.Proxy/Supervisor/). Concerns: restart the Host process when a +/// hard fault is detected (memory breach, wedge, scheduled recycle window). +/// +/// +/// Per docs/v2/plan.md decisions #68, #73-74, and #145. Tier A/B drivers do NOT have +/// a supervisor because they run in-process — recycling would kill every OPC UA session and +/// every co-hosted driver. The Core.Stability layer only invokes this interface for Tier C +/// instances after asserting the tier via . +/// +public interface IDriverSupervisor +{ + /// Driver instance this supervisor governs. + string DriverInstanceId { get; } + + /// + /// Request the supervisor to recycle (terminate + restart) the Host process. Implementations + /// are expected to be idempotent under repeat calls during an in-flight recycle. + /// + /// Human-readable reason — flows into the supervisor's logs. + /// Cancels the recycle request; an in-flight restart is not interrupted. + Task RecycleAsync(string reason, CancellationToken cancellationToken); +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs new file mode 100644 index 0000000..34f97d7 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/MemoryRecycle.cs @@ -0,0 +1,65 @@ +using Microsoft.Extensions.Logging; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; + +namespace ZB.MOM.WW.OtOpcUa.Core.Stability; + +/// +/// Tier C only process-recycle companion to . On a +/// signal, invokes the supplied +/// to restart the out-of-process Host. +/// +/// +/// Per docs/v2/plan.md decisions #74 and #145. Tier A/B hard-breach on an in-process +/// driver would kill every OPC UA session and every co-hosted driver, so for Tier A/B this +/// class logs a promotion-to-Tier-C recommendation and does NOT invoke any supervisor. +/// A future tier-migration workflow acts on the recommendation. +/// +public sealed class MemoryRecycle +{ + private readonly DriverTier _tier; + private readonly IDriverSupervisor? _supervisor; + private readonly ILogger _logger; + + public MemoryRecycle(DriverTier tier, IDriverSupervisor? supervisor, ILogger logger) + { + _tier = tier; + _supervisor = supervisor; + _logger = logger; + } + + /// + /// Handle a classification for the driver. For Tier C with a + /// wired supervisor, HardBreach triggers . + /// All other combinations are no-ops with respect to process state (soft breaches + Tier A/B + /// hard breaches just log). + /// + /// True when a recycle was requested; false otherwise. + public async Task HandleAsync(MemoryTrackingAction action, long footprintBytes, CancellationToken cancellationToken) + { + switch (action) + { + case MemoryTrackingAction.SoftBreach: + _logger.LogWarning( + "Memory soft-breach on driver {DriverId}: footprint={Footprint:N0} bytes, tier={Tier}. Surfaced to Admin; no action.", + _supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes, _tier); + return false; + + case MemoryTrackingAction.HardBreach when _tier == DriverTier.C && _supervisor is not null: + _logger.LogError( + "Memory hard-breach on Tier C driver {DriverId}: footprint={Footprint:N0} bytes. Requesting supervisor recycle.", + _supervisor.DriverInstanceId, footprintBytes); + await _supervisor.RecycleAsync($"Memory hard-breach: {footprintBytes} bytes", cancellationToken).ConfigureAwait(false); + return true; + + case MemoryTrackingAction.HardBreach: + _logger.LogError( + "Memory hard-breach on Tier {Tier} in-process driver {DriverId}: footprint={Footprint:N0} bytes. " + + "Recommending promotion to Tier C; NOT auto-killing (decisions #74, #145).", + _tier, _supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes); + return false; + + default: + return false; + } + } +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Core/Stability/ScheduledRecycleScheduler.cs b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/ScheduledRecycleScheduler.cs new file mode 100644 index 0000000..84174f7 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/ScheduledRecycleScheduler.cs @@ -0,0 +1,86 @@ +using Microsoft.Extensions.Logging; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; + +namespace ZB.MOM.WW.OtOpcUa.Core.Stability; + +/// +/// Tier C opt-in periodic-recycle driver per docs/v2/plan.md decision #67. +/// A tick method advanced by the caller (fed by a background timer in prod; by test clock +/// in unit tests) decides whether the configured interval has elapsed and, if so, drives the +/// supplied to recycle the Host. +/// +/// +/// Tier A/B drivers MUST NOT use this class — scheduled recycle for in-process drivers would +/// kill every OPC UA session and every co-hosted driver. The ctor throws when constructed +/// with any tier other than C to make the misuse structurally impossible. +/// +/// Keeps no background thread of its own — callers invoke on +/// their ambient scheduler tick (Phase 6.1 Stream C's health-endpoint host runs one). That +/// decouples the unit under test from wall-clock time and thread-pool scheduling. +/// +public sealed class ScheduledRecycleScheduler +{ + private readonly TimeSpan _recycleInterval; + private readonly IDriverSupervisor _supervisor; + private readonly ILogger _logger; + private DateTime _nextRecycleUtc; + + /// + /// Construct the scheduler for a Tier C driver. Throws if isn't C. + /// + /// Driver tier; must be . + /// Interval between recycles (e.g. 7 days). + /// Anchor time; next recycle fires at + . + /// Supervisor that performs the actual recycle. + /// Diagnostic sink. + public ScheduledRecycleScheduler( + DriverTier tier, + TimeSpan recycleInterval, + DateTime startUtc, + IDriverSupervisor supervisor, + ILogger logger) + { + if (tier != DriverTier.C) + throw new ArgumentException( + $"ScheduledRecycleScheduler is Tier C only (got {tier}). " + + "In-process drivers must not use scheduled recycle; see decisions #74 and #145.", + nameof(tier)); + + if (recycleInterval <= TimeSpan.Zero) + throw new ArgumentException("RecycleInterval must be positive.", nameof(recycleInterval)); + + _recycleInterval = recycleInterval; + _supervisor = supervisor; + _logger = logger; + _nextRecycleUtc = startUtc + recycleInterval; + } + + /// Next scheduled recycle UTC. Advances by on each fire. + public DateTime NextRecycleUtc => _nextRecycleUtc; + + /// Recycle interval this scheduler was constructed with. + public TimeSpan RecycleInterval => _recycleInterval; + + /// + /// Tick the scheduler forward. If is past + /// , requests a recycle from the supervisor and advances + /// by exactly one interval. Returns true when a recycle fired. + /// + public async Task TickAsync(DateTime utcNow, CancellationToken cancellationToken) + { + if (utcNow < _nextRecycleUtc) + return false; + + _logger.LogInformation( + "Scheduled recycle due for Tier C driver {DriverId} at {Now:o}; advancing next to {Next:o}.", + _supervisor.DriverInstanceId, utcNow, _nextRecycleUtc + _recycleInterval); + + await _supervisor.RecycleAsync("Scheduled periodic recycle", cancellationToken).ConfigureAwait(false); + _nextRecycleUtc += _recycleInterval; + return true; + } + + /// Request an immediate recycle outside the schedule (e.g. MemoryRecycle hard-breach escalation). + public Task RequestRecycleNowAsync(string reason, CancellationToken cancellationToken) => + _supervisor.RecycleAsync(reason, cancellationToken); +} diff --git a/src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs new file mode 100644 index 0000000..f20ab55 --- /dev/null +++ b/src/ZB.MOM.WW.OtOpcUa.Core/Stability/WedgeDetector.cs @@ -0,0 +1,81 @@ +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; + +namespace ZB.MOM.WW.OtOpcUa.Core.Stability; + +/// +/// Demand-aware driver-wedge detector per docs/v2/plan.md decision #147. +/// Flips a driver to only when BOTH of the following hold: +/// (a) there is pending work outstanding, AND (b) no progress has been observed for longer +/// than . Idle drivers, write-only burst drivers, and subscription-only +/// drivers whose signals don't arrive regularly all stay Healthy. +/// +/// +/// Pending work signal is supplied by the caller via : +/// non-zero Polly bulkhead depth, ≥1 active MonitoredItem, or ≥1 queued historian read +/// each qualifies. The detector itself is state-light: all it remembers is the last +/// LastProgressUtc it saw and the last wedge verdict. No history buffer. +/// +/// Default threshold per plan: 5 × PublishingInterval, with a minimum of 60 s. +/// Concrete values are driver-agnostic and configured per-instance by the caller. +/// +public sealed class WedgeDetector +{ + /// Wedge-detection threshold; pass < 60 s and the detector clamps to 60 s. + public TimeSpan Threshold { get; } + + /// Whether the driver reported itself at construction. + public WedgeDetector(TimeSpan threshold) + { + Threshold = threshold < TimeSpan.FromSeconds(60) ? TimeSpan.FromSeconds(60) : threshold; + } + + /// + /// Classify the current state against the demand signal. Does not retain state across + /// calls — each call is self-contained; the caller owns the LastProgressUtc clock. + /// + public WedgeVerdict Classify(DriverState state, DemandSignal demand, DateTime utcNow) + { + if (state != DriverState.Healthy) + return WedgeVerdict.NotApplicable; + + if (!demand.HasPendingWork) + return WedgeVerdict.Idle; + + var sinceProgress = utcNow - demand.LastProgressUtc; + return sinceProgress > Threshold ? WedgeVerdict.Faulted : WedgeVerdict.Healthy; + } +} + +/// +/// Caller-supplied demand snapshot. All three counters are OR'd — any non-zero means work +/// is outstanding, which is the trigger for checking the clock. +/// +/// Polly bulkhead depth (in-flight capability calls). +/// Number of live OPC UA MonitoredItems bound to this driver. +/// Pending historian-read requests the driver owes the server. +/// Last time the driver reported a successful unit of work (read, subscribe-ack, publish). +public readonly record struct DemandSignal( + int BulkheadDepth, + int ActiveMonitoredItems, + int QueuedHistoryReads, + DateTime LastProgressUtc) +{ + /// True when any of the three counters is > 0. + public bool HasPendingWork => BulkheadDepth > 0 || ActiveMonitoredItems > 0 || QueuedHistoryReads > 0; +} + +/// Outcome of a single call. +public enum WedgeVerdict +{ + /// Driver wasn't Healthy to begin with — wedge detection doesn't apply. + NotApplicable, + + /// Driver claims Healthy + no pending work → stays Healthy. + Idle, + + /// Driver claims Healthy + has pending work + has made progress within the threshold → stays Healthy. + Healthy, + + /// Driver claims Healthy + has pending work + has NOT made progress within the threshold → wedged. + Faulted, +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryRecycleTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryRecycleTests.cs new file mode 100644 index 0000000..00dbf43 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/MemoryRecycleTests.cs @@ -0,0 +1,91 @@ +using Microsoft.Extensions.Logging.Abstractions; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; +using ZB.MOM.WW.OtOpcUa.Core.Stability; + +namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability; + +[Trait("Category", "Unit")] +public sealed class MemoryRecycleTests +{ + [Fact] + public async Task TierC_HardBreach_RequestsSupervisorRecycle() + { + var supervisor = new FakeSupervisor(); + var recycle = new MemoryRecycle(DriverTier.C, supervisor, NullLogger.Instance); + + var requested = await recycle.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None); + + requested.ShouldBeTrue(); + supervisor.RecycleCount.ShouldBe(1); + supervisor.LastReason.ShouldContain("hard-breach"); + } + + [Theory] + [InlineData(DriverTier.A)] + [InlineData(DriverTier.B)] + public async Task InProcessTier_HardBreach_NeverRequestsRecycle(DriverTier tier) + { + var supervisor = new FakeSupervisor(); + var recycle = new MemoryRecycle(tier, supervisor, NullLogger.Instance); + + var requested = await recycle.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None); + + requested.ShouldBeFalse("Tier A/B hard-breach logs a promotion recommendation only (decisions #74, #145)"); + supervisor.RecycleCount.ShouldBe(0); + } + + [Fact] + public async Task TierC_WithoutSupervisor_HardBreach_NoOp() + { + var recycle = new MemoryRecycle(DriverTier.C, supervisor: null, NullLogger.Instance); + + var requested = await recycle.HandleAsync(MemoryTrackingAction.HardBreach, 2_000_000_000, CancellationToken.None); + + requested.ShouldBeFalse("no supervisor → no recycle path; action logged only"); + } + + [Theory] + [InlineData(DriverTier.A)] + [InlineData(DriverTier.B)] + [InlineData(DriverTier.C)] + public async Task SoftBreach_NeverRequestsRecycle(DriverTier tier) + { + var supervisor = new FakeSupervisor(); + var recycle = new MemoryRecycle(tier, supervisor, NullLogger.Instance); + + var requested = await recycle.HandleAsync(MemoryTrackingAction.SoftBreach, 1_000_000_000, CancellationToken.None); + + requested.ShouldBeFalse("soft-breach is surface-only at every tier"); + supervisor.RecycleCount.ShouldBe(0); + } + + [Theory] + [InlineData(MemoryTrackingAction.None)] + [InlineData(MemoryTrackingAction.Warming)] + public async Task NonBreachActions_NoOp(MemoryTrackingAction action) + { + var supervisor = new FakeSupervisor(); + var recycle = new MemoryRecycle(DriverTier.C, supervisor, NullLogger.Instance); + + var requested = await recycle.HandleAsync(action, 100_000_000, CancellationToken.None); + + requested.ShouldBeFalse(); + supervisor.RecycleCount.ShouldBe(0); + } + + private sealed class FakeSupervisor : IDriverSupervisor + { + public string DriverInstanceId => "fake-tier-c"; + public int RecycleCount { get; private set; } + public string? LastReason { get; private set; } + + public Task RecycleAsync(string reason, CancellationToken cancellationToken) + { + RecycleCount++; + LastReason = reason; + return Task.CompletedTask; + } + } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/ScheduledRecycleSchedulerTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/ScheduledRecycleSchedulerTests.cs new file mode 100644 index 0000000..7cb16f4 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/ScheduledRecycleSchedulerTests.cs @@ -0,0 +1,101 @@ +using Microsoft.Extensions.Logging.Abstractions; +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; +using ZB.MOM.WW.OtOpcUa.Core.Stability; + +namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability; + +[Trait("Category", "Unit")] +public sealed class ScheduledRecycleSchedulerTests +{ + private static readonly DateTime T0 = new(2026, 4, 19, 0, 0, 0, DateTimeKind.Utc); + private static readonly TimeSpan Weekly = TimeSpan.FromDays(7); + + [Theory] + [InlineData(DriverTier.A)] + [InlineData(DriverTier.B)] + public void TierAOrB_Ctor_Throws(DriverTier tier) + { + var supervisor = new FakeSupervisor(); + Should.Throw(() => new ScheduledRecycleScheduler( + tier, Weekly, T0, supervisor, NullLogger.Instance)); + } + + [Fact] + public void ZeroOrNegativeInterval_Throws() + { + var supervisor = new FakeSupervisor(); + Should.Throw(() => new ScheduledRecycleScheduler( + DriverTier.C, TimeSpan.Zero, T0, supervisor, NullLogger.Instance)); + Should.Throw(() => new ScheduledRecycleScheduler( + DriverTier.C, TimeSpan.FromSeconds(-1), T0, supervisor, NullLogger.Instance)); + } + + [Fact] + public async Task Tick_BeforeNextRecycle_NoOp() + { + var supervisor = new FakeSupervisor(); + var sch = new ScheduledRecycleScheduler(DriverTier.C, Weekly, T0, supervisor, NullLogger.Instance); + + var fired = await sch.TickAsync(T0 + TimeSpan.FromDays(6), CancellationToken.None); + + fired.ShouldBeFalse(); + supervisor.RecycleCount.ShouldBe(0); + } + + [Fact] + public async Task Tick_AtOrAfterNextRecycle_FiresOnce_AndAdvances() + { + var supervisor = new FakeSupervisor(); + var sch = new ScheduledRecycleScheduler(DriverTier.C, Weekly, T0, supervisor, NullLogger.Instance); + + var fired = await sch.TickAsync(T0 + Weekly + TimeSpan.FromMinutes(1), CancellationToken.None); + + fired.ShouldBeTrue(); + supervisor.RecycleCount.ShouldBe(1); + sch.NextRecycleUtc.ShouldBe(T0 + Weekly + Weekly); + } + + [Fact] + public async Task RequestRecycleNow_Fires_Immediately_WithoutAdvancingSchedule() + { + var supervisor = new FakeSupervisor(); + var sch = new ScheduledRecycleScheduler(DriverTier.C, Weekly, T0, supervisor, NullLogger.Instance); + var nextBefore = sch.NextRecycleUtc; + + await sch.RequestRecycleNowAsync("memory hard-breach", CancellationToken.None); + + supervisor.RecycleCount.ShouldBe(1); + supervisor.LastReason.ShouldBe("memory hard-breach"); + sch.NextRecycleUtc.ShouldBe(nextBefore, "ad-hoc recycle doesn't shift the cron schedule"); + } + + [Fact] + public async Task MultipleFires_AcrossTicks_AdvanceOneIntervalEach() + { + var supervisor = new FakeSupervisor(); + var sch = new ScheduledRecycleScheduler(DriverTier.C, TimeSpan.FromDays(1), T0, supervisor, NullLogger.Instance); + + await sch.TickAsync(T0 + TimeSpan.FromDays(1) + TimeSpan.FromHours(1), CancellationToken.None); + await sch.TickAsync(T0 + TimeSpan.FromDays(2) + TimeSpan.FromHours(1), CancellationToken.None); + await sch.TickAsync(T0 + TimeSpan.FromDays(3) + TimeSpan.FromHours(1), CancellationToken.None); + + supervisor.RecycleCount.ShouldBe(3); + sch.NextRecycleUtc.ShouldBe(T0 + TimeSpan.FromDays(4)); + } + + private sealed class FakeSupervisor : IDriverSupervisor + { + public string DriverInstanceId => "tier-c-fake"; + public int RecycleCount { get; private set; } + public string? LastReason { get; private set; } + + public Task RecycleAsync(string reason, CancellationToken cancellationToken) + { + RecycleCount++; + LastReason = reason; + return Task.CompletedTask; + } + } +} diff --git a/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/WedgeDetectorTests.cs b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/WedgeDetectorTests.cs new file mode 100644 index 0000000..c0c2a79 --- /dev/null +++ b/tests/ZB.MOM.WW.OtOpcUa.Core.Tests/Stability/WedgeDetectorTests.cs @@ -0,0 +1,112 @@ +using Shouldly; +using Xunit; +using ZB.MOM.WW.OtOpcUa.Core.Abstractions; +using ZB.MOM.WW.OtOpcUa.Core.Stability; + +namespace ZB.MOM.WW.OtOpcUa.Core.Tests.Stability; + +[Trait("Category", "Unit")] +public sealed class WedgeDetectorTests +{ + private static readonly DateTime Now = new(2026, 4, 19, 12, 0, 0, DateTimeKind.Utc); + private static readonly TimeSpan Threshold = TimeSpan.FromSeconds(120); + + [Fact] + public void SubSixtySecondThreshold_ClampsToSixty() + { + var detector = new WedgeDetector(TimeSpan.FromSeconds(10)); + detector.Threshold.ShouldBe(TimeSpan.FromSeconds(60)); + } + + [Fact] + public void Unhealthy_Driver_AlwaysNotApplicable() + { + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(BulkheadDepth: 5, ActiveMonitoredItems: 10, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-10)); + + detector.Classify(DriverState.Faulted, demand, Now).ShouldBe(WedgeVerdict.NotApplicable); + detector.Classify(DriverState.Degraded, demand, Now).ShouldBe(WedgeVerdict.NotApplicable); + detector.Classify(DriverState.Initializing, demand, Now).ShouldBe(WedgeVerdict.NotApplicable); + } + + [Fact] + public void Idle_Subscription_Only_StaysIdle() + { + // Idle driver: bulkhead 0, monitored items 0, no history reads queued. + // Even if LastProgressUtc is ancient, the verdict is Idle, not Faulted. + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(0, 0, 0, Now.AddHours(-12)); + + detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Idle); + } + + [Fact] + public void PendingWork_WithRecentProgress_StaysHealthy() + { + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(BulkheadDepth: 2, ActiveMonitoredItems: 0, QueuedHistoryReads: 0, LastProgressUtc: Now.AddSeconds(-30)); + + detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy); + } + + [Fact] + public void PendingWork_WithStaleProgress_IsFaulted() + { + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(BulkheadDepth: 2, ActiveMonitoredItems: 0, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-5)); + + detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Faulted); + } + + [Fact] + public void MonitoredItems_Active_ButNoRecentPublish_IsFaulted() + { + // Subscription-only driver with live MonitoredItems but no publish progress within threshold + // is a real wedge — this is the case the previous "no successful Read" formulation used + // to miss (no reads ever happen). + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 5, QueuedHistoryReads: 0, LastProgressUtc: Now.AddMinutes(-10)); + + detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Faulted); + } + + [Fact] + public void MonitoredItems_Active_WithFreshPublish_StaysHealthy() + { + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 5, QueuedHistoryReads: 0, LastProgressUtc: Now.AddSeconds(-10)); + + detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy); + } + + [Fact] + public void HistoryBackfill_SlowButMakingProgress_StaysHealthy() + { + // Slow historian backfill — QueuedHistoryReads > 0 but progress advances within threshold. + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(BulkheadDepth: 0, ActiveMonitoredItems: 0, QueuedHistoryReads: 50, LastProgressUtc: Now.AddSeconds(-60)); + + detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Healthy); + } + + [Fact] + public void WriteOnlyBurst_StaysIdle_WhenBulkheadEmpty() + { + // A write-only driver that just finished a burst: bulkhead drained, no subscriptions, no + // history reads. Idle — the previous formulation would have faulted here because no + // reads were succeeding even though the driver is perfectly healthy. + var detector = new WedgeDetector(Threshold); + var demand = new DemandSignal(0, 0, 0, Now.AddMinutes(-30)); + + detector.Classify(DriverState.Healthy, demand, Now).ShouldBe(WedgeVerdict.Idle); + } + + [Fact] + public void DemandSignal_HasPendingWork_TrueForAnyNonZeroCounter() + { + new DemandSignal(1, 0, 0, Now).HasPendingWork.ShouldBeTrue(); + new DemandSignal(0, 1, 0, Now).HasPendingWork.ShouldBeTrue(); + new DemandSignal(0, 0, 1, Now).HasPendingWork.ShouldBeTrue(); + new DemandSignal(0, 0, 0, Now).HasPendingWork.ShouldBeFalse(); + } +} -- 2.49.1