chore: organize solution into module folders (Core/Server/Drivers/Client/Tooling)
Group all 69 projects into category subfolders under src/ and tests/ so the Rider Solution Explorer mirrors the module structure. Folders: Core, Server, Drivers (with a nested Driver CLIs subfolder), Client, Tooling. - Move every project folder on disk with git mv (history preserved as renames). - Recompute relative paths in 57 .csproj files: cross-category ProjectReferences, the lib/ HintPath+None refs in Driver.Historian.Wonderware, and the external mxaccessgw refs in Driver.Galaxy and its test project. - Rebuild ZB.MOM.WW.OtOpcUa.slnx with nested solution folders. - Re-prefix project paths in functional scripts (e2e, compliance, smoke SQL, integration, install). Build green (0 errors); unit tests pass. Docs left for a separate pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Tier C only process-recycle companion to <see cref="MemoryTracking"/>. On a
|
||||
/// <see cref="MemoryTrackingAction.HardBreach"/> signal, invokes the supplied
|
||||
/// <see cref="IDriverSupervisor"/> to restart the out-of-process Host.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #74 and #145. Tier A/B hard-breach on an in-process
|
||||
/// driver would kill every OPC UA session and every co-hosted driver, so for Tier A/B this
|
||||
/// class logs a <b>promotion-to-Tier-C recommendation</b> and does NOT invoke any supervisor.
|
||||
/// A future tier-migration workflow acts on the recommendation.
|
||||
/// </remarks>
|
||||
public sealed class MemoryRecycle
|
||||
{
|
||||
private readonly DriverTier _tier;
|
||||
private readonly IDriverSupervisor? _supervisor;
|
||||
private readonly ILogger<MemoryRecycle> _logger;
|
||||
|
||||
public MemoryRecycle(DriverTier tier, IDriverSupervisor? supervisor, ILogger<MemoryRecycle> logger)
|
||||
{
|
||||
_tier = tier;
|
||||
_supervisor = supervisor;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Handle a <see cref="MemoryTracking"/> classification for the driver. For Tier C with a
|
||||
/// wired supervisor, <c>HardBreach</c> triggers <see cref="IDriverSupervisor.RecycleAsync"/>.
|
||||
/// All other combinations are no-ops with respect to process state (soft breaches + Tier A/B
|
||||
/// hard breaches just log).
|
||||
/// </summary>
|
||||
/// <returns>True when a recycle was requested; false otherwise.</returns>
|
||||
public async Task<bool> HandleAsync(MemoryTrackingAction action, long footprintBytes, CancellationToken cancellationToken)
|
||||
{
|
||||
switch (action)
|
||||
{
|
||||
case MemoryTrackingAction.SoftBreach:
|
||||
_logger.LogWarning(
|
||||
"Memory soft-breach on driver {DriverId}: footprint={Footprint:N0} bytes, tier={Tier}. Surfaced to Admin; no action.",
|
||||
_supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes, _tier);
|
||||
return false;
|
||||
|
||||
case MemoryTrackingAction.HardBreach when _tier == DriverTier.C && _supervisor is not null:
|
||||
_logger.LogError(
|
||||
"Memory hard-breach on Tier C driver {DriverId}: footprint={Footprint:N0} bytes. Requesting supervisor recycle.",
|
||||
_supervisor.DriverInstanceId, footprintBytes);
|
||||
await _supervisor.RecycleAsync($"Memory hard-breach: {footprintBytes} bytes", cancellationToken).ConfigureAwait(false);
|
||||
return true;
|
||||
|
||||
case MemoryTrackingAction.HardBreach:
|
||||
_logger.LogError(
|
||||
"Memory hard-breach on Tier {Tier} in-process driver {DriverId}: footprint={Footprint:N0} bytes. " +
|
||||
"Recommending promotion to Tier C; NOT auto-killing (decisions #74, #145).",
|
||||
_tier, _supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes);
|
||||
return false;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Tier-agnostic memory-footprint tracker. Captures the post-initialize <b>baseline</b>
|
||||
/// from the first samples after <c>IDriver.InitializeAsync</c>, then classifies each
|
||||
/// subsequent sample against a hybrid soft/hard threshold per
|
||||
/// <c>docs/v2/plan.md</c> decision #146 — <c>soft = max(multiplier × baseline, baseline + floor)</c>,
|
||||
/// <c>hard = 2 × soft</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>Per decision #145, this tracker <b>never kills a process</b>. Soft and hard breaches
|
||||
/// log + surface to the Admin UI via <c>DriverInstanceResilienceStatus</c>. The matching
|
||||
/// process-level recycle protection lives in a separate <c>MemoryRecycle</c> that activates
|
||||
/// for Tier C drivers only (where the driver runs out-of-process behind a supervisor that
|
||||
/// can safely restart it without tearing down the OPC UA session or co-hosted in-proc
|
||||
/// drivers).</para>
|
||||
///
|
||||
/// <para>Baseline capture: the tracker starts in <see cref="TrackingPhase.WarmingUp"/> for
|
||||
/// <see cref="BaselineWindow"/> (default 5 min). During that window samples are collected;
|
||||
/// the baseline is computed as the median once the window elapses. Before that point every
|
||||
/// classification returns <see cref="MemoryTrackingAction.Warming"/>.</para>
|
||||
/// </remarks>
|
||||
public sealed class MemoryTracking
|
||||
{
|
||||
private readonly DriverTier _tier;
|
||||
private readonly TimeSpan _baselineWindow;
|
||||
private readonly List<long> _warmupSamples = [];
|
||||
private long _baselineBytes;
|
||||
private TrackingPhase _phase = TrackingPhase.WarmingUp;
|
||||
private DateTime? _warmupStartUtc;
|
||||
|
||||
/// <summary>Tier-default multiplier/floor constants per decision #146.</summary>
|
||||
public static (int Multiplier, long FloorBytes) GetTierConstants(DriverTier tier) => tier switch
|
||||
{
|
||||
DriverTier.A => (Multiplier: 3, FloorBytes: 50L * 1024 * 1024),
|
||||
DriverTier.B => (Multiplier: 3, FloorBytes: 100L * 1024 * 1024),
|
||||
DriverTier.C => (Multiplier: 2, FloorBytes: 500L * 1024 * 1024),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No memory-tracking constants defined for tier {tier}."),
|
||||
};
|
||||
|
||||
/// <summary>Window over which post-init samples are collected to compute the baseline.</summary>
|
||||
public TimeSpan BaselineWindow => _baselineWindow;
|
||||
|
||||
/// <summary>Current phase: <see cref="TrackingPhase.WarmingUp"/> or <see cref="TrackingPhase.Steady"/>.</summary>
|
||||
public TrackingPhase Phase => _phase;
|
||||
|
||||
/// <summary>Captured baseline; 0 until warmup completes.</summary>
|
||||
public long BaselineBytes => _baselineBytes;
|
||||
|
||||
/// <summary>Effective soft threshold (zero while warming up).</summary>
|
||||
public long SoftThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes);
|
||||
|
||||
/// <summary>Effective hard threshold = 2 × soft (zero while warming up).</summary>
|
||||
public long HardThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes) * 2;
|
||||
|
||||
public MemoryTracking(DriverTier tier, TimeSpan? baselineWindow = null)
|
||||
{
|
||||
_tier = tier;
|
||||
_baselineWindow = baselineWindow ?? TimeSpan.FromMinutes(5);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Submit a memory-footprint sample. Returns the action the caller should surface.
|
||||
/// During warmup, always returns <see cref="MemoryTrackingAction.Warming"/> and accumulates
|
||||
/// samples; once the window elapses the first steady-phase sample triggers baseline capture
|
||||
/// (median of warmup samples).
|
||||
/// </summary>
|
||||
public MemoryTrackingAction Sample(long footprintBytes, DateTime utcNow)
|
||||
{
|
||||
if (_phase == TrackingPhase.WarmingUp)
|
||||
{
|
||||
_warmupStartUtc ??= utcNow;
|
||||
_warmupSamples.Add(footprintBytes);
|
||||
if (utcNow - _warmupStartUtc.Value >= _baselineWindow && _warmupSamples.Count > 0)
|
||||
{
|
||||
_baselineBytes = ComputeMedian(_warmupSamples);
|
||||
_phase = TrackingPhase.Steady;
|
||||
}
|
||||
else
|
||||
{
|
||||
return MemoryTrackingAction.Warming;
|
||||
}
|
||||
}
|
||||
|
||||
if (footprintBytes >= HardThresholdBytes) return MemoryTrackingAction.HardBreach;
|
||||
if (footprintBytes >= SoftThresholdBytes) return MemoryTrackingAction.SoftBreach;
|
||||
return MemoryTrackingAction.None;
|
||||
}
|
||||
|
||||
private static long ComputeSoft(DriverTier tier, long baseline)
|
||||
{
|
||||
var (multiplier, floor) = GetTierConstants(tier);
|
||||
return Math.Max(multiplier * baseline, baseline + floor);
|
||||
}
|
||||
|
||||
private static long ComputeMedian(List<long> samples)
|
||||
{
|
||||
var sorted = samples.Order().ToArray();
|
||||
var mid = sorted.Length / 2;
|
||||
return sorted.Length % 2 == 1
|
||||
? sorted[mid]
|
||||
: (sorted[mid - 1] + sorted[mid]) / 2;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Phase of a <see cref="MemoryTracking"/> lifecycle.</summary>
|
||||
public enum TrackingPhase
|
||||
{
|
||||
/// <summary>Collecting post-init samples; baseline not yet computed.</summary>
|
||||
WarmingUp,
|
||||
|
||||
/// <summary>Baseline captured; every sample classified against soft/hard thresholds.</summary>
|
||||
Steady,
|
||||
}
|
||||
|
||||
/// <summary>Classification the tracker returns per sample.</summary>
|
||||
public enum MemoryTrackingAction
|
||||
{
|
||||
/// <summary>Baseline not yet captured; sample collected, no threshold check.</summary>
|
||||
Warming,
|
||||
|
||||
/// <summary>Below soft threshold.</summary>
|
||||
None,
|
||||
|
||||
/// <summary>Between soft and hard thresholds — log + surface, no action.</summary>
|
||||
SoftBreach,
|
||||
|
||||
/// <summary>
|
||||
/// ≥ hard threshold. Log + surface + (Tier C only, via <c>MemoryRecycle</c>) request
|
||||
/// process recycle via the driver supervisor. Tier A/B breach never invokes any
|
||||
/// kill path per decisions #145 and #74.
|
||||
/// </summary>
|
||||
HardBreach,
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Tier C opt-in periodic-recycle driver per <c>docs/v2/plan.md</c> decision #67.
|
||||
/// A tick method advanced by the caller (fed by a background timer in prod; by test clock
|
||||
/// in unit tests) decides whether the configured interval has elapsed and, if so, drives the
|
||||
/// supplied <see cref="IDriverSupervisor"/> to recycle the Host.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Tier A/B drivers MUST NOT use this class — scheduled recycle for in-process drivers would
|
||||
/// kill every OPC UA session and every co-hosted driver. The ctor throws when constructed
|
||||
/// with any tier other than C to make the misuse structurally impossible.
|
||||
///
|
||||
/// <para>Keeps no background thread of its own — callers invoke <see cref="TickAsync"/> on
|
||||
/// their ambient scheduler tick (Phase 6.1 Stream C's health-endpoint host runs one). That
|
||||
/// decouples the unit under test from wall-clock time and thread-pool scheduling.</para>
|
||||
/// </remarks>
|
||||
public sealed class ScheduledRecycleScheduler
|
||||
{
|
||||
private readonly TimeSpan _recycleInterval;
|
||||
private readonly IDriverSupervisor _supervisor;
|
||||
private readonly ILogger<ScheduledRecycleScheduler> _logger;
|
||||
private DateTime _nextRecycleUtc;
|
||||
|
||||
/// <summary>
|
||||
/// Construct the scheduler for a Tier C driver. Throws if <paramref name="tier"/> isn't C.
|
||||
/// </summary>
|
||||
/// <param name="tier">Driver tier; must be <see cref="DriverTier.C"/>.</param>
|
||||
/// <param name="recycleInterval">Interval between recycles (e.g. 7 days).</param>
|
||||
/// <param name="startUtc">Anchor time; next recycle fires at <paramref name="startUtc"/> + <paramref name="recycleInterval"/>.</param>
|
||||
/// <param name="supervisor">Supervisor that performs the actual recycle.</param>
|
||||
/// <param name="logger">Diagnostic sink.</param>
|
||||
public ScheduledRecycleScheduler(
|
||||
DriverTier tier,
|
||||
TimeSpan recycleInterval,
|
||||
DateTime startUtc,
|
||||
IDriverSupervisor supervisor,
|
||||
ILogger<ScheduledRecycleScheduler> logger)
|
||||
{
|
||||
if (tier != DriverTier.C)
|
||||
throw new ArgumentException(
|
||||
$"ScheduledRecycleScheduler is Tier C only (got {tier}). " +
|
||||
"In-process drivers must not use scheduled recycle; see decisions #74 and #145.",
|
||||
nameof(tier));
|
||||
|
||||
if (recycleInterval <= TimeSpan.Zero)
|
||||
throw new ArgumentException("RecycleInterval must be positive.", nameof(recycleInterval));
|
||||
|
||||
_recycleInterval = recycleInterval;
|
||||
_supervisor = supervisor;
|
||||
_logger = logger;
|
||||
_nextRecycleUtc = startUtc + recycleInterval;
|
||||
}
|
||||
|
||||
/// <summary>Next scheduled recycle UTC. Advances by <see cref="RecycleInterval"/> on each fire.</summary>
|
||||
public DateTime NextRecycleUtc => _nextRecycleUtc;
|
||||
|
||||
/// <summary>Recycle interval this scheduler was constructed with.</summary>
|
||||
public TimeSpan RecycleInterval => _recycleInterval;
|
||||
|
||||
/// <summary>
|
||||
/// Tick the scheduler forward. If <paramref name="utcNow"/> is past
|
||||
/// <see cref="NextRecycleUtc"/>, requests a recycle from the supervisor and advances
|
||||
/// <see cref="NextRecycleUtc"/> by exactly one interval. Returns true when a recycle fired.
|
||||
/// </summary>
|
||||
public async Task<bool> TickAsync(DateTime utcNow, CancellationToken cancellationToken)
|
||||
{
|
||||
if (utcNow < _nextRecycleUtc)
|
||||
return false;
|
||||
|
||||
_logger.LogInformation(
|
||||
"Scheduled recycle due for Tier C driver {DriverId} at {Now:o}; advancing next to {Next:o}.",
|
||||
_supervisor.DriverInstanceId, utcNow, _nextRecycleUtc + _recycleInterval);
|
||||
|
||||
await _supervisor.RecycleAsync("Scheduled periodic recycle", cancellationToken).ConfigureAwait(false);
|
||||
_nextRecycleUtc += _recycleInterval;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>Request an immediate recycle outside the schedule (e.g. MemoryRecycle hard-breach escalation).</summary>
|
||||
public Task RequestRecycleNowAsync(string reason, CancellationToken cancellationToken) =>
|
||||
_supervisor.RecycleAsync(reason, cancellationToken);
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
|
||||
|
||||
/// <summary>
|
||||
/// Demand-aware driver-wedge detector per <c>docs/v2/plan.md</c> decision #147.
|
||||
/// Flips a driver to <see cref="WedgeVerdict.Faulted"/> only when BOTH of the following hold:
|
||||
/// (a) there is pending work outstanding, AND (b) no progress has been observed for longer
|
||||
/// than <see cref="Threshold"/>. Idle drivers, write-only burst drivers, and subscription-only
|
||||
/// drivers whose signals don't arrive regularly all stay Healthy.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>Pending work signal is supplied by the caller via <see cref="DemandSignal"/>:
|
||||
/// non-zero Polly bulkhead depth, ≥1 active MonitoredItem, or ≥1 queued historian read
|
||||
/// each qualifies. The detector itself is state-light: all it remembers is the last
|
||||
/// <c>LastProgressUtc</c> it saw and the last wedge verdict. No history buffer.</para>
|
||||
///
|
||||
/// <para>Default threshold per plan: <c>5 × PublishingInterval</c>, with a minimum of 60 s.
|
||||
/// Concrete values are driver-agnostic and configured per-instance by the caller.</para>
|
||||
/// </remarks>
|
||||
public sealed class WedgeDetector
|
||||
{
|
||||
/// <summary>Wedge-detection threshold; pass < 60 s and the detector clamps to 60 s.</summary>
|
||||
public TimeSpan Threshold { get; }
|
||||
|
||||
/// <summary>Whether the driver reported itself <see cref="DriverState.Healthy"/> at construction.</summary>
|
||||
public WedgeDetector(TimeSpan threshold)
|
||||
{
|
||||
Threshold = threshold < TimeSpan.FromSeconds(60) ? TimeSpan.FromSeconds(60) : threshold;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Classify the current state against the demand signal. Does not retain state across
|
||||
/// calls — each call is self-contained; the caller owns the <c>LastProgressUtc</c> clock.
|
||||
/// </summary>
|
||||
public WedgeVerdict Classify(DriverState state, DemandSignal demand, DateTime utcNow)
|
||||
{
|
||||
if (state != DriverState.Healthy)
|
||||
return WedgeVerdict.NotApplicable;
|
||||
|
||||
if (!demand.HasPendingWork)
|
||||
return WedgeVerdict.Idle;
|
||||
|
||||
var sinceProgress = utcNow - demand.LastProgressUtc;
|
||||
return sinceProgress > Threshold ? WedgeVerdict.Faulted : WedgeVerdict.Healthy;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Caller-supplied demand snapshot. All three counters are OR'd — any non-zero means work
|
||||
/// is outstanding, which is the trigger for checking the <see cref="LastProgressUtc"/> clock.
|
||||
/// </summary>
|
||||
/// <param name="BulkheadDepth">Polly bulkhead depth (in-flight capability calls).</param>
|
||||
/// <param name="ActiveMonitoredItems">Number of live OPC UA MonitoredItems bound to this driver.</param>
|
||||
/// <param name="QueuedHistoryReads">Pending historian-read requests the driver owes the server.</param>
|
||||
/// <param name="LastProgressUtc">Last time the driver reported a successful unit of work (read, subscribe-ack, publish).</param>
|
||||
public readonly record struct DemandSignal(
|
||||
int BulkheadDepth,
|
||||
int ActiveMonitoredItems,
|
||||
int QueuedHistoryReads,
|
||||
DateTime LastProgressUtc)
|
||||
{
|
||||
/// <summary>True when any of the three counters is > 0.</summary>
|
||||
public bool HasPendingWork => BulkheadDepth > 0 || ActiveMonitoredItems > 0 || QueuedHistoryReads > 0;
|
||||
}
|
||||
|
||||
/// <summary>Outcome of a single <see cref="WedgeDetector.Classify"/> call.</summary>
|
||||
public enum WedgeVerdict
|
||||
{
|
||||
/// <summary>Driver wasn't Healthy to begin with — wedge detection doesn't apply.</summary>
|
||||
NotApplicable,
|
||||
|
||||
/// <summary>Driver claims Healthy + no pending work → stays Healthy.</summary>
|
||||
Idle,
|
||||
|
||||
/// <summary>Driver claims Healthy + has pending work + has made progress within the threshold → stays Healthy.</summary>
|
||||
Healthy,
|
||||
|
||||
/// <summary>Driver claims Healthy + has pending work + has NOT made progress within the threshold → wedged.</summary>
|
||||
Faulted,
|
||||
}
|
||||
Reference in New Issue
Block a user