chore: organize solution into module folders (Core/Server/Drivers/Client/Tooling)
Group all 69 projects into category subfolders under src/ and tests/ so the Rider Solution Explorer mirrors the module structure. Folders: Core, Server, Drivers (with a nested Driver CLIs subfolder), Client, Tooling. - Move every project folder on disk with git mv (history preserved as renames). - Recompute relative paths in 57 .csproj files: cross-category ProjectReferences, the lib/ HintPath+None refs in Driver.Historian.Wonderware, and the external mxaccessgw refs in Driver.Galaxy and its test project. - Rebuild ZB.MOM.WW.OtOpcUa.slnx with nested solution folders. - Re-prefix project paths in functional scripts (e2e, compliance, smoke SQL, integration, install). Build green (0 errors); unit tests pass. Docs left for a separate pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,129 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Wraps the three mutating surfaces of <see cref="IAlarmSource"/>
|
||||
/// (<see cref="IAlarmSource.SubscribeAlarmsAsync"/>, <see cref="IAlarmSource.UnsubscribeAlarmsAsync"/>,
|
||||
/// <see cref="IAlarmSource.AcknowledgeAsync"/>) through <see cref="CapabilityInvoker"/> so the
|
||||
/// Phase 6.1 resilience pipeline runs — retry semantics match
|
||||
/// <see cref="DriverCapability.AlarmSubscribe"/> (retries by default) and
|
||||
/// <see cref="DriverCapability.AlarmAcknowledge"/> (does NOT retry per decision #143).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>Multi-host dispatch: when the driver implements <see cref="IPerCallHostResolver"/>,
|
||||
/// each source-node-id is resolved individually + grouped by host so a dead PLC inside a
|
||||
/// multi-device driver doesn't poison the sibling hosts' breakers. Drivers with a single
|
||||
/// host fall back to <see cref="IDriver.DriverInstanceId"/> as the single-host key.</para>
|
||||
///
|
||||
/// <para>Why this lives here + not on <see cref="CapabilityInvoker"/>: alarm surfaces have a
|
||||
/// handle-returning shape (SubscribeAlarmsAsync returns <see cref="IAlarmSubscriptionHandle"/>)
|
||||
/// + a per-call fan-out (AcknowledgeAsync gets a batch of
|
||||
/// <see cref="AlarmAcknowledgeRequest"/>s that may span multiple hosts). Keeping the fan-out
|
||||
/// logic here keeps the invoker's execute-overloads narrow.</para>
|
||||
/// </remarks>
|
||||
public sealed class AlarmSurfaceInvoker
|
||||
{
|
||||
private readonly CapabilityInvoker _invoker;
|
||||
private readonly IAlarmSource _alarmSource;
|
||||
private readonly IPerCallHostResolver? _hostResolver;
|
||||
private readonly string _defaultHost;
|
||||
|
||||
public AlarmSurfaceInvoker(
|
||||
CapabilityInvoker invoker,
|
||||
IAlarmSource alarmSource,
|
||||
string defaultHost,
|
||||
IPerCallHostResolver? hostResolver = null)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(invoker);
|
||||
ArgumentNullException.ThrowIfNull(alarmSource);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(defaultHost);
|
||||
|
||||
_invoker = invoker;
|
||||
_alarmSource = alarmSource;
|
||||
_defaultHost = defaultHost;
|
||||
_hostResolver = hostResolver;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Subscribe to alarm events for a set of source node ids, fanning out by resolved host
|
||||
/// so per-host breakers / bulkheads apply. Returns one handle per host — callers that
|
||||
/// don't care about per-host separation may concatenate them.
|
||||
/// </summary>
|
||||
public async Task<IReadOnlyList<IAlarmSubscriptionHandle>> SubscribeAsync(
|
||||
IReadOnlyList<string> sourceNodeIds,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(sourceNodeIds);
|
||||
if (sourceNodeIds.Count == 0) return [];
|
||||
|
||||
var byHost = GroupByHost(sourceNodeIds);
|
||||
var handles = new List<IAlarmSubscriptionHandle>(byHost.Count);
|
||||
foreach (var (host, ids) in byHost)
|
||||
{
|
||||
var handle = await _invoker.ExecuteAsync(
|
||||
DriverCapability.AlarmSubscribe,
|
||||
host,
|
||||
async ct => await _alarmSource.SubscribeAlarmsAsync(ids, ct).ConfigureAwait(false),
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
handles.Add(handle);
|
||||
}
|
||||
return handles;
|
||||
}
|
||||
|
||||
/// <summary>Cancel an alarm subscription. Routes through the AlarmSubscribe pipeline for parity.</summary>
|
||||
public ValueTask UnsubscribeAsync(IAlarmSubscriptionHandle handle, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(handle);
|
||||
return _invoker.ExecuteAsync(
|
||||
DriverCapability.AlarmSubscribe,
|
||||
_defaultHost,
|
||||
async ct => await _alarmSource.UnsubscribeAlarmsAsync(handle, ct).ConfigureAwait(false),
|
||||
cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Acknowledge alarms. Fans out by resolved host; each host's batch runs through the
|
||||
/// AlarmAcknowledge pipeline (no-retry per decision #143 — an alarm-ack is not idempotent
|
||||
/// at the plant-floor acknowledgement level even if the OPC UA spec permits re-issue).
|
||||
/// </summary>
|
||||
public async Task AcknowledgeAsync(
|
||||
IReadOnlyList<AlarmAcknowledgeRequest> acknowledgements,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(acknowledgements);
|
||||
if (acknowledgements.Count == 0) return;
|
||||
|
||||
var byHost = _hostResolver is null
|
||||
? new Dictionary<string, List<AlarmAcknowledgeRequest>> { [_defaultHost] = acknowledgements.ToList() }
|
||||
: acknowledgements
|
||||
.GroupBy(a => _hostResolver.ResolveHost(a.SourceNodeId))
|
||||
.ToDictionary(g => g.Key, g => g.ToList());
|
||||
|
||||
foreach (var (host, batch) in byHost)
|
||||
{
|
||||
var batchSnapshot = batch; // capture for the lambda
|
||||
await _invoker.ExecuteAsync(
|
||||
DriverCapability.AlarmAcknowledge,
|
||||
host,
|
||||
async ct => await _alarmSource.AcknowledgeAsync(batchSnapshot, ct).ConfigureAwait(false),
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private Dictionary<string, List<string>> GroupByHost(IReadOnlyList<string> sourceNodeIds)
|
||||
{
|
||||
if (_hostResolver is null)
|
||||
return new Dictionary<string, List<string>> { [_defaultHost] = sourceNodeIds.ToList() };
|
||||
|
||||
var result = new Dictionary<string, List<string>>(StringComparer.Ordinal);
|
||||
foreach (var id in sourceNodeIds)
|
||||
{
|
||||
var host = _hostResolver.ResolveHost(id);
|
||||
if (!result.TryGetValue(host, out var list))
|
||||
result[host] = list = new List<string>();
|
||||
list.Add(id);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,140 @@
|
||||
using Polly;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Executes driver-capability calls through a shared Polly pipeline. One invoker per
|
||||
/// <c>(DriverInstance, IDriver)</c> pair; the underlying <see cref="DriverResiliencePipelineBuilder"/>
|
||||
/// is process-singleton so all invokers share its cache.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #143-144 and Phase 6.1 Stream A.3. The server's dispatch
|
||||
/// layer routes every capability call (<c>IReadable.ReadAsync</c>, <c>IWritable.WriteAsync</c>,
|
||||
/// <c>ITagDiscovery.DiscoverAsync</c>, <c>ISubscribable.SubscribeAsync/UnsubscribeAsync</c>,
|
||||
/// <c>IHostConnectivityProbe</c> probe loop, <c>IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync</c>,
|
||||
/// and all four <c>IHistoryProvider</c> reads) through this invoker.
|
||||
/// </remarks>
|
||||
public sealed class CapabilityInvoker
|
||||
{
|
||||
private readonly DriverResiliencePipelineBuilder _builder;
|
||||
private readonly string _driverInstanceId;
|
||||
private readonly string _driverType;
|
||||
private readonly Func<DriverResilienceOptions> _optionsAccessor;
|
||||
private readonly DriverResilienceStatusTracker? _statusTracker;
|
||||
|
||||
/// <summary>
|
||||
/// Construct an invoker for one driver instance.
|
||||
/// </summary>
|
||||
/// <param name="builder">Shared, process-singleton pipeline builder.</param>
|
||||
/// <param name="driverInstanceId">The <c>DriverInstance.Id</c> column value.</param>
|
||||
/// <param name="optionsAccessor">
|
||||
/// Snapshot accessor for the current resilience options. Invoked per call so Admin-edit +
|
||||
/// pipeline-invalidate can take effect without restarting the invoker.
|
||||
/// </param>
|
||||
/// <param name="driverType">Driver type name for structured-log enrichment (e.g. <c>"Modbus"</c>).</param>
|
||||
/// <param name="statusTracker">Optional resilience-status tracker. When wired, every capability call records start/complete so Admin <c>/hosts</c> can surface <see cref="ResilienceStatusSnapshot.CurrentInFlight"/> as the bulkhead-depth proxy.</param>
|
||||
public CapabilityInvoker(
|
||||
DriverResiliencePipelineBuilder builder,
|
||||
string driverInstanceId,
|
||||
Func<DriverResilienceOptions> optionsAccessor,
|
||||
string driverType = "Unknown",
|
||||
DriverResilienceStatusTracker? statusTracker = null)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(builder);
|
||||
ArgumentNullException.ThrowIfNull(optionsAccessor);
|
||||
|
||||
_builder = builder;
|
||||
_driverInstanceId = driverInstanceId;
|
||||
_driverType = driverType;
|
||||
_optionsAccessor = optionsAccessor;
|
||||
_statusTracker = statusTracker;
|
||||
}
|
||||
|
||||
/// <summary>Execute a capability call returning a value, honoring the per-capability pipeline.</summary>
|
||||
/// <typeparam name="TResult">Return type of the underlying driver call.</typeparam>
|
||||
public async ValueTask<TResult> ExecuteAsync<TResult>(
|
||||
DriverCapability capability,
|
||||
string hostName,
|
||||
Func<CancellationToken, ValueTask<TResult>> callSite,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(callSite);
|
||||
|
||||
var pipeline = ResolvePipeline(capability, hostName);
|
||||
_statusTracker?.RecordCallStart(_driverInstanceId, hostName);
|
||||
try
|
||||
{
|
||||
using (LogContextEnricher.Push(_driverInstanceId, _driverType, capability, LogContextEnricher.NewCorrelationId()))
|
||||
{
|
||||
return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
_statusTracker?.RecordCallComplete(_driverInstanceId, hostName);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Execute a void-returning capability call, honoring the per-capability pipeline.</summary>
|
||||
public async ValueTask ExecuteAsync(
|
||||
DriverCapability capability,
|
||||
string hostName,
|
||||
Func<CancellationToken, ValueTask> callSite,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(callSite);
|
||||
|
||||
var pipeline = ResolvePipeline(capability, hostName);
|
||||
_statusTracker?.RecordCallStart(_driverInstanceId, hostName);
|
||||
try
|
||||
{
|
||||
using (LogContextEnricher.Push(_driverInstanceId, _driverType, capability, LogContextEnricher.NewCorrelationId()))
|
||||
{
|
||||
await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
_statusTracker?.RecordCallComplete(_driverInstanceId, hostName);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Execute a <see cref="DriverCapability.Write"/> call honoring <see cref="WriteIdempotentAttribute"/>
|
||||
/// semantics — if <paramref name="isIdempotent"/> is <c>false</c>, retries are disabled regardless
|
||||
/// of the tag-level configuration (the pipeline for a non-idempotent write never retries per
|
||||
/// decisions #44-45). If <c>true</c>, the call runs through the capability's pipeline which may
|
||||
/// retry when the tier configuration permits.
|
||||
/// </summary>
|
||||
public async ValueTask<TResult> ExecuteWriteAsync<TResult>(
|
||||
string hostName,
|
||||
bool isIdempotent,
|
||||
Func<CancellationToken, ValueTask<TResult>> callSite,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(callSite);
|
||||
|
||||
if (!isIdempotent)
|
||||
{
|
||||
var noRetryOptions = _optionsAccessor() with
|
||||
{
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Write] = _optionsAccessor().Resolve(DriverCapability.Write) with { RetryCount = 0 },
|
||||
},
|
||||
};
|
||||
var pipeline = _builder.GetOrCreate(_driverInstanceId, $"{hostName}::non-idempotent", DriverCapability.Write, noRetryOptions);
|
||||
using (LogContextEnricher.Push(_driverInstanceId, _driverType, DriverCapability.Write, LogContextEnricher.NewCorrelationId()))
|
||||
{
|
||||
return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
return await ExecuteAsync(DriverCapability.Write, hostName, callSite, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private ResiliencePipeline ResolvePipeline(DriverCapability capability, string hostName) =>
|
||||
_builder.GetOrCreate(_driverInstanceId, hostName, capability, _optionsAccessor());
|
||||
}
|
||||
@@ -0,0 +1,106 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Per-tier × per-capability resilience policy configuration for a driver instance.
|
||||
/// Bound from <c>DriverInstance.ResilienceConfig</c> JSON (nullable column; null = tier defaults).
|
||||
/// Per <c>docs/v2/plan.md</c> decisions #143 and #144.
|
||||
/// </summary>
|
||||
public sealed record DriverResilienceOptions
|
||||
{
|
||||
/// <summary>Tier the owning driver type is registered as; drives the default map.</summary>
|
||||
public required DriverTier Tier { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Per-capability policy overrides. Capabilities absent from this map fall back to
|
||||
/// <see cref="GetTierDefaults(DriverTier)"/> for the configured <see cref="Tier"/>.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<DriverCapability, CapabilityPolicy> CapabilityPolicies { get; init; }
|
||||
= new Dictionary<DriverCapability, CapabilityPolicy>();
|
||||
|
||||
/// <summary>Bulkhead (max concurrent in-flight calls) for every capability. Default 32.</summary>
|
||||
public int BulkheadMaxConcurrent { get; init; } = 32;
|
||||
|
||||
/// <summary>
|
||||
/// Bulkhead queue depth. Zero = no queueing; overflow fails fast with
|
||||
/// <c>BulkheadRejectedException</c>. Default 64.
|
||||
/// </summary>
|
||||
public int BulkheadMaxQueue { get; init; } = 64;
|
||||
|
||||
/// <summary>
|
||||
/// Periodic scheduled recycle interval for Tier C out-of-process hosts, in seconds.
|
||||
/// Null (the default) = no scheduled recycle; the driver's Host process keeps running
|
||||
/// indefinitely unless a memory breach or operator action triggers a recycle. Only
|
||||
/// respected for <see cref="DriverTier.C"/>; Tier A/B recycle would tear down every
|
||||
/// OPC UA session, so the loader ignores non-null values for those tiers + logs a
|
||||
/// warning (per decisions #74 / #145).
|
||||
/// </summary>
|
||||
public int? RecycleIntervalSeconds { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Look up the effective policy for a capability, falling back to tier defaults when no
|
||||
/// override is configured. Never returns null.
|
||||
/// </summary>
|
||||
public CapabilityPolicy Resolve(DriverCapability capability)
|
||||
{
|
||||
if (CapabilityPolicies.TryGetValue(capability, out var policy))
|
||||
return policy;
|
||||
|
||||
var defaults = GetTierDefaults(Tier);
|
||||
return defaults[capability];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Per-tier per-capability default policy table, per decisions #143-144 and the Phase 6.1
|
||||
/// Stream A.2 specification. Retries skipped on <see cref="DriverCapability.Write"/> and
|
||||
/// <see cref="DriverCapability.AlarmAcknowledge"/> regardless of tier.
|
||||
/// </summary>
|
||||
public static IReadOnlyDictionary<DriverCapability, CapabilityPolicy> GetTierDefaults(DriverTier tier) =>
|
||||
tier switch
|
||||
{
|
||||
DriverTier.A => new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Discover] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 3),
|
||||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Probe] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 5, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 5),
|
||||
},
|
||||
DriverTier.B => new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 4, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Discover] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 3),
|
||||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.Probe] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 8, RetryCount: 0, BreakerFailureThreshold: 5),
|
||||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 5),
|
||||
},
|
||||
DriverTier.C => new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Read] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Write] = new(TimeoutSeconds: 10, RetryCount: 0, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Discover] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Subscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.Probe] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 15, RetryCount: 0, BreakerFailureThreshold: 0),
|
||||
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
|
||||
},
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No default policy table defined for tier {tier}."),
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>Policy for one capability on one driver instance.</summary>
|
||||
/// <param name="TimeoutSeconds">Per-call timeout (wraps the inner Polly execution).</param>
|
||||
/// <param name="RetryCount">Number of retry attempts after the first failure; zero = no retry.</param>
|
||||
/// <param name="BreakerFailureThreshold">
|
||||
/// Consecutive-failure count that opens the circuit breaker; zero = no breaker
|
||||
/// (Tier C uses the supervisor's process-level breaker instead, per decision #68).
|
||||
/// </param>
|
||||
public sealed record CapabilityPolicy(int TimeoutSeconds, int RetryCount, int BreakerFailureThreshold);
|
||||
@@ -0,0 +1,132 @@
|
||||
using System.Text.Json;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Parses the <c>DriverInstance.ResilienceConfig</c> JSON column into a
|
||||
/// <see cref="DriverResilienceOptions"/> instance layered on top of the tier defaults.
|
||||
/// Every key in the JSON is optional; missing keys fall back to the tier defaults from
|
||||
/// <see cref="DriverResilienceOptions.GetTierDefaults(DriverTier)"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>Example JSON shape per Phase 6.1 Stream A.2:</para>
|
||||
/// <code>
|
||||
/// {
|
||||
/// "bulkheadMaxConcurrent": 16,
|
||||
/// "bulkheadMaxQueue": 64,
|
||||
/// "capabilityPolicies": {
|
||||
/// "Read": { "timeoutSeconds": 5, "retryCount": 5, "breakerFailureThreshold": 3 },
|
||||
/// "Write": { "timeoutSeconds": 5, "retryCount": 0, "breakerFailureThreshold": 5 }
|
||||
/// }
|
||||
/// }
|
||||
/// </code>
|
||||
///
|
||||
/// <para>Unrecognised keys + values are ignored so future shapes land without a migration.
|
||||
/// Per-capability overrides are layered on top of tier defaults — a partial policy (only
|
||||
/// some of TimeoutSeconds/RetryCount/BreakerFailureThreshold) fills in the other fields
|
||||
/// from the tier default for that capability.</para>
|
||||
///
|
||||
/// <para>Parser failures (malformed JSON, type mismatches) fall back to pure tier defaults
|
||||
/// + surface through an out-parameter diagnostic. Callers may log the diagnostic but should
|
||||
/// NOT fail driver startup — a misconfigured ResilienceConfig should never brick a
|
||||
/// working driver.</para>
|
||||
/// </remarks>
|
||||
public static class DriverResilienceOptionsParser
|
||||
{
|
||||
private static readonly JsonSerializerOptions JsonOpts = new()
|
||||
{
|
||||
PropertyNameCaseInsensitive = true,
|
||||
AllowTrailingCommas = true,
|
||||
ReadCommentHandling = JsonCommentHandling.Skip,
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Parse the JSON payload layered on <paramref name="tier"/>'s defaults. Returns the
|
||||
/// effective options; <paramref name="parseDiagnostic"/> is null on success, or a
|
||||
/// human-readable error message when the JSON was malformed (options still returned
|
||||
/// = tier defaults).
|
||||
/// </summary>
|
||||
public static DriverResilienceOptions ParseOrDefaults(
|
||||
DriverTier tier,
|
||||
string? resilienceConfigJson,
|
||||
out string? parseDiagnostic)
|
||||
{
|
||||
parseDiagnostic = null;
|
||||
var baseDefaults = DriverResilienceOptions.GetTierDefaults(tier);
|
||||
var baseOptions = new DriverResilienceOptions { Tier = tier, CapabilityPolicies = baseDefaults };
|
||||
|
||||
if (string.IsNullOrWhiteSpace(resilienceConfigJson))
|
||||
return baseOptions;
|
||||
|
||||
ResilienceConfigShape? shape;
|
||||
try
|
||||
{
|
||||
shape = JsonSerializer.Deserialize<ResilienceConfigShape>(resilienceConfigJson, JsonOpts);
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
parseDiagnostic = $"ResilienceConfig JSON malformed; falling back to tier {tier} defaults. Detail: {ex.Message}";
|
||||
return baseOptions;
|
||||
}
|
||||
|
||||
if (shape is null) return baseOptions;
|
||||
|
||||
var merged = new Dictionary<DriverCapability, CapabilityPolicy>(baseDefaults);
|
||||
if (shape.CapabilityPolicies is not null)
|
||||
{
|
||||
foreach (var (capName, overridePolicy) in shape.CapabilityPolicies)
|
||||
{
|
||||
if (!Enum.TryParse<DriverCapability>(capName, ignoreCase: true, out var capability))
|
||||
{
|
||||
parseDiagnostic ??= $"Unknown capability '{capName}' in ResilienceConfig; skipped.";
|
||||
continue;
|
||||
}
|
||||
|
||||
var basePolicy = merged[capability];
|
||||
merged[capability] = new CapabilityPolicy(
|
||||
TimeoutSeconds: overridePolicy.TimeoutSeconds ?? basePolicy.TimeoutSeconds,
|
||||
RetryCount: overridePolicy.RetryCount ?? basePolicy.RetryCount,
|
||||
BreakerFailureThreshold: overridePolicy.BreakerFailureThreshold ?? basePolicy.BreakerFailureThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
// Scheduled recycle is Tier C only — reject a configured interval on Tier A/B as a
|
||||
// misconfiguration surface rather than silently honouring it (recycling an in-process
|
||||
// driver would kill every OPC UA session + every co-hosted driver, per decision #74).
|
||||
int? recycleIntervalSeconds = null;
|
||||
if (shape.RecycleIntervalSeconds is int secs)
|
||||
{
|
||||
if (secs <= 0)
|
||||
parseDiagnostic ??= $"RecycleIntervalSeconds must be positive; got {secs} — ignoring.";
|
||||
else if (tier != DriverTier.C)
|
||||
parseDiagnostic ??= $"RecycleIntervalSeconds is Tier C only; Tier {tier} driver will not scheduled-recycle.";
|
||||
else
|
||||
recycleIntervalSeconds = secs;
|
||||
}
|
||||
|
||||
return new DriverResilienceOptions
|
||||
{
|
||||
Tier = tier,
|
||||
CapabilityPolicies = merged,
|
||||
BulkheadMaxConcurrent = shape.BulkheadMaxConcurrent ?? baseOptions.BulkheadMaxConcurrent,
|
||||
BulkheadMaxQueue = shape.BulkheadMaxQueue ?? baseOptions.BulkheadMaxQueue,
|
||||
RecycleIntervalSeconds = recycleIntervalSeconds,
|
||||
};
|
||||
}
|
||||
|
||||
private sealed class ResilienceConfigShape
|
||||
{
|
||||
public int? BulkheadMaxConcurrent { get; set; }
|
||||
public int? BulkheadMaxQueue { get; set; }
|
||||
public int? RecycleIntervalSeconds { get; set; }
|
||||
public Dictionary<string, CapabilityPolicyShape>? CapabilityPolicies { get; set; }
|
||||
}
|
||||
|
||||
private sealed class CapabilityPolicyShape
|
||||
{
|
||||
public int? TimeoutSeconds { get; set; }
|
||||
public int? RetryCount { get; set; }
|
||||
public int? BreakerFailureThreshold { get; set; }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,157 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Polly;
|
||||
using Polly.CircuitBreaker;
|
||||
using Polly.Retry;
|
||||
using Polly.Timeout;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Builds and caches Polly resilience pipelines keyed on
|
||||
/// <c>(DriverInstanceId, HostName, DriverCapability)</c>. One dead PLC behind a multi-device
|
||||
/// driver cannot open the circuit breaker for healthy sibling hosts.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per <c>docs/v2/plan.md</c> decision #144 (per-device isolation). Composition from outside-in:
|
||||
/// <b>Timeout → Retry (when capability permits) → Circuit Breaker (when tier permits) → Bulkhead</b>.
|
||||
///
|
||||
/// <para>Pipeline resolution is lock-free on the hot path: the inner
|
||||
/// <see cref="ConcurrentDictionary{TKey,TValue}"/> caches a <see cref="ResiliencePipeline"/> per key;
|
||||
/// first-call cost is one <see cref="ResiliencePipelineBuilder"/>.Build. Thereafter reads are O(1).</para>
|
||||
/// </remarks>
|
||||
public sealed class DriverResiliencePipelineBuilder
|
||||
{
|
||||
private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly DriverResilienceStatusTracker? _statusTracker;
|
||||
|
||||
/// <summary>Construct with the ambient clock (use <see cref="TimeProvider.System"/> in prod).</summary>
|
||||
/// <param name="timeProvider">Clock source for pipeline timeouts + breaker sampling. Defaults to system.</param>
|
||||
/// <param name="statusTracker">When non-null, every built pipeline wires Polly telemetry into
|
||||
/// the tracker — retries increment <c>ConsecutiveFailures</c>, breaker-open stamps
|
||||
/// <c>LastBreakerOpenUtc</c>, breaker-close resets failures. Feeds Admin <c>/hosts</c> +
|
||||
/// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests +
|
||||
/// deployments that don't care about resilience observability).</param>
|
||||
public DriverResiliencePipelineBuilder(
|
||||
TimeProvider? timeProvider = null,
|
||||
DriverResilienceStatusTracker? statusTracker = null)
|
||||
{
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_statusTracker = statusTracker;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get or build the pipeline for a given <c>(driver instance, host, capability)</c> triple.
|
||||
/// Calls with the same key + same options reuse the same pipeline instance; the first caller
|
||||
/// wins if a race occurs (both pipelines would be behaviourally identical).
|
||||
/// </summary>
|
||||
/// <param name="driverInstanceId">DriverInstance primary key — opaque to this layer.</param>
|
||||
/// <param name="hostName">
|
||||
/// Host the call targets. For single-host drivers (Galaxy, some OPC UA Client configs) pass the
|
||||
/// driver's canonical host string. For multi-host drivers (Modbus with N PLCs), pass the
|
||||
/// specific PLC so one dead PLC doesn't poison healthy siblings.
|
||||
/// </param>
|
||||
/// <param name="capability">Which capability surface is being called.</param>
|
||||
/// <param name="options">Per-driver-instance options (tier + per-capability overrides).</param>
|
||||
public ResiliencePipeline GetOrCreate(
|
||||
string driverInstanceId,
|
||||
string hostName,
|
||||
DriverCapability capability,
|
||||
DriverResilienceOptions options)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(options);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
|
||||
|
||||
var key = new PipelineKey(driverInstanceId, hostName, capability);
|
||||
return _pipelines.GetOrAdd(key, static (k, state) => Build(
|
||||
k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker),
|
||||
(capability, options, timeProvider: _timeProvider, tracker: _statusTracker));
|
||||
}
|
||||
|
||||
/// <summary>Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.</summary>
|
||||
public int Invalidate(string driverInstanceId)
|
||||
{
|
||||
var removed = 0;
|
||||
foreach (var key in _pipelines.Keys)
|
||||
{
|
||||
if (key.DriverInstanceId == driverInstanceId && _pipelines.TryRemove(key, out _))
|
||||
removed++;
|
||||
}
|
||||
return removed;
|
||||
}
|
||||
|
||||
/// <summary>Snapshot of the current number of cached pipelines. For diagnostics only.</summary>
|
||||
public int CachedPipelineCount => _pipelines.Count;
|
||||
|
||||
private static ResiliencePipeline Build(
|
||||
string driverInstanceId,
|
||||
string hostName,
|
||||
DriverCapability capability,
|
||||
DriverResilienceOptions options,
|
||||
TimeProvider timeProvider,
|
||||
DriverResilienceStatusTracker? tracker)
|
||||
{
|
||||
var policy = options.Resolve(capability);
|
||||
var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };
|
||||
|
||||
builder.AddTimeout(new TimeoutStrategyOptions
|
||||
{
|
||||
Timeout = TimeSpan.FromSeconds(policy.TimeoutSeconds),
|
||||
});
|
||||
|
||||
if (policy.RetryCount > 0)
|
||||
{
|
||||
var retryOptions = new RetryStrategyOptions
|
||||
{
|
||||
MaxRetryAttempts = policy.RetryCount,
|
||||
BackoffType = DelayBackoffType.Exponential,
|
||||
UseJitter = true,
|
||||
Delay = TimeSpan.FromMilliseconds(100),
|
||||
MaxDelay = TimeSpan.FromSeconds(5),
|
||||
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
|
||||
};
|
||||
if (tracker is not null)
|
||||
{
|
||||
retryOptions.OnRetry = args =>
|
||||
{
|
||||
tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
|
||||
return default;
|
||||
};
|
||||
}
|
||||
builder.AddRetry(retryOptions);
|
||||
}
|
||||
|
||||
if (policy.BreakerFailureThreshold > 0)
|
||||
{
|
||||
var breakerOptions = new CircuitBreakerStrategyOptions
|
||||
{
|
||||
FailureRatio = 1.0,
|
||||
MinimumThroughput = policy.BreakerFailureThreshold,
|
||||
SamplingDuration = TimeSpan.FromSeconds(30),
|
||||
BreakDuration = TimeSpan.FromSeconds(15),
|
||||
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
|
||||
};
|
||||
if (tracker is not null)
|
||||
{
|
||||
breakerOptions.OnOpened = args =>
|
||||
{
|
||||
tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
|
||||
return default;
|
||||
};
|
||||
breakerOptions.OnClosed = args =>
|
||||
{
|
||||
// Closing the breaker means the target recovered — reset the consecutive-
|
||||
// failure counter so Admin UI stops flashing red for this host.
|
||||
tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
|
||||
return default;
|
||||
};
|
||||
}
|
||||
builder.AddCircuitBreaker(breakerOptions);
|
||||
}
|
||||
|
||||
return builder.Build();
|
||||
}
|
||||
|
||||
private readonly record struct PipelineKey(string DriverInstanceId, string HostName, DriverCapability Capability);
|
||||
}
|
||||
@@ -0,0 +1,135 @@
|
||||
using System.Collections.Concurrent;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Process-singleton tracker of live resilience counters per
|
||||
/// <c>(DriverInstanceId, HostName)</c>. Populated by the CapabilityInvoker and the
|
||||
/// MemoryTracking layer; consumed by a HostedService that periodically persists a
|
||||
/// snapshot to the <c>DriverInstanceResilienceStatus</c> table for Admin <c>/hosts</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per Phase 6.1 Stream E. No DB dependency here — the tracker is pure in-memory so
|
||||
/// tests can exercise it without EF Core or SQL Server. The HostedService that writes
|
||||
/// snapshots lives in the Server project (Stream E.2); the actual SignalR push + Blazor
|
||||
/// page refresh (E.3) lands in a follow-up visual-review PR.
|
||||
/// </remarks>
|
||||
public sealed class DriverResilienceStatusTracker
|
||||
{
|
||||
private readonly ConcurrentDictionary<StatusKey, ResilienceStatusSnapshot> _status = new();
|
||||
|
||||
/// <summary>Record a Polly pipeline failure for <paramref name="hostName"/>.</summary>
|
||||
public void RecordFailure(string driverInstanceId, string hostName, DateTime utcNow)
|
||||
{
|
||||
var key = new StatusKey(driverInstanceId, hostName);
|
||||
_status.AddOrUpdate(key,
|
||||
_ => new ResilienceStatusSnapshot { ConsecutiveFailures = 1, LastSampledUtc = utcNow },
|
||||
(_, existing) => existing with
|
||||
{
|
||||
ConsecutiveFailures = existing.ConsecutiveFailures + 1,
|
||||
LastSampledUtc = utcNow,
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>Reset the consecutive-failure count on a successful pipeline execution.</summary>
|
||||
public void RecordSuccess(string driverInstanceId, string hostName, DateTime utcNow)
|
||||
{
|
||||
var key = new StatusKey(driverInstanceId, hostName);
|
||||
_status.AddOrUpdate(key,
|
||||
_ => new ResilienceStatusSnapshot { ConsecutiveFailures = 0, LastSampledUtc = utcNow },
|
||||
(_, existing) => existing with
|
||||
{
|
||||
ConsecutiveFailures = 0,
|
||||
LastSampledUtc = utcNow,
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>Record a circuit-breaker open event.</summary>
|
||||
public void RecordBreakerOpen(string driverInstanceId, string hostName, DateTime utcNow)
|
||||
{
|
||||
var key = new StatusKey(driverInstanceId, hostName);
|
||||
_status.AddOrUpdate(key,
|
||||
_ => new ResilienceStatusSnapshot { LastBreakerOpenUtc = utcNow, LastSampledUtc = utcNow },
|
||||
(_, existing) => existing with { LastBreakerOpenUtc = utcNow, LastSampledUtc = utcNow });
|
||||
}
|
||||
|
||||
/// <summary>Record a process recycle event (Tier C only).</summary>
|
||||
public void RecordRecycle(string driverInstanceId, string hostName, DateTime utcNow)
|
||||
{
|
||||
var key = new StatusKey(driverInstanceId, hostName);
|
||||
_status.AddOrUpdate(key,
|
||||
_ => new ResilienceStatusSnapshot { LastRecycleUtc = utcNow, LastSampledUtc = utcNow },
|
||||
(_, existing) => existing with { LastRecycleUtc = utcNow, LastSampledUtc = utcNow });
|
||||
}
|
||||
|
||||
/// <summary>Capture / update the MemoryTracking-supplied baseline + current footprint.</summary>
|
||||
public void RecordFootprint(string driverInstanceId, string hostName, long baselineBytes, long currentBytes, DateTime utcNow)
|
||||
{
|
||||
var key = new StatusKey(driverInstanceId, hostName);
|
||||
_status.AddOrUpdate(key,
|
||||
_ => new ResilienceStatusSnapshot
|
||||
{
|
||||
BaselineFootprintBytes = baselineBytes,
|
||||
CurrentFootprintBytes = currentBytes,
|
||||
LastSampledUtc = utcNow,
|
||||
},
|
||||
(_, existing) => existing with
|
||||
{
|
||||
BaselineFootprintBytes = baselineBytes,
|
||||
CurrentFootprintBytes = currentBytes,
|
||||
LastSampledUtc = utcNow,
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Record the entry of a capability call for this (instance, host). Increments the
|
||||
/// in-flight counter used as the <see cref="ResilienceStatusSnapshot.CurrentInFlight"/>
|
||||
/// surface (a cheap stand-in for Polly bulkhead depth). Paired with
|
||||
/// <see cref="RecordCallComplete"/>; callers use try/finally.
|
||||
/// </summary>
|
||||
public void RecordCallStart(string driverInstanceId, string hostName)
|
||||
{
|
||||
var key = new StatusKey(driverInstanceId, hostName);
|
||||
_status.AddOrUpdate(key,
|
||||
_ => new ResilienceStatusSnapshot { CurrentInFlight = 1 },
|
||||
(_, existing) => existing with { CurrentInFlight = existing.CurrentInFlight + 1 });
|
||||
}
|
||||
|
||||
/// <summary>Paired with <see cref="RecordCallStart"/> — decrements the in-flight counter.</summary>
|
||||
public void RecordCallComplete(string driverInstanceId, string hostName)
|
||||
{
|
||||
var key = new StatusKey(driverInstanceId, hostName);
|
||||
_status.AddOrUpdate(key,
|
||||
_ => new ResilienceStatusSnapshot { CurrentInFlight = 0 }, // start-without-complete shouldn't happen; clamp to 0
|
||||
(_, existing) => existing with { CurrentInFlight = Math.Max(0, existing.CurrentInFlight - 1) });
|
||||
}
|
||||
|
||||
/// <summary>Snapshot of a specific (instance, host) pair; null if no counters recorded yet.</summary>
|
||||
public ResilienceStatusSnapshot? TryGet(string driverInstanceId, string hostName) =>
|
||||
_status.TryGetValue(new StatusKey(driverInstanceId, hostName), out var snapshot) ? snapshot : null;
|
||||
|
||||
/// <summary>Copy of every currently-tracked (instance, host, snapshot) triple. Safe under concurrent writes.</summary>
|
||||
public IReadOnlyList<(string DriverInstanceId, string HostName, ResilienceStatusSnapshot Snapshot)> Snapshot() =>
|
||||
_status.Select(kvp => (kvp.Key.DriverInstanceId, kvp.Key.HostName, kvp.Value)).ToList();
|
||||
|
||||
private readonly record struct StatusKey(string DriverInstanceId, string HostName);
|
||||
}
|
||||
|
||||
/// <summary>Snapshot of the resilience counters for one <c>(DriverInstanceId, HostName)</c> pair.</summary>
|
||||
public sealed record ResilienceStatusSnapshot
|
||||
{
|
||||
public int ConsecutiveFailures { get; init; }
|
||||
public DateTime? LastBreakerOpenUtc { get; init; }
|
||||
public DateTime? LastRecycleUtc { get; init; }
|
||||
public long BaselineFootprintBytes { get; init; }
|
||||
public long CurrentFootprintBytes { get; init; }
|
||||
public DateTime LastSampledUtc { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// In-flight capability calls against this (instance, host). Bumped on call entry +
|
||||
/// decremented on completion. Feeds <c>DriverInstanceResilienceStatus.CurrentBulkheadDepth</c>
|
||||
/// for Admin <c>/hosts</c> — a cheap proxy for the Polly bulkhead depth until the full
|
||||
/// telemetry observer lands.
|
||||
/// </summary>
|
||||
public int CurrentInFlight { get; init; }
|
||||
}
|
||||
Reference in New Issue
Block a user