chore: organize solution into module folders (Core/Server/Drivers/Client/Tooling)

Group all 69 projects into category subfolders under src/ and tests/ so the
Rider Solution Explorer mirrors the module structure. Folders: Core, Server,
Drivers (with a nested Driver CLIs subfolder), Client, Tooling.

- Move every project folder on disk with git mv (history preserved as renames).
- Recompute relative paths in 57 .csproj files: cross-category ProjectReferences,
  the lib/ HintPath+None refs in Driver.Historian.Wonderware, and the external
  mxaccessgw refs in Driver.Galaxy and its test project.
- Rebuild ZB.MOM.WW.OtOpcUa.slnx with nested solution folders.
- Re-prefix project paths in functional scripts (e2e, compliance, smoke SQL,
  integration, install).

Build green (0 errors); unit tests pass. Docs left for a separate pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-17 01:55:28 -04:00
parent 69f02fed7f
commit a25593a9c6
1044 changed files with 365 additions and 343 deletions

View File

@@ -0,0 +1,48 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// Tri-state result of an <see cref="IPermissionEvaluator.Authorize"/> call, per decision
/// #149. Phase 6.2 only produces <see cref="AuthorizationVerdict.Allow"/> and
/// <see cref="AuthorizationVerdict.NotGranted"/>; the <see cref="AuthorizationVerdict.Denied"/>
/// variant exists in the model so v2.1 Explicit Deny lands without an API break. Provenance
/// carries the matched grants (or empty when not granted) for audit + the Admin UI "Probe
/// this permission" diagnostic.
/// </summary>
public sealed record AuthorizationDecision(
AuthorizationVerdict Verdict,
IReadOnlyList<MatchedGrant> Provenance)
{
public bool IsAllowed => Verdict == AuthorizationVerdict.Allow;
/// <summary>Convenience constructor for the common "no grants matched" outcome.</summary>
public static AuthorizationDecision NotGranted() => new(AuthorizationVerdict.NotGranted, []);
/// <summary>Allow with the list of grants that matched.</summary>
public static AuthorizationDecision Allowed(IReadOnlyList<MatchedGrant> provenance)
=> new(AuthorizationVerdict.Allow, provenance);
}
/// <summary>Three-valued authorization outcome.</summary>
public enum AuthorizationVerdict
{
/// <summary>At least one grant matches the requested (operation, scope) pair.</summary>
Allow,
/// <summary>No grant matches. Phase 6.2 default — treated as deny at the OPC UA surface.</summary>
NotGranted,
/// <summary>Explicit deny grant matched. Reserved for v2.1; never produced by Phase 6.2.</summary>
Denied,
}
/// <summary>One grant that contributed to an Allow verdict — for audit / UI diagnostics.</summary>
/// <param name="LdapGroup">LDAP group the matched grant belongs to.</param>
/// <param name="Scope">Where in the hierarchy the grant was anchored.</param>
/// <param name="PermissionFlags">The bitmask the grant contributed.</param>
public sealed record MatchedGrant(
string LdapGroup,
NodeAclScopeKind Scope,
NodePermissions PermissionFlags);

View File

@@ -0,0 +1,23 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// Evaluates whether a session is authorized to perform an OPC UA <see cref="OpcUaOperation"/>
/// on the node addressed by a <see cref="NodeScope"/>. Phase 6.2 Stream B central surface.
/// </summary>
/// <remarks>
/// Data-plane only. Reads <c>NodeAcl</c> rows joined against the session's resolved LDAP
/// groups (via <see cref="UserAuthorizationState"/>). Must not depend on the control-plane
/// admin-role mapping table per decision #150 — the two concerns share zero runtime code.
/// </remarks>
public interface IPermissionEvaluator
{
/// <summary>
/// Authorize the requested operation for the session. Callers (<c>DriverNodeManager</c>
/// Read / Write / HistoryRead / Subscribe / Browse / Call dispatch) map their native
/// failure to <c>BadUserAccessDenied</c> per OPC UA Part 4 when the result is not
/// <see cref="AuthorizationVerdict.Allow"/>.
/// </summary>
AuthorizationDecision Authorize(UserAuthorizationState session, OpcUaOperation operation, NodeScope scope);
}

View File

@@ -0,0 +1,58 @@
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// Address of a node in the 6-level scope hierarchy the Phase 6.2 evaluator walks.
/// Assembled by the dispatch layer from the node's namespace + UNS path + tag; passed
/// to <see cref="IPermissionEvaluator"/> which walks the matching trie path.
/// </summary>
/// <remarks>
/// <para>Per decision #129 and the Phase 6.2 Stream B plan the hierarchy is
/// <c>Cluster → Namespace → UnsArea → UnsLine → Equipment → Tag</c> for UNS
/// (Equipment-kind) namespaces. Galaxy (SystemPlatform-kind) namespaces instead use
/// <c>Cluster → Namespace → FolderSegment(s) → Tag</c>, and each folder segment takes
/// one trie level — so a deeply-nested Galaxy folder implicitly reaches the same
/// depth as a full UNS path.</para>
///
/// <para>Unset mid-path levels (e.g. a Cluster-scoped request with no UnsArea) leave
/// the corresponding id <c>null</c>. The evaluator walks as far as the scope goes +
/// stops at the first null.</para>
/// </remarks>
public sealed record NodeScope
{
/// <summary>Cluster the node belongs to. Required.</summary>
public required string ClusterId { get; init; }
/// <summary>Namespace within the cluster. Null is not allowed for a request against a real node.</summary>
public string? NamespaceId { get; init; }
/// <summary>For Equipment-kind namespaces: UNS area (e.g. "warsaw-west"). Null on Galaxy.</summary>
public string? UnsAreaId { get; init; }
/// <summary>For Equipment-kind namespaces: UNS line below the area. Null on Galaxy.</summary>
public string? UnsLineId { get; init; }
/// <summary>For Equipment-kind namespaces: equipment row below the line. Null on Galaxy.</summary>
public string? EquipmentId { get; init; }
/// <summary>
/// For Galaxy (SystemPlatform-kind) namespaces only: the folder path segments from
/// namespace root to the target tag, in order. Empty on Equipment namespaces.
/// </summary>
public IReadOnlyList<string> FolderSegments { get; init; } = [];
/// <summary>Target tag id when the scope addresses a specific tag; null for folder / equipment-level scopes.</summary>
public string? TagId { get; init; }
/// <summary>Which hierarchy applies — Equipment-kind (UNS) or SystemPlatform-kind (Galaxy).</summary>
public required NodeHierarchyKind Kind { get; init; }
}
/// <summary>Selector between the two scope-hierarchy shapes.</summary>
public enum NodeHierarchyKind
{
/// <summary><c>Cluster → Namespace → UnsArea → UnsLine → Equipment → Tag</c> — UNS / Equipment kind.</summary>
Equipment,
/// <summary><c>Cluster → Namespace → FolderSegment(s) → Tag</c> — Galaxy / SystemPlatform kind.</summary>
SystemPlatform,
}

View File

@@ -0,0 +1,125 @@
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// In-memory permission trie for one <c>(ClusterId, GenerationId)</c>. Walk from the cluster
/// root down through namespace → UNS levels (or folder segments) → tag, OR-ing the
/// <see cref="TrieGrant.PermissionFlags"/> granted at each visited level for each of the session's
/// LDAP groups. The accumulated bitmask is compared to the permission required by the
/// requested <see cref="Abstractions.OpcUaOperation"/>.
/// </summary>
/// <remarks>
/// Per decision #129 (additive grants, no explicit Deny in v2.0) the walk is pure union:
/// encountering a grant at any level contributes its flags, never revokes them. A grant at
/// the Cluster root therefore cascades to every tag below it; a grant at a deep equipment
/// leaf is visible only on that equipment subtree.
/// </remarks>
public sealed class PermissionTrie
{
/// <summary>Cluster this trie belongs to.</summary>
public required string ClusterId { get; init; }
/// <summary>Config generation the trie was built from — used by the cache for invalidation.</summary>
public required long GenerationId { get; init; }
/// <summary>Root of the trie. Level 0 (cluster-level grants) live directly here.</summary>
public PermissionTrieNode Root { get; init; } = new();
/// <summary>
/// Walk the trie collecting grants that apply to <paramref name="scope"/> for any of the
/// session's <paramref name="ldapGroups"/>. Returns the matched-grant list; the caller
/// OR-s the flag bits to decide whether the requested permission is carried.
/// </summary>
public IReadOnlyList<MatchedGrant> CollectMatches(NodeScope scope, IEnumerable<string> ldapGroups)
{
ArgumentNullException.ThrowIfNull(scope);
ArgumentNullException.ThrowIfNull(ldapGroups);
var groups = ldapGroups.ToHashSet(StringComparer.OrdinalIgnoreCase);
if (groups.Count == 0) return [];
var matches = new List<MatchedGrant>();
// Level 0 — cluster-scoped grants.
CollectAtLevel(Root, NodeAclScopeKind.Cluster, groups, matches);
// Level 1 — namespace.
if (scope.NamespaceId is null) return matches;
if (!Root.Children.TryGetValue(scope.NamespaceId, out var ns)) return matches;
CollectAtLevel(ns, NodeAclScopeKind.Namespace, groups, matches);
// Two hierarchies diverge below the namespace.
if (scope.Kind == NodeHierarchyKind.Equipment)
WalkEquipment(ns, scope, groups, matches);
else
WalkSystemPlatform(ns, scope, groups, matches);
return matches;
}
private static void WalkEquipment(PermissionTrieNode ns, NodeScope scope, HashSet<string> groups, List<MatchedGrant> matches)
{
if (scope.UnsAreaId is null) return;
if (!ns.Children.TryGetValue(scope.UnsAreaId, out var area)) return;
CollectAtLevel(area, NodeAclScopeKind.UnsArea, groups, matches);
if (scope.UnsLineId is null) return;
if (!area.Children.TryGetValue(scope.UnsLineId, out var line)) return;
CollectAtLevel(line, NodeAclScopeKind.UnsLine, groups, matches);
if (scope.EquipmentId is null) return;
if (!line.Children.TryGetValue(scope.EquipmentId, out var eq)) return;
CollectAtLevel(eq, NodeAclScopeKind.Equipment, groups, matches);
if (scope.TagId is null) return;
if (!eq.Children.TryGetValue(scope.TagId, out var tag)) return;
CollectAtLevel(tag, NodeAclScopeKind.Tag, groups, matches);
}
private static void WalkSystemPlatform(PermissionTrieNode ns, NodeScope scope, HashSet<string> groups, List<MatchedGrant> matches)
{
// FolderSegments are nested under the namespace; each is its own trie level. Reuse the
// UnsArea scope kind for the flags — NodeAcl rows for Galaxy tags carry ScopeKind.Tag
// for leaf grants and ScopeKind.Namespace for folder-root grants; deeper folder grants
// are modeled as Equipment-level rows today since NodeAclScopeKind doesn't enumerate
// a dedicated FolderSegment kind. Future-proof TODO tracked in Stream B follow-up.
var current = ns;
foreach (var segment in scope.FolderSegments)
{
if (!current.Children.TryGetValue(segment, out var child)) return;
CollectAtLevel(child, NodeAclScopeKind.Equipment, groups, matches);
current = child;
}
if (scope.TagId is null) return;
if (!current.Children.TryGetValue(scope.TagId, out var tag)) return;
CollectAtLevel(tag, NodeAclScopeKind.Tag, groups, matches);
}
private static void CollectAtLevel(PermissionTrieNode node, NodeAclScopeKind level, HashSet<string> groups, List<MatchedGrant> matches)
{
foreach (var grant in node.Grants)
{
if (groups.Contains(grant.LdapGroup))
matches.Add(new MatchedGrant(grant.LdapGroup, level, grant.PermissionFlags));
}
}
}
/// <summary>One node in a <see cref="PermissionTrie"/>.</summary>
public sealed class PermissionTrieNode
{
/// <summary>Grants anchored at this trie level.</summary>
public List<TrieGrant> Grants { get; } = [];
/// <summary>
/// Children keyed by the next level's id — namespace id under cluster; UnsAreaId or
/// folder-segment name under namespace; etc. Comparer is OrdinalIgnoreCase so the walk
/// tolerates case drift between the NodeAcl row and the requested scope.
/// </summary>
public Dictionary<string, PermissionTrieNode> Children { get; } = new(StringComparer.OrdinalIgnoreCase);
}
/// <summary>Projection of a <see cref="Configuration.Entities.NodeAcl"/> row into the trie.</summary>
public sealed record TrieGrant(string LdapGroup, NodePermissions PermissionFlags);

View File

@@ -0,0 +1,97 @@
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// Builds a <see cref="PermissionTrie"/> from a set of <see cref="NodeAcl"/> rows anchored
/// in one generation. The trie is keyed on the rows' scope hierarchy — rows with
/// <see cref="NodeAclScopeKind.Cluster"/> land at the trie root, rows with
/// <see cref="NodeAclScopeKind.Tag"/> land at a leaf, etc.
/// </summary>
/// <remarks>
/// <para>Intended to be called by <see cref="PermissionTrieCache"/> once per published
/// generation; the resulting trie is immutable for the life of the cache entry. Idempotent —
/// two builds from the same rows produce equal tries (grant lists may be in insertion order;
/// evaluators don't depend on order).</para>
///
/// <para>The builder deliberately does not know about the node-row metadata the trie path
/// will be walked with. The caller assembles <see cref="NodeScope"/> values from the live
/// config (UnsArea parent of UnsLine, etc.); this class only honors the <c>ScopeId</c>
/// each row carries.</para>
/// </remarks>
public static class PermissionTrieBuilder
{
/// <summary>
/// Build a trie for one cluster/generation from the supplied rows. The caller is
/// responsible for pre-filtering rows to the target generation + cluster.
/// </summary>
public static PermissionTrie Build(
string clusterId,
long generationId,
IReadOnlyList<NodeAcl> rows,
IReadOnlyDictionary<string, NodeAclPath>? scopePaths = null)
{
ArgumentException.ThrowIfNullOrWhiteSpace(clusterId);
ArgumentNullException.ThrowIfNull(rows);
var trie = new PermissionTrie { ClusterId = clusterId, GenerationId = generationId };
foreach (var row in rows)
{
if (!string.Equals(row.ClusterId, clusterId, StringComparison.OrdinalIgnoreCase)) continue;
var grant = new TrieGrant(row.LdapGroup, row.PermissionFlags);
var node = row.ScopeKind switch
{
NodeAclScopeKind.Cluster => trie.Root,
_ => Descend(trie.Root, row, scopePaths),
};
if (node is not null)
node.Grants.Add(grant);
}
return trie;
}
private static PermissionTrieNode? Descend(PermissionTrieNode root, NodeAcl row, IReadOnlyDictionary<string, NodeAclPath>? scopePaths)
{
if (string.IsNullOrEmpty(row.ScopeId)) return null;
// For sub-cluster scopes the caller supplies a path lookup so we know the containing
// namespace / UnsArea / UnsLine ids. Without a path lookup we fall back to putting the
// row directly under the root using its ScopeId — works for deterministic tests, not
// for production where the hierarchy must be honored.
if (scopePaths is null || !scopePaths.TryGetValue(row.ScopeId, out var path))
{
return EnsureChild(root, row.ScopeId);
}
var node = root;
foreach (var segment in path.Segments)
node = EnsureChild(node, segment);
return node;
}
private static PermissionTrieNode EnsureChild(PermissionTrieNode parent, string key)
{
if (!parent.Children.TryGetValue(key, out var child))
{
child = new PermissionTrieNode();
parent.Children[key] = child;
}
return child;
}
}
/// <summary>
/// Ordered list of trie-path segments from root to the target node. Supplied to
/// <see cref="PermissionTrieBuilder.Build"/> so the builder knows where a
/// <see cref="NodeAclScopeKind.UnsLine"/>-scoped row sits in the hierarchy.
/// </summary>
/// <param name="Segments">
/// Namespace id, then (for Equipment kind) UnsAreaId / UnsLineId / EquipmentId / TagId as
/// applicable; or (for SystemPlatform kind) NamespaceId / FolderSegment / .../TagId.
/// </param>
public sealed record NodeAclPath(IReadOnlyList<string> Segments);

View File

@@ -0,0 +1,88 @@
using System.Collections.Concurrent;
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// Process-singleton cache of <see cref="PermissionTrie"/> instances keyed on
/// <c>(ClusterId, GenerationId)</c>. Hot-path evaluation reads
/// <see cref="GetTrie(string)"/> without awaiting DB access; the cache is populated
/// out-of-band on publish + on first reference via
/// <see cref="Install(PermissionTrie)"/>.
/// </summary>
/// <remarks>
/// Per decision #148 and Phase 6.2 Stream B.4 the cache is generation-sealed: once a
/// trie is installed for <c>(ClusterId, GenerationId)</c> the entry is immutable. When a
/// new generation publishes, the caller calls <see cref="Install"/> with the new trie
/// + the cache atomically updates its "current generation" pointer for that cluster.
/// Older generations are retained so an in-flight request evaluating the prior generation
/// still succeeds — GC via <see cref="Prune(string, int)"/>.
/// </remarks>
public sealed class PermissionTrieCache
{
private readonly ConcurrentDictionary<string, ClusterEntry> _byCluster =
new(StringComparer.OrdinalIgnoreCase);
/// <summary>Install a trie for a cluster + make it the current generation.</summary>
public void Install(PermissionTrie trie)
{
ArgumentNullException.ThrowIfNull(trie);
_byCluster.AddOrUpdate(trie.ClusterId,
_ => ClusterEntry.FromSingle(trie),
(_, existing) => existing.WithAdditional(trie));
}
/// <summary>Get the current-generation trie for a cluster; null when nothing installed.</summary>
public PermissionTrie? GetTrie(string clusterId)
{
ArgumentException.ThrowIfNullOrWhiteSpace(clusterId);
return _byCluster.TryGetValue(clusterId, out var entry) ? entry.Current : null;
}
/// <summary>Get a specific (cluster, generation) trie; null if that pair isn't cached.</summary>
public PermissionTrie? GetTrie(string clusterId, long generationId)
{
if (!_byCluster.TryGetValue(clusterId, out var entry)) return null;
return entry.Tries.TryGetValue(generationId, out var trie) ? trie : null;
}
/// <summary>The generation id the <see cref="GetTrie(string)"/> shortcut currently serves for a cluster.</summary>
public long? CurrentGenerationId(string clusterId)
=> _byCluster.TryGetValue(clusterId, out var entry) ? entry.Current.GenerationId : null;
/// <summary>Drop every cached trie for one cluster.</summary>
public void Invalidate(string clusterId) => _byCluster.TryRemove(clusterId, out _);
/// <summary>
/// Retain only the most-recent <paramref name="keepLatest"/> generations for a cluster.
/// No-op when there's nothing to drop.
/// </summary>
public void Prune(string clusterId, int keepLatest = 3)
{
if (keepLatest < 1) throw new ArgumentOutOfRangeException(nameof(keepLatest), keepLatest, "keepLatest must be >= 1");
if (!_byCluster.TryGetValue(clusterId, out var entry)) return;
if (entry.Tries.Count <= keepLatest) return;
var keep = entry.Tries
.OrderByDescending(kvp => kvp.Key)
.Take(keepLatest)
.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
_byCluster[clusterId] = new ClusterEntry(entry.Current, keep);
}
/// <summary>Diagnostics counter: number of cached (cluster, generation) tries.</summary>
public int CachedTrieCount => _byCluster.Values.Sum(e => e.Tries.Count);
private sealed record ClusterEntry(PermissionTrie Current, IReadOnlyDictionary<long, PermissionTrie> Tries)
{
public static ClusterEntry FromSingle(PermissionTrie trie) =>
new(trie, new Dictionary<long, PermissionTrie> { [trie.GenerationId] = trie });
public ClusterEntry WithAdditional(PermissionTrie trie)
{
var next = new Dictionary<long, PermissionTrie>(Tries) { [trie.GenerationId] = trie };
// The highest generation wins as "current" — handles out-of-order installs.
var current = trie.GenerationId >= Current.GenerationId ? trie : Current;
return new ClusterEntry(current, next);
}
}
}

View File

@@ -0,0 +1,70 @@
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// Default <see cref="IPermissionEvaluator"/> implementation. Resolves the
/// <see cref="PermissionTrie"/> for the session's cluster (via
/// <see cref="PermissionTrieCache"/>), walks it collecting matched grants, OR-s the
/// permission flags, and maps against the operation-specific required permission.
/// </summary>
public sealed class TriePermissionEvaluator : IPermissionEvaluator
{
private readonly PermissionTrieCache _cache;
private readonly TimeProvider _timeProvider;
public TriePermissionEvaluator(PermissionTrieCache cache, TimeProvider? timeProvider = null)
{
ArgumentNullException.ThrowIfNull(cache);
_cache = cache;
_timeProvider = timeProvider ?? TimeProvider.System;
}
public AuthorizationDecision Authorize(UserAuthorizationState session, OpcUaOperation operation, NodeScope scope)
{
ArgumentNullException.ThrowIfNull(session);
ArgumentNullException.ThrowIfNull(scope);
// Decision #152 — beyond the staleness ceiling every call fails closed regardless of
// cache warmth elsewhere in the process.
if (session.IsStale(_timeProvider.GetUtcNow().UtcDateTime))
return AuthorizationDecision.NotGranted();
if (!string.Equals(session.ClusterId, scope.ClusterId, StringComparison.OrdinalIgnoreCase))
return AuthorizationDecision.NotGranted();
var trie = _cache.GetTrie(scope.ClusterId);
if (trie is null) return AuthorizationDecision.NotGranted();
var matches = trie.CollectMatches(scope, session.LdapGroups);
if (matches.Count == 0) return AuthorizationDecision.NotGranted();
var required = MapOperationToPermission(operation);
var granted = NodePermissions.None;
foreach (var m in matches) granted |= m.PermissionFlags;
return (granted & required) == required
? AuthorizationDecision.Allowed(matches)
: AuthorizationDecision.NotGranted();
}
/// <summary>Maps each <see cref="OpcUaOperation"/> to the <see cref="NodePermissions"/> bit required to grant it.</summary>
public static NodePermissions MapOperationToPermission(OpcUaOperation op) => op switch
{
OpcUaOperation.Browse => NodePermissions.Browse,
OpcUaOperation.Read => NodePermissions.Read,
OpcUaOperation.WriteOperate => NodePermissions.WriteOperate,
OpcUaOperation.WriteTune => NodePermissions.WriteTune,
OpcUaOperation.WriteConfigure => NodePermissions.WriteConfigure,
OpcUaOperation.HistoryRead => NodePermissions.HistoryRead,
OpcUaOperation.HistoryUpdate => NodePermissions.HistoryRead, // HistoryUpdate bit not yet in NodePermissions; TODO Stream C follow-up
OpcUaOperation.CreateMonitoredItems => NodePermissions.Subscribe,
OpcUaOperation.TransferSubscriptions=> NodePermissions.Subscribe,
OpcUaOperation.Call => NodePermissions.MethodCall,
OpcUaOperation.AlarmAcknowledge => NodePermissions.AlarmAcknowledge,
OpcUaOperation.AlarmConfirm => NodePermissions.AlarmConfirm,
OpcUaOperation.AlarmShelve => NodePermissions.AlarmShelve,
_ => throw new ArgumentOutOfRangeException(nameof(op), op, $"No permission mapping defined for operation {op}."),
};
}

View File

@@ -0,0 +1,69 @@
namespace ZB.MOM.WW.OtOpcUa.Core.Authorization;
/// <summary>
/// Per-session authorization state cached on the OPC UA session object + keyed on the
/// session id. Captures the LDAP group memberships resolved at sign-in, the generation
/// the membership was resolved against, and the bounded freshness window.
/// </summary>
/// <remarks>
/// Per decision #151 the membership is bounded by <see cref="MembershipFreshnessInterval"/>
/// (default 15 min). After that, the next hot-path authz call re-resolves LDAP group
/// memberships; failure to re-resolve (LDAP unreachable) flips the session to fail-closed
/// until a refresh succeeds.
///
/// Per decision #152 <see cref="AuthCacheMaxStaleness"/> (default 5 min) is separate from
/// Phase 6.1's availability-oriented 24h cache — beyond this window the evaluator returns
/// <see cref="AuthorizationVerdict.NotGranted"/> regardless of config-cache warmth.
/// </remarks>
public sealed record UserAuthorizationState
{
/// <summary>Opaque session id (reuse OPC UA session handle when possible).</summary>
public required string SessionId { get; init; }
/// <summary>Cluster the session is scoped to — every request targets nodes in this cluster.</summary>
public required string ClusterId { get; init; }
/// <summary>
/// LDAP groups the user is a member of as resolved at sign-in / last membership refresh.
/// Case comparison is handled downstream by the evaluator (OrdinalIgnoreCase).
/// </summary>
public required IReadOnlyList<string> LdapGroups { get; init; }
/// <summary>Timestamp when <see cref="LdapGroups"/> was last resolved from the directory.</summary>
public required DateTime MembershipResolvedUtc { get; init; }
/// <summary>
/// Trie generation the session is currently bound to. When
/// <see cref="PermissionTrieCache"/> moves to a new generation, the session's
/// <c>(AuthGenerationId, MembershipVersion)</c> stamp no longer matches its
/// MonitoredItems and they re-evaluate on next publish (decision #153).
/// </summary>
public required long AuthGenerationId { get; init; }
/// <summary>
/// Monotonic counter incremented every time membership is re-resolved. Combined with
/// <see cref="AuthGenerationId"/> into the subscription stamp per decision #153.
/// </summary>
public required long MembershipVersion { get; init; }
/// <summary>Bounded membership freshness window; past this the next authz call refreshes.</summary>
public TimeSpan MembershipFreshnessInterval { get; init; } = TimeSpan.FromMinutes(15);
/// <summary>Hard staleness ceiling — beyond this, the evaluator fails closed.</summary>
public TimeSpan AuthCacheMaxStaleness { get; init; } = TimeSpan.FromMinutes(5);
/// <summary>
/// True when <paramref name="utcNow"/> - <see cref="MembershipResolvedUtc"/> exceeds
/// <see cref="AuthCacheMaxStaleness"/>. The evaluator short-circuits to NotGranted
/// whenever this is true.
/// </summary>
public bool IsStale(DateTime utcNow) => utcNow - MembershipResolvedUtc > AuthCacheMaxStaleness;
/// <summary>
/// True when membership is past its freshness interval but still within the staleness
/// ceiling — a signal to the caller to kick off an async refresh, while the current
/// call still evaluates against the cached memberships.
/// </summary>
public bool NeedsRefresh(DateTime utcNow) =>
!IsStale(utcNow) && utcNow - MembershipResolvedUtc > MembershipFreshnessInterval;
}

View File

@@ -0,0 +1,86 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Hosting;
/// <summary>
/// Process-singleton registry of <see cref="IDriver"/> factories keyed by
/// <c>DriverInstance.DriverType</c> string. Each driver project ships a DI
/// extension (e.g. <c>services.AddGalaxyProxyDriverFactory()</c>) that registers
/// its factory at startup; the bootstrapper looks up the factory by
/// <c>DriverInstance.DriverType</c> + invokes it with the row's
/// <c>DriverInstanceId</c> + <c>DriverConfig</c> JSON.
/// </summary>
/// <remarks>
/// Closes the gap surfaced by task #240 live smoke — DriverInstance rows in
/// the central config DB had no path to materialise as registered <see cref="IDriver"/>
/// instances. The factory registry is the seam.
/// </remarks>
public sealed class DriverFactoryRegistry
{
private readonly Dictionary<string, Func<string, string, IDriver>> _factories
= new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, DriverTier> _tiers
= new(StringComparer.OrdinalIgnoreCase);
private readonly object _lock = new();
/// <summary>
/// Register a factory for <paramref name="driverType"/>. Throws if a factory is
/// already registered for that type — drivers are singletons by type-name in
/// this process.
/// </summary>
/// <param name="driverType">Matches <c>DriverInstance.DriverType</c>.</param>
/// <param name="factory">
/// Receives <c>(driverInstanceId, driverConfigJson)</c>; returns a new
/// <see cref="IDriver"/>. Must NOT call <see cref="IDriver.InitializeAsync"/>
/// itself — the bootstrapper calls it via <see cref="DriverHost.RegisterAsync"/>
/// so the host's per-driver retry semantics apply uniformly.
/// </param>
/// <param name="tier">
/// Stability tier per <c>docs/v2/driver-stability.md</c>. Defaults to
/// <see cref="DriverTier.A"/> (in-process, lowest recycle footprint); Tier C
/// drivers SHOULD pass this explicitly so the scheduled-recycle wiring in
/// <c>DriverInstanceBootstrapper</c> can skip in-process drivers by tier check
/// rather than by driver-type allow-list.
/// </param>
public void Register(string driverType, Func<string, string, IDriver> factory,
DriverTier tier = DriverTier.A)
{
ArgumentException.ThrowIfNullOrWhiteSpace(driverType);
ArgumentNullException.ThrowIfNull(factory);
lock (_lock)
{
if (_factories.ContainsKey(driverType))
throw new InvalidOperationException(
$"DriverType '{driverType}' factory already registered for this process");
_factories[driverType] = factory;
_tiers[driverType] = tier;
}
}
/// <summary>
/// Try to look up the factory for <paramref name="driverType"/>. Returns null
/// if no driver assembly registered one — bootstrapper logs + skips so a
/// missing-assembly deployment doesn't take down the whole server.
/// </summary>
public Func<string, string, IDriver>? TryGet(string driverType)
{
ArgumentException.ThrowIfNullOrWhiteSpace(driverType);
lock (_lock) return _factories.GetValueOrDefault(driverType);
}
/// <summary>
/// Look up the tier recorded alongside the factory. Returns <see cref="DriverTier.A"/>
/// for unknown driver types — a missing registration is already a skipped-bootstrap
/// case upstream; we don't double-surface that failure here.
/// </summary>
public DriverTier GetTier(string driverType)
{
ArgumentException.ThrowIfNullOrWhiteSpace(driverType);
lock (_lock) return _tiers.GetValueOrDefault(driverType, DriverTier.A);
}
public IReadOnlyCollection<string> RegisteredTypes
{
get { lock (_lock) return [.. _factories.Keys]; }
}
}

View File

@@ -0,0 +1,91 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Hosting;
/// <summary>
/// Process-local registry + lifecycle manager for loaded <see cref="IDriver"/> instances
/// (decision #65). Phase 1 scaffold — per-process isolation for Tier C drivers (Galaxy, FOCAS)
/// is implemented in Phase 2 via named-pipe RPC; this class handles in-process drivers today
/// and exposes the same registration interface so the Tier C wrapper can slot in later.
/// </summary>
public sealed class DriverHost : IAsyncDisposable
{
private readonly Dictionary<string, IDriver> _drivers = new();
private readonly object _lock = new();
public IReadOnlyCollection<string> RegisteredDriverIds
{
get { lock (_lock) return [.. _drivers.Keys]; }
}
public DriverHealth? GetHealth(string driverInstanceId)
{
lock (_lock)
return _drivers.TryGetValue(driverInstanceId, out var d) ? d.GetHealth() : null;
}
/// <summary>
/// Look up a registered driver by instance id. Used by the OPC UA server runtime
/// (<c>OtOpcUaServer</c>) to instantiate one <c>DriverNodeManager</c> per driver at
/// startup. Returns null when the driver is not registered.
/// </summary>
public IDriver? GetDriver(string driverInstanceId)
{
lock (_lock)
return _drivers.TryGetValue(driverInstanceId, out var d) ? d : null;
}
/// <summary>
/// Registers the driver and calls <see cref="IDriver.InitializeAsync"/>. If initialization
/// throws, the driver is kept in the registry so the operator can retry; quality on its
/// nodes will reflect <see cref="DriverState.Faulted"/> until <c>Reinitialize</c> succeeds.
/// </summary>
public async Task RegisterAsync(IDriver driver, string driverConfigJson, CancellationToken ct)
{
ArgumentNullException.ThrowIfNull(driver);
var id = driver.DriverInstanceId;
lock (_lock)
{
if (_drivers.ContainsKey(id))
throw new InvalidOperationException($"Driver '{id}' is already registered.");
_drivers[id] = driver;
}
try { await driver.InitializeAsync(driverConfigJson, ct); }
catch
{
// Keep the driver registered — operator will see Faulted state and can reinitialize.
throw;
}
}
public async Task UnregisterAsync(string driverInstanceId, CancellationToken ct)
{
IDriver? driver;
lock (_lock)
{
if (!_drivers.TryGetValue(driverInstanceId, out driver)) return;
_drivers.Remove(driverInstanceId);
}
try { await driver.ShutdownAsync(ct); }
catch { /* shutdown is best-effort; logs elsewhere */ }
}
public async ValueTask DisposeAsync()
{
List<IDriver> snapshot;
lock (_lock)
{
snapshot = [.. _drivers.Values];
_drivers.Clear();
}
foreach (var driver in snapshot)
{
try { await driver.ShutdownAsync(CancellationToken.None); } catch { /* ignore */ }
(driver as IDisposable)?.Dispose();
}
}
}

View File

@@ -0,0 +1,86 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Observability;
/// <summary>
/// Domain-layer health aggregation for Phase 6.1 Stream C. Pure functions over the driver
/// fleet — given each driver's <see cref="DriverState"/>, produce a <see cref="ReadinessVerdict"/>
/// that maps to HTTP status codes at the endpoint layer.
/// </summary>
/// <remarks>
/// State matrix per <c>docs/v2/implementation/phase-6-1-resilience-and-observability.md</c>
/// §Stream C.1:
/// <list type="bullet">
/// <item><see cref="DriverState.Unknown"/> / <see cref="DriverState.Initializing"/>
/// → /readyz 503 (not yet ready).</item>
/// <item><see cref="DriverState.Healthy"/> → /readyz 200.</item>
/// <item><see cref="DriverState.Degraded"/> → /readyz 200 with flagged driver IDs.</item>
/// <item><see cref="DriverState.Faulted"/> → /readyz 503.</item>
/// </list>
/// The overall verdict is computed across the fleet: any Faulted → Faulted; any
/// Unknown/Initializing → NotReady; any Degraded → Degraded; else Healthy. An empty fleet
/// is Healthy (nothing to degrade).
/// </remarks>
public static class DriverHealthReport
{
/// <summary>Compute the fleet-wide readiness verdict from per-driver states.</summary>
public static ReadinessVerdict Aggregate(IReadOnlyList<DriverHealthSnapshot> drivers)
{
ArgumentNullException.ThrowIfNull(drivers);
if (drivers.Count == 0) return ReadinessVerdict.Healthy;
var anyFaulted = drivers.Any(d => d.State == DriverState.Faulted);
if (anyFaulted) return ReadinessVerdict.Faulted;
var anyInitializing = drivers.Any(d =>
d.State == DriverState.Unknown || d.State == DriverState.Initializing);
if (anyInitializing) return ReadinessVerdict.NotReady;
// Reconnecting = driver alive but not serving live data; report as Degraded so /readyz
// stays 200 (the fleet can still serve cached / last-good data) while operators see the
// affected driver in the body.
var anyDegraded = drivers.Any(d =>
d.State == DriverState.Degraded || d.State == DriverState.Reconnecting);
if (anyDegraded) return ReadinessVerdict.Degraded;
return ReadinessVerdict.Healthy;
}
/// <summary>
/// Map a <see cref="ReadinessVerdict"/> to the HTTP status the /readyz endpoint should
/// return per the Stream C.1 state matrix.
/// </summary>
public static int HttpStatus(ReadinessVerdict verdict) => verdict switch
{
ReadinessVerdict.Healthy => 200,
ReadinessVerdict.Degraded => 200,
ReadinessVerdict.NotReady => 503,
ReadinessVerdict.Faulted => 503,
_ => 500,
};
}
/// <summary>Per-driver snapshot fed into <see cref="DriverHealthReport.Aggregate"/>.</summary>
/// <param name="DriverInstanceId">Driver instance identifier (from <c>IDriver.DriverInstanceId</c>).</param>
/// <param name="State">Current <see cref="DriverState"/> from <c>IDriver.GetHealth</c>.</param>
/// <param name="DetailMessage">Optional driver-supplied detail (e.g. "primary PLC unreachable").</param>
public sealed record DriverHealthSnapshot(
string DriverInstanceId,
DriverState State,
string? DetailMessage = null);
/// <summary>Overall fleet readiness — derived from driver states by <see cref="DriverHealthReport.Aggregate"/>.</summary>
public enum ReadinessVerdict
{
/// <summary>All drivers Healthy (or fleet is empty).</summary>
Healthy,
/// <summary>At least one driver Degraded; none Faulted / NotReady.</summary>
Degraded,
/// <summary>At least one driver Unknown / Initializing; none Faulted.</summary>
NotReady,
/// <summary>At least one driver Faulted.</summary>
Faulted,
}

View File

@@ -0,0 +1,53 @@
using Serilog.Context;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Observability;
/// <summary>
/// Convenience wrapper around Serilog <see cref="LogContext"/> — attaches the set of
/// structured properties a capability call should carry (DriverInstanceId, DriverType,
/// CapabilityName, CorrelationId). Callers wrap their call-site body in a <c>using</c>
/// block; inner <c>Log.Information</c> / <c>Log.Warning</c> calls emit the context
/// automatically via the Serilog enricher chain.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/implementation/phase-6-1-resilience-and-observability.md</c> §Stream C.2.
/// The correlation ID should be the OPC UA <c>RequestHeader.RequestHandle</c> when in-flight;
/// otherwise a short random GUID. Callers supply whichever is available.
/// </remarks>
public static class LogContextEnricher
{
/// <summary>Attach the capability-call property set. Dispose the returned scope to pop.</summary>
public static IDisposable Push(string driverInstanceId, string driverType, DriverCapability capability, string correlationId)
{
ArgumentException.ThrowIfNullOrWhiteSpace(driverInstanceId);
ArgumentException.ThrowIfNullOrWhiteSpace(driverType);
ArgumentException.ThrowIfNullOrWhiteSpace(correlationId);
var a = LogContext.PushProperty("DriverInstanceId", driverInstanceId);
var b = LogContext.PushProperty("DriverType", driverType);
var c = LogContext.PushProperty("CapabilityName", capability.ToString());
var d = LogContext.PushProperty("CorrelationId", correlationId);
return new CompositeScope(a, b, c, d);
}
/// <summary>
/// Generate a short correlation ID when no OPC UA RequestHandle is available.
/// 12-hex-char slice of a GUID — long enough for log correlation, short enough to
/// scan visually.
/// </summary>
public static string NewCorrelationId() => Guid.NewGuid().ToString("N")[..12];
private sealed class CompositeScope : IDisposable
{
private readonly IDisposable[] _inner;
public CompositeScope(params IDisposable[] inner) => _inner = inner;
public void Dispose()
{
// Reverse-order disposal matches Serilog's stack semantics.
for (var i = _inner.Length - 1; i >= 0; i--)
_inner[i].Dispose();
}
}
}

View File

@@ -0,0 +1,275 @@
using System.Text.Json;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.OpcUa;
/// <summary>
/// Materializes the canonical Unified Namespace browse tree for an Equipment-kind
/// <see cref="Configuration.Entities.Namespace"/> from the Config DB's
/// <c>UnsArea</c> / <c>UnsLine</c> / <c>Equipment</c> / <c>Tag</c> rows. Runs during
/// address-space build per <see cref="IDriver"/> whose
/// <c>Namespace.Kind = Equipment</c>; SystemPlatform-kind namespaces (Galaxy) are
/// exempt per decision #120 and reach this walker only indirectly through
/// <see cref="ITagDiscovery.DiscoverAsync"/>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Composition strategy.</b> ADR-001 (2026-04-20) accepted Option A — Config
/// primary. The walker treats the supplied <see cref="EquipmentNamespaceContent"/>
/// snapshot as the authoritative published surface. Every Equipment row becomes a
/// folder node at the UNS level-5 segment; every <see cref="Tag"/> bound to an
/// Equipment (non-null <see cref="Tag.EquipmentId"/>) becomes a variable node under
/// it. Driver-discovered tags that have no Config-DB row are not added by this
/// walker — the ITagDiscovery path continues to exist for the SystemPlatform case +
/// for enrichment, but Equipment-kind composition is fully Tag-row-driven.
/// </para>
///
/// <para>
/// <b>Under each Equipment node.</b> Five identifier properties per decision #121
/// (<c>EquipmentId</c>, <c>EquipmentUuid</c>, <c>MachineCode</c>, <c>ZTag</c>,
/// <c>SAPID</c>) are added as OPC UA properties — external systems (ERP, SAP PM)
/// resolve equipment by whichever identifier they natively use without a sidecar.
/// <see cref="IdentificationFolderBuilder.Build"/> materializes the OPC 40010
/// Identification sub-folder with the nine decision-#139 fields when at least one
/// is non-null; when all nine are null the sub-folder is omitted rather than
/// appearing empty.
/// </para>
///
/// <para>
/// <b>Address resolution.</b> Variable nodes carry the driver-side full reference
/// in <see cref="DriverAttributeInfo.FullName"/> copied from <c>Tag.TagConfig</c>
/// (the wire-level address JSON blob whose interpretation is driver-specific). At
/// runtime the dispatch layer routes Read/Write calls through the configured
/// capability invoker; an unreachable address surfaces as an OPC UA Bad status via
/// the natural driver-read failure path, NOT as a build-time reject. The ADR calls
/// this "BadNotFound placeholder" behavior — legible to operators via their Admin
/// UI + OPC UA client inspection of node status.
/// </para>
///
/// <para>
/// <b>Pure function.</b> This class has no dependency on the OPC UA SDK, no
/// Config-DB access, no state. It consumes pre-loaded EF Core rows + streams calls
/// into the supplied <see cref="IAddressSpaceBuilder"/>. The server-side wiring
/// (load snapshot → invoke walker → per-tag capability probe) lives in the Task B
/// PR alongside <c>NodeScopeResolver</c>'s Config-DB join.
/// </para>
/// </remarks>
public static class EquipmentNodeWalker
{
/// <summary>
/// Walk <paramref name="content"/> into <paramref name="namespaceBuilder"/>.
/// The builder is scoped to the Equipment-kind namespace root; the walker emits
/// Area → Line → Equipment folders under it, then identifier properties + the
/// Identification sub-folder + variable nodes per bound Tag under each Equipment.
/// </summary>
/// <param name="namespaceBuilder">
/// The builder scoped to the Equipment-kind namespace root. Caller is responsible for
/// creating this (e.g. <c>rootBuilder.Folder(namespace.NamespaceId, namespace.NamespaceUri)</c>).
/// </param>
/// <param name="content">Pre-loaded + pre-filtered rows for a single published generation.</param>
public static void Walk(IAddressSpaceBuilder namespaceBuilder, EquipmentNamespaceContent content)
{
ArgumentNullException.ThrowIfNull(namespaceBuilder);
ArgumentNullException.ThrowIfNull(content);
// Group lines by area + equipment by line + tags by equipment up-front. Avoids an
// O(N·M) re-scan at each UNS level on large fleets.
var linesByArea = content.Lines
.GroupBy(l => l.UnsAreaId, StringComparer.OrdinalIgnoreCase)
.ToDictionary(g => g.Key, g => g.OrderBy(l => l.Name, StringComparer.Ordinal).ToList(), StringComparer.OrdinalIgnoreCase);
var equipmentByLine = content.Equipment
.GroupBy(e => e.UnsLineId, StringComparer.OrdinalIgnoreCase)
.ToDictionary(g => g.Key, g => g.OrderBy(e => e.Name, StringComparer.Ordinal).ToList(), StringComparer.OrdinalIgnoreCase);
var tagsByEquipment = content.Tags
.Where(t => !string.IsNullOrEmpty(t.EquipmentId))
.GroupBy(t => t.EquipmentId!, StringComparer.OrdinalIgnoreCase)
.ToDictionary(g => g.Key, g => g.OrderBy(t => t.Name, StringComparer.Ordinal).ToList(), StringComparer.OrdinalIgnoreCase);
var virtualTagsByEquipment = (content.VirtualTags ?? [])
.Where(v => v.Enabled)
.GroupBy(v => v.EquipmentId, StringComparer.OrdinalIgnoreCase)
.ToDictionary(g => g.Key, g => g.OrderBy(v => v.Name, StringComparer.Ordinal).ToList(), StringComparer.OrdinalIgnoreCase);
var scriptedAlarmsByEquipment = (content.ScriptedAlarms ?? [])
.Where(a => a.Enabled)
.GroupBy(a => a.EquipmentId, StringComparer.OrdinalIgnoreCase)
.ToDictionary(g => g.Key, g => g.OrderBy(a => a.Name, StringComparer.Ordinal).ToList(), StringComparer.OrdinalIgnoreCase);
foreach (var area in content.Areas.OrderBy(a => a.Name, StringComparer.Ordinal))
{
var areaBuilder = namespaceBuilder.Folder(area.Name, area.Name);
if (!linesByArea.TryGetValue(area.UnsAreaId, out var areaLines)) continue;
foreach (var line in areaLines)
{
var lineBuilder = areaBuilder.Folder(line.Name, line.Name);
if (!equipmentByLine.TryGetValue(line.UnsLineId, out var lineEquipment)) continue;
foreach (var equipment in lineEquipment)
{
var equipmentBuilder = lineBuilder.Folder(equipment.Name, equipment.Name);
AddIdentifierProperties(equipmentBuilder, equipment);
IdentificationFolderBuilder.Build(equipmentBuilder, equipment);
if (tagsByEquipment.TryGetValue(equipment.EquipmentId, out var equipmentTags))
foreach (var tag in equipmentTags)
AddTagVariable(equipmentBuilder, tag);
if (virtualTagsByEquipment.TryGetValue(equipment.EquipmentId, out var vTags))
foreach (var vtag in vTags)
AddVirtualTagVariable(equipmentBuilder, vtag);
if (scriptedAlarmsByEquipment.TryGetValue(equipment.EquipmentId, out var alarms))
foreach (var alarm in alarms)
AddScriptedAlarmVariable(equipmentBuilder, alarm);
}
}
}
}
/// <summary>
/// Adds the five operator-facing identifiers from decision #121 as OPC UA properties
/// on the Equipment node. EquipmentId + EquipmentUuid are always populated;
/// MachineCode is required per <see cref="Equipment"/>; ZTag + SAPID are nullable in
/// the data model so they're skipped when null to avoid empty-string noise in the
/// browse tree.
/// </summary>
private static void AddIdentifierProperties(IAddressSpaceBuilder equipmentBuilder, Equipment equipment)
{
equipmentBuilder.AddProperty("EquipmentId", DriverDataType.String, equipment.EquipmentId);
equipmentBuilder.AddProperty("EquipmentUuid", DriverDataType.String, equipment.EquipmentUuid.ToString());
equipmentBuilder.AddProperty("MachineCode", DriverDataType.String, equipment.MachineCode);
if (!string.IsNullOrEmpty(equipment.ZTag))
equipmentBuilder.AddProperty("ZTag", DriverDataType.String, equipment.ZTag);
if (!string.IsNullOrEmpty(equipment.SAPID))
equipmentBuilder.AddProperty("SAPID", DriverDataType.String, equipment.SAPID);
}
/// <summary>
/// Emit a single Tag row as an <see cref="IAddressSpaceBuilder.Variable"/>. The driver
/// full reference lives in <c>Tag.TagConfig</c> (wire-level address, driver-specific
/// JSON blob); the variable node's data type derives from <c>Tag.DataType</c>.
/// Unreachable-address behavior per ADR-001 Option A: the variable is created; the
/// driver's natural Read failure surfaces an OPC UA Bad status at runtime.
/// </summary>
private static void AddTagVariable(IAddressSpaceBuilder equipmentBuilder, Tag tag)
{
var attr = new DriverAttributeInfo(
FullName: ExtractFullName(tag.TagConfig),
DriverDataType: ParseDriverDataType(tag.DataType),
IsArray: false,
ArrayDim: null,
SecurityClass: SecurityClassification.FreeAccess,
IsHistorized: false);
equipmentBuilder.Variable(tag.Name, tag.Name, attr);
}
/// <summary>
/// Cross-driver TagConfig convention — the Config DB's <c>CK_Tag_TagConfig_IsJson</c>
/// check constraint requires TagConfig to be a JSON object, and every shipped driver
/// (Galaxy / Modbus / AB CIP / S7 / FOCAS / TwinCAT / ABLegacy) stores the wire-level
/// address in a top-level <c>FullName</c> field. Extracting it here keeps the walker
/// driver-agnostic while giving the driver the plain address string its backend
/// expects at read-time — the raw JSON would otherwise be passed verbatim to
/// <c>IReadable.ReadAsync</c> and the driver would fail to resolve the tag.
/// </summary>
/// <remarks>
/// Falls back to the raw <paramref name="tagConfig"/> if it doesn't parse as JSON or
/// the <c>FullName</c> field is absent. This preserves the pre-refactor behaviour for
/// any legacy row that slipped past the check constraint or any future driver that
/// wants an opaque non-JSON reference.
/// </remarks>
internal static string ExtractFullName(string tagConfig)
{
if (string.IsNullOrWhiteSpace(tagConfig)) return tagConfig;
try
{
using var doc = JsonDocument.Parse(tagConfig);
if (doc.RootElement.ValueKind == JsonValueKind.Object
&& doc.RootElement.TryGetProperty("FullName", out var fullName)
&& fullName.ValueKind == JsonValueKind.String)
{
return fullName.GetString() ?? tagConfig;
}
}
catch (JsonException) { /* fall through */ }
return tagConfig;
}
/// <summary>
/// Parse <see cref="Tag.DataType"/> (stored as the <see cref="DriverDataType"/> enum
/// name string, decision #138) into the enum value. Unknown names fall back to
/// <see cref="DriverDataType.String"/> so a one-off driver-specific type doesn't
/// abort the whole walk; the underlying driver still sees the original TagConfig
/// address + can surface its own typed value via the OPC UA variant at read time.
/// </summary>
private static DriverDataType ParseDriverDataType(string raw) =>
Enum.TryParse<DriverDataType>(raw, ignoreCase: true, out var parsed) ? parsed : DriverDataType.String;
/// <summary>
/// Emit a <see cref="VirtualTag"/> row as a <see cref="NodeSourceKind.Virtual"/>
/// variable node. <c>FullName</c> doubles as the UNS path Phase 7's VirtualTagEngine
/// addresses its engine-side entries by. The <c>VirtualTagId</c> discriminator lets
/// the DriverNodeManager dispatch Reads/Subscribes to the engine rather than any
/// driver.
/// </summary>
private static void AddVirtualTagVariable(IAddressSpaceBuilder equipmentBuilder, VirtualTag vtag)
{
var attr = new DriverAttributeInfo(
FullName: vtag.VirtualTagId,
DriverDataType: ParseDriverDataType(vtag.DataType),
IsArray: false,
ArrayDim: null,
SecurityClass: SecurityClassification.FreeAccess,
IsHistorized: vtag.Historize,
IsAlarm: false,
WriteIdempotent: false,
Source: NodeSourceKind.Virtual,
VirtualTagId: vtag.VirtualTagId,
ScriptedAlarmId: null);
equipmentBuilder.Variable(vtag.Name, vtag.Name, attr);
}
/// <summary>
/// Emit a <see cref="ScriptedAlarm"/> row as a <see cref="NodeSourceKind.ScriptedAlarm"/>
/// variable node. The OPC UA Part 9 alarm-condition materialization happens at the
/// node-manager level (which wires the concrete <c>AlarmConditionState</c> subclass
/// per <see cref="ScriptedAlarm.AlarmType"/>); this walker provides the browse-level
/// anchor + the <see cref="DriverAttributeInfo.IsAlarm"/> flag that triggers that
/// materialization path.
/// </summary>
private static void AddScriptedAlarmVariable(IAddressSpaceBuilder equipmentBuilder, ScriptedAlarm alarm)
{
var attr = new DriverAttributeInfo(
FullName: alarm.ScriptedAlarmId,
DriverDataType: DriverDataType.Boolean,
IsArray: false,
ArrayDim: null,
SecurityClass: SecurityClassification.FreeAccess,
IsHistorized: false,
IsAlarm: true,
WriteIdempotent: false,
Source: NodeSourceKind.ScriptedAlarm,
VirtualTagId: null,
ScriptedAlarmId: alarm.ScriptedAlarmId);
equipmentBuilder.Variable(alarm.Name, alarm.Name, attr);
}
}
/// <summary>
/// Pre-loaded + pre-filtered snapshot of one Equipment-kind namespace's worth of Config
/// DB rows. All four collections are scoped to the same
/// <see cref="Configuration.Entities.ConfigGeneration"/> + the same
/// <see cref="Configuration.Entities.Namespace"/> row. The walker assumes this filter
/// was applied by the caller + does no cross-generation or cross-namespace validation.
/// </summary>
public sealed record EquipmentNamespaceContent(
IReadOnlyList<UnsArea> Areas,
IReadOnlyList<UnsLine> Lines,
IReadOnlyList<Equipment> Equipment,
IReadOnlyList<Tag> Tags,
IReadOnlyList<VirtualTag>? VirtualTags = null,
IReadOnlyList<ScriptedAlarm>? ScriptedAlarms = null);

View File

@@ -0,0 +1,118 @@
using System.Collections.Concurrent;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.OpcUa;
/// <summary>
/// Generic, driver-agnostic backbone for populating the OPC UA address space from an
/// <see cref="IDriver"/>. Walks the driver's discovery, wires the alarm + data-change +
/// rediscovery subscription events, and hands each variable to the supplied
/// <see cref="IAddressSpaceBuilder"/>. Concrete OPC UA server implementations provide the
/// builder — see the Server project's <c>OpcUaAddressSpaceBuilder</c> for the materialization
/// against <c>CustomNodeManager2</c>.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/plan.md</c> decision #52 + #62 — Core owns the node tree, drivers stream
/// <c>Folder</c>/<c>Variable</c> calls, alarm-bearing variables are annotated via
/// <see cref="IVariableHandle.MarkAsAlarmCondition"/> and subsequent
/// <see cref="IAlarmSource.OnAlarmEvent"/> payloads route to the sink the builder returned.
/// </remarks>
public class GenericDriverNodeManager(IDriver driver) : IDisposable
{
protected IDriver Driver { get; } = driver ?? throw new ArgumentNullException(nameof(driver));
public string DriverInstanceId => Driver.DriverInstanceId;
// Source tag (DriverAttributeInfo.FullName) → alarm-condition sink. Populated during
// BuildAddressSpaceAsync by a recording IAddressSpaceBuilder implementation that captures the
// IVariableHandle per attr.IsAlarm=true variable and calls MarkAsAlarmCondition.
private readonly ConcurrentDictionary<string, IAlarmConditionSink> _alarmSinks =
new(StringComparer.OrdinalIgnoreCase);
private EventHandler<AlarmEventArgs>? _alarmForwarder;
private bool _disposed;
/// <summary>
/// Populates the address space by streaming nodes from the driver into the supplied builder,
/// wraps the builder so alarm-condition sinks are captured, subscribes to the driver's
/// alarm event stream, and routes each transition to the matching sink by <c>SourceNodeId</c>.
/// Driver exceptions are isolated per decision #12 — the driver's subtree is marked Faulted,
/// but other drivers remain available.
/// </summary>
public async Task BuildAddressSpaceAsync(IAddressSpaceBuilder builder, CancellationToken ct)
{
ArgumentNullException.ThrowIfNull(builder);
if (Driver is not ITagDiscovery discovery)
throw new NotSupportedException($"Driver '{Driver.DriverInstanceId}' does not implement ITagDiscovery.");
var capturing = new CapturingBuilder(builder, _alarmSinks);
await discovery.DiscoverAsync(capturing, ct);
if (Driver is IAlarmSource alarmSource)
{
_alarmForwarder = (_, e) =>
{
// Route the alarm to the sink registered for the originating variable, if any.
// Unknown source ids are dropped silently — they may belong to another driver or
// to a variable the builder chose not to flag as an alarm condition.
if (_alarmSinks.TryGetValue(e.SourceNodeId, out var sink))
sink.OnTransition(e);
};
alarmSource.OnAlarmEvent += _alarmForwarder;
}
}
public void Dispose()
{
if (_disposed) return;
_disposed = true;
if (_alarmForwarder is not null && Driver is IAlarmSource alarmSource)
{
alarmSource.OnAlarmEvent -= _alarmForwarder;
}
_alarmSinks.Clear();
}
/// <summary>
/// Snapshot the current alarm-sink registry by source node id. Diagnostic + test hook;
/// not part of the hot path.
/// </summary>
internal IReadOnlyCollection<string> TrackedAlarmSources => _alarmSinks.Keys.ToList();
/// <summary>
/// Wraps the caller-supplied <see cref="IAddressSpaceBuilder"/> so every
/// <see cref="IVariableHandle.MarkAsAlarmCondition"/> call registers the returned sink in
/// the node manager's source-node-id map. The builder itself drives materialization;
/// this wrapper only observes.
/// </summary>
private sealed class CapturingBuilder(
IAddressSpaceBuilder inner,
ConcurrentDictionary<string, IAlarmConditionSink> sinks) : IAddressSpaceBuilder
{
public IAddressSpaceBuilder Folder(string browseName, string displayName)
=> new CapturingBuilder(inner.Folder(browseName, displayName), sinks);
public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
=> new CapturingHandle(inner.Variable(browseName, displayName, attributeInfo), sinks);
public void AddProperty(string browseName, DriverDataType dataType, object? value)
=> inner.AddProperty(browseName, dataType, value);
}
private sealed class CapturingHandle(
IVariableHandle inner,
ConcurrentDictionary<string, IAlarmConditionSink> sinks) : IVariableHandle
{
public string FullReference => inner.FullReference;
public IAlarmConditionSink MarkAsAlarmCondition(AlarmConditionInfo info)
{
var sink = inner.MarkAsAlarmCondition(info);
// Register by the driver-side full reference so the alarm forwarder can look it up
// using AlarmEventArgs.SourceNodeId (which the driver populates with the same tag).
sinks[inner.FullReference] = sink;
return sink;
}
}
}

View File

@@ -0,0 +1,91 @@
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.OpcUa;
/// <summary>
/// Phase 6.4 Stream D: materializes the OPC 40010 Machinery companion-spec Identification
/// sub-folder under an Equipment node. Reads the nine decision-#139 columns off the
/// <see cref="Equipment"/> row and emits one property per non-null field.
/// </summary>
/// <remarks>
/// <para>Pure-function shape — testable without a real OPC UA node manager. The caller
/// passes the builder scoped to the Equipment node; this class handles the Identification
/// sub-folder creation + per-field <see cref="IAddressSpaceBuilder.AddProperty"/> calls.</para>
///
/// <para>ACL binding: the sub-folder + its properties inherit the Equipment scope's
/// grants (no new scope level). Phase 6.2's trie treats them as part of the Equipment
/// ScopeId — a user with Equipment-level grant reads Identification; a user without the
/// grant gets BadUserAccessDenied on both the Equipment node + its Identification variables.
/// See <c>docs/v2/acl-design.md</c> §Identification cross-reference.</para>
///
/// <para>The nine fields per decision #139 are exposed exactly when they carry a non-null
/// value. A row with all nine null produces no Identification sub-folder at all — the
/// caller can use <see cref="HasAnyFields(Equipment)"/> to skip the Folder call entirely
/// and avoid a pointless empty folder appearing in browse trees.</para>
/// </remarks>
public static class IdentificationFolderBuilder
{
/// <summary>Browse + display name of the sub-folder — fixed per OPC 40010 convention.</summary>
public const string FolderName = "Identification";
/// <summary>
/// Canonical decision #139 field set exposed in the Identification sub-folder. Order
/// matches the decision-log entry so any browse-order reader can cross-reference
/// without re-sorting.
/// </summary>
public static IReadOnlyList<string> FieldNames { get; } = new[]
{
"Manufacturer", "Model", "SerialNumber",
"HardwareRevision", "SoftwareRevision",
"YearOfConstruction", "AssetLocation",
"ManufacturerUri", "DeviceManualUri",
};
/// <summary>True when the equipment row has at least one non-null Identification field.</summary>
public static bool HasAnyFields(Equipment equipment)
{
ArgumentNullException.ThrowIfNull(equipment);
return equipment.Manufacturer is not null
|| equipment.Model is not null
|| equipment.SerialNumber is not null
|| equipment.HardwareRevision is not null
|| equipment.SoftwareRevision is not null
|| equipment.YearOfConstruction is not null
|| equipment.AssetLocation is not null
|| equipment.ManufacturerUri is not null
|| equipment.DeviceManualUri is not null;
}
/// <summary>
/// Build the Identification sub-folder under <paramref name="equipmentBuilder"/>. No-op
/// when every field is null. Returns the sub-folder builder (or null when no-op) so
/// callers can attach additional nodes underneath if needed.
/// </summary>
public static IAddressSpaceBuilder? Build(IAddressSpaceBuilder equipmentBuilder, Equipment equipment)
{
ArgumentNullException.ThrowIfNull(equipmentBuilder);
ArgumentNullException.ThrowIfNull(equipment);
if (!HasAnyFields(equipment)) return null;
var folder = equipmentBuilder.Folder(FolderName, FolderName);
AddIfPresent(folder, "Manufacturer", DriverDataType.String, equipment.Manufacturer);
AddIfPresent(folder, "Model", DriverDataType.String, equipment.Model);
AddIfPresent(folder, "SerialNumber", DriverDataType.String, equipment.SerialNumber);
AddIfPresent(folder, "HardwareRevision", DriverDataType.String, equipment.HardwareRevision);
AddIfPresent(folder, "SoftwareRevision", DriverDataType.String, equipment.SoftwareRevision);
AddIfPresent(folder, "YearOfConstruction", DriverDataType.Int32,
equipment.YearOfConstruction is null ? null : (object)(int)equipment.YearOfConstruction.Value);
AddIfPresent(folder, "AssetLocation", DriverDataType.String, equipment.AssetLocation);
AddIfPresent(folder, "ManufacturerUri", DriverDataType.String, equipment.ManufacturerUri);
AddIfPresent(folder, "DeviceManualUri", DriverDataType.String, equipment.DeviceManualUri);
return folder;
}
private static void AddIfPresent(IAddressSpaceBuilder folder, string name, DriverDataType dataType, object? value)
{
if (value is null) return;
folder.AddProperty(name, dataType, value);
}
}

View File

@@ -0,0 +1,129 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
/// <summary>
/// Wraps the three mutating surfaces of <see cref="IAlarmSource"/>
/// (<see cref="IAlarmSource.SubscribeAlarmsAsync"/>, <see cref="IAlarmSource.UnsubscribeAlarmsAsync"/>,
/// <see cref="IAlarmSource.AcknowledgeAsync"/>) through <see cref="CapabilityInvoker"/> so the
/// Phase 6.1 resilience pipeline runs — retry semantics match
/// <see cref="DriverCapability.AlarmSubscribe"/> (retries by default) and
/// <see cref="DriverCapability.AlarmAcknowledge"/> (does NOT retry per decision #143).
/// </summary>
/// <remarks>
/// <para>Multi-host dispatch: when the driver implements <see cref="IPerCallHostResolver"/>,
/// each source-node-id is resolved individually + grouped by host so a dead PLC inside a
/// multi-device driver doesn't poison the sibling hosts' breakers. Drivers with a single
/// host fall back to <see cref="IDriver.DriverInstanceId"/> as the single-host key.</para>
///
/// <para>Why this lives here + not on <see cref="CapabilityInvoker"/>: alarm surfaces have a
/// handle-returning shape (SubscribeAlarmsAsync returns <see cref="IAlarmSubscriptionHandle"/>)
/// + a per-call fan-out (AcknowledgeAsync gets a batch of
/// <see cref="AlarmAcknowledgeRequest"/>s that may span multiple hosts). Keeping the fan-out
/// logic here keeps the invoker's execute-overloads narrow.</para>
/// </remarks>
public sealed class AlarmSurfaceInvoker
{
private readonly CapabilityInvoker _invoker;
private readonly IAlarmSource _alarmSource;
private readonly IPerCallHostResolver? _hostResolver;
private readonly string _defaultHost;
public AlarmSurfaceInvoker(
CapabilityInvoker invoker,
IAlarmSource alarmSource,
string defaultHost,
IPerCallHostResolver? hostResolver = null)
{
ArgumentNullException.ThrowIfNull(invoker);
ArgumentNullException.ThrowIfNull(alarmSource);
ArgumentException.ThrowIfNullOrWhiteSpace(defaultHost);
_invoker = invoker;
_alarmSource = alarmSource;
_defaultHost = defaultHost;
_hostResolver = hostResolver;
}
/// <summary>
/// Subscribe to alarm events for a set of source node ids, fanning out by resolved host
/// so per-host breakers / bulkheads apply. Returns one handle per host — callers that
/// don't care about per-host separation may concatenate them.
/// </summary>
public async Task<IReadOnlyList<IAlarmSubscriptionHandle>> SubscribeAsync(
IReadOnlyList<string> sourceNodeIds,
CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(sourceNodeIds);
if (sourceNodeIds.Count == 0) return [];
var byHost = GroupByHost(sourceNodeIds);
var handles = new List<IAlarmSubscriptionHandle>(byHost.Count);
foreach (var (host, ids) in byHost)
{
var handle = await _invoker.ExecuteAsync(
DriverCapability.AlarmSubscribe,
host,
async ct => await _alarmSource.SubscribeAlarmsAsync(ids, ct).ConfigureAwait(false),
cancellationToken).ConfigureAwait(false);
handles.Add(handle);
}
return handles;
}
/// <summary>Cancel an alarm subscription. Routes through the AlarmSubscribe pipeline for parity.</summary>
public ValueTask UnsubscribeAsync(IAlarmSubscriptionHandle handle, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(handle);
return _invoker.ExecuteAsync(
DriverCapability.AlarmSubscribe,
_defaultHost,
async ct => await _alarmSource.UnsubscribeAlarmsAsync(handle, ct).ConfigureAwait(false),
cancellationToken);
}
/// <summary>
/// Acknowledge alarms. Fans out by resolved host; each host's batch runs through the
/// AlarmAcknowledge pipeline (no-retry per decision #143 — an alarm-ack is not idempotent
/// at the plant-floor acknowledgement level even if the OPC UA spec permits re-issue).
/// </summary>
public async Task AcknowledgeAsync(
IReadOnlyList<AlarmAcknowledgeRequest> acknowledgements,
CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(acknowledgements);
if (acknowledgements.Count == 0) return;
var byHost = _hostResolver is null
? new Dictionary<string, List<AlarmAcknowledgeRequest>> { [_defaultHost] = acknowledgements.ToList() }
: acknowledgements
.GroupBy(a => _hostResolver.ResolveHost(a.SourceNodeId))
.ToDictionary(g => g.Key, g => g.ToList());
foreach (var (host, batch) in byHost)
{
var batchSnapshot = batch; // capture for the lambda
await _invoker.ExecuteAsync(
DriverCapability.AlarmAcknowledge,
host,
async ct => await _alarmSource.AcknowledgeAsync(batchSnapshot, ct).ConfigureAwait(false),
cancellationToken).ConfigureAwait(false);
}
}
private Dictionary<string, List<string>> GroupByHost(IReadOnlyList<string> sourceNodeIds)
{
if (_hostResolver is null)
return new Dictionary<string, List<string>> { [_defaultHost] = sourceNodeIds.ToList() };
var result = new Dictionary<string, List<string>>(StringComparer.Ordinal);
foreach (var id in sourceNodeIds)
{
var host = _hostResolver.ResolveHost(id);
if (!result.TryGetValue(host, out var list))
result[host] = list = new List<string>();
list.Add(id);
}
return result;
}
}

View File

@@ -0,0 +1,140 @@
using Polly;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Core.Observability;
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
/// <summary>
/// Executes driver-capability calls through a shared Polly pipeline. One invoker per
/// <c>(DriverInstance, IDriver)</c> pair; the underlying <see cref="DriverResiliencePipelineBuilder"/>
/// is process-singleton so all invokers share its cache.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/plan.md</c> decisions #143-144 and Phase 6.1 Stream A.3. The server's dispatch
/// layer routes every capability call (<c>IReadable.ReadAsync</c>, <c>IWritable.WriteAsync</c>,
/// <c>ITagDiscovery.DiscoverAsync</c>, <c>ISubscribable.SubscribeAsync/UnsubscribeAsync</c>,
/// <c>IHostConnectivityProbe</c> probe loop, <c>IAlarmSource.SubscribeAlarmsAsync/AcknowledgeAsync</c>,
/// and all four <c>IHistoryProvider</c> reads) through this invoker.
/// </remarks>
public sealed class CapabilityInvoker
{
private readonly DriverResiliencePipelineBuilder _builder;
private readonly string _driverInstanceId;
private readonly string _driverType;
private readonly Func<DriverResilienceOptions> _optionsAccessor;
private readonly DriverResilienceStatusTracker? _statusTracker;
/// <summary>
/// Construct an invoker for one driver instance.
/// </summary>
/// <param name="builder">Shared, process-singleton pipeline builder.</param>
/// <param name="driverInstanceId">The <c>DriverInstance.Id</c> column value.</param>
/// <param name="optionsAccessor">
/// Snapshot accessor for the current resilience options. Invoked per call so Admin-edit +
/// pipeline-invalidate can take effect without restarting the invoker.
/// </param>
/// <param name="driverType">Driver type name for structured-log enrichment (e.g. <c>"Modbus"</c>).</param>
/// <param name="statusTracker">Optional resilience-status tracker. When wired, every capability call records start/complete so Admin <c>/hosts</c> can surface <see cref="ResilienceStatusSnapshot.CurrentInFlight"/> as the bulkhead-depth proxy.</param>
public CapabilityInvoker(
DriverResiliencePipelineBuilder builder,
string driverInstanceId,
Func<DriverResilienceOptions> optionsAccessor,
string driverType = "Unknown",
DriverResilienceStatusTracker? statusTracker = null)
{
ArgumentNullException.ThrowIfNull(builder);
ArgumentNullException.ThrowIfNull(optionsAccessor);
_builder = builder;
_driverInstanceId = driverInstanceId;
_driverType = driverType;
_optionsAccessor = optionsAccessor;
_statusTracker = statusTracker;
}
/// <summary>Execute a capability call returning a value, honoring the per-capability pipeline.</summary>
/// <typeparam name="TResult">Return type of the underlying driver call.</typeparam>
public async ValueTask<TResult> ExecuteAsync<TResult>(
DriverCapability capability,
string hostName,
Func<CancellationToken, ValueTask<TResult>> callSite,
CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(callSite);
var pipeline = ResolvePipeline(capability, hostName);
_statusTracker?.RecordCallStart(_driverInstanceId, hostName);
try
{
using (LogContextEnricher.Push(_driverInstanceId, _driverType, capability, LogContextEnricher.NewCorrelationId()))
{
return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
}
}
finally
{
_statusTracker?.RecordCallComplete(_driverInstanceId, hostName);
}
}
/// <summary>Execute a void-returning capability call, honoring the per-capability pipeline.</summary>
public async ValueTask ExecuteAsync(
DriverCapability capability,
string hostName,
Func<CancellationToken, ValueTask> callSite,
CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(callSite);
var pipeline = ResolvePipeline(capability, hostName);
_statusTracker?.RecordCallStart(_driverInstanceId, hostName);
try
{
using (LogContextEnricher.Push(_driverInstanceId, _driverType, capability, LogContextEnricher.NewCorrelationId()))
{
await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
}
}
finally
{
_statusTracker?.RecordCallComplete(_driverInstanceId, hostName);
}
}
/// <summary>
/// Execute a <see cref="DriverCapability.Write"/> call honoring <see cref="WriteIdempotentAttribute"/>
/// semantics — if <paramref name="isIdempotent"/> is <c>false</c>, retries are disabled regardless
/// of the tag-level configuration (the pipeline for a non-idempotent write never retries per
/// decisions #44-45). If <c>true</c>, the call runs through the capability's pipeline which may
/// retry when the tier configuration permits.
/// </summary>
public async ValueTask<TResult> ExecuteWriteAsync<TResult>(
string hostName,
bool isIdempotent,
Func<CancellationToken, ValueTask<TResult>> callSite,
CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(callSite);
if (!isIdempotent)
{
var noRetryOptions = _optionsAccessor() with
{
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
{
[DriverCapability.Write] = _optionsAccessor().Resolve(DriverCapability.Write) with { RetryCount = 0 },
},
};
var pipeline = _builder.GetOrCreate(_driverInstanceId, $"{hostName}::non-idempotent", DriverCapability.Write, noRetryOptions);
using (LogContextEnricher.Push(_driverInstanceId, _driverType, DriverCapability.Write, LogContextEnricher.NewCorrelationId()))
{
return await pipeline.ExecuteAsync(callSite, cancellationToken).ConfigureAwait(false);
}
}
return await ExecuteAsync(DriverCapability.Write, hostName, callSite, cancellationToken).ConfigureAwait(false);
}
private ResiliencePipeline ResolvePipeline(DriverCapability capability, string hostName) =>
_builder.GetOrCreate(_driverInstanceId, hostName, capability, _optionsAccessor());
}

View File

@@ -0,0 +1,106 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
/// <summary>
/// Per-tier × per-capability resilience policy configuration for a driver instance.
/// Bound from <c>DriverInstance.ResilienceConfig</c> JSON (nullable column; null = tier defaults).
/// Per <c>docs/v2/plan.md</c> decisions #143 and #144.
/// </summary>
public sealed record DriverResilienceOptions
{
/// <summary>Tier the owning driver type is registered as; drives the default map.</summary>
public required DriverTier Tier { get; init; }
/// <summary>
/// Per-capability policy overrides. Capabilities absent from this map fall back to
/// <see cref="GetTierDefaults(DriverTier)"/> for the configured <see cref="Tier"/>.
/// </summary>
public IReadOnlyDictionary<DriverCapability, CapabilityPolicy> CapabilityPolicies { get; init; }
= new Dictionary<DriverCapability, CapabilityPolicy>();
/// <summary>Bulkhead (max concurrent in-flight calls) for every capability. Default 32.</summary>
public int BulkheadMaxConcurrent { get; init; } = 32;
/// <summary>
/// Bulkhead queue depth. Zero = no queueing; overflow fails fast with
/// <c>BulkheadRejectedException</c>. Default 64.
/// </summary>
public int BulkheadMaxQueue { get; init; } = 64;
/// <summary>
/// Periodic scheduled recycle interval for Tier C out-of-process hosts, in seconds.
/// Null (the default) = no scheduled recycle; the driver's Host process keeps running
/// indefinitely unless a memory breach or operator action triggers a recycle. Only
/// respected for <see cref="DriverTier.C"/>; Tier A/B recycle would tear down every
/// OPC UA session, so the loader ignores non-null values for those tiers + logs a
/// warning (per decisions #74 / #145).
/// </summary>
public int? RecycleIntervalSeconds { get; init; }
/// <summary>
/// Look up the effective policy for a capability, falling back to tier defaults when no
/// override is configured. Never returns null.
/// </summary>
public CapabilityPolicy Resolve(DriverCapability capability)
{
if (CapabilityPolicies.TryGetValue(capability, out var policy))
return policy;
var defaults = GetTierDefaults(Tier);
return defaults[capability];
}
/// <summary>
/// Per-tier per-capability default policy table, per decisions #143-144 and the Phase 6.1
/// Stream A.2 specification. Retries skipped on <see cref="DriverCapability.Write"/> and
/// <see cref="DriverCapability.AlarmAcknowledge"/> regardless of tier.
/// </summary>
public static IReadOnlyDictionary<DriverCapability, CapabilityPolicy> GetTierDefaults(DriverTier tier) =>
tier switch
{
DriverTier.A => new Dictionary<DriverCapability, CapabilityPolicy>
{
[DriverCapability.Read] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.Write] = new(TimeoutSeconds: 2, RetryCount: 0, BreakerFailureThreshold: 5),
[DriverCapability.Discover] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 3),
[DriverCapability.Subscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.Probe] = new(TimeoutSeconds: 2, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 5, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 5, RetryCount: 0, BreakerFailureThreshold: 5),
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 30, RetryCount: 2, BreakerFailureThreshold: 5),
},
DriverTier.B => new Dictionary<DriverCapability, CapabilityPolicy>
{
[DriverCapability.Read] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.Write] = new(TimeoutSeconds: 4, RetryCount: 0, BreakerFailureThreshold: 5),
[DriverCapability.Discover] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 3),
[DriverCapability.Subscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.Probe] = new(TimeoutSeconds: 4, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 8, RetryCount: 3, BreakerFailureThreshold: 5),
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 8, RetryCount: 0, BreakerFailureThreshold: 5),
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 60, RetryCount: 2, BreakerFailureThreshold: 5),
},
DriverTier.C => new Dictionary<DriverCapability, CapabilityPolicy>
{
[DriverCapability.Read] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
[DriverCapability.Write] = new(TimeoutSeconds: 10, RetryCount: 0, BreakerFailureThreshold: 0),
[DriverCapability.Discover] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
[DriverCapability.Subscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
[DriverCapability.Probe] = new(TimeoutSeconds: 10, RetryCount: 1, BreakerFailureThreshold: 0),
[DriverCapability.AlarmSubscribe] = new(TimeoutSeconds: 15, RetryCount: 1, BreakerFailureThreshold: 0),
[DriverCapability.AlarmAcknowledge] = new(TimeoutSeconds: 15, RetryCount: 0, BreakerFailureThreshold: 0),
[DriverCapability.HistoryRead] = new(TimeoutSeconds: 120, RetryCount: 1, BreakerFailureThreshold: 0),
},
_ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No default policy table defined for tier {tier}."),
};
}
/// <summary>Policy for one capability on one driver instance.</summary>
/// <param name="TimeoutSeconds">Per-call timeout (wraps the inner Polly execution).</param>
/// <param name="RetryCount">Number of retry attempts after the first failure; zero = no retry.</param>
/// <param name="BreakerFailureThreshold">
/// Consecutive-failure count that opens the circuit breaker; zero = no breaker
/// (Tier C uses the supervisor's process-level breaker instead, per decision #68).
/// </param>
public sealed record CapabilityPolicy(int TimeoutSeconds, int RetryCount, int BreakerFailureThreshold);

View File

@@ -0,0 +1,132 @@
using System.Text.Json;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
/// <summary>
/// Parses the <c>DriverInstance.ResilienceConfig</c> JSON column into a
/// <see cref="DriverResilienceOptions"/> instance layered on top of the tier defaults.
/// Every key in the JSON is optional; missing keys fall back to the tier defaults from
/// <see cref="DriverResilienceOptions.GetTierDefaults(DriverTier)"/>.
/// </summary>
/// <remarks>
/// <para>Example JSON shape per Phase 6.1 Stream A.2:</para>
/// <code>
/// {
/// "bulkheadMaxConcurrent": 16,
/// "bulkheadMaxQueue": 64,
/// "capabilityPolicies": {
/// "Read": { "timeoutSeconds": 5, "retryCount": 5, "breakerFailureThreshold": 3 },
/// "Write": { "timeoutSeconds": 5, "retryCount": 0, "breakerFailureThreshold": 5 }
/// }
/// }
/// </code>
///
/// <para>Unrecognised keys + values are ignored so future shapes land without a migration.
/// Per-capability overrides are layered on top of tier defaults — a partial policy (only
/// some of TimeoutSeconds/RetryCount/BreakerFailureThreshold) fills in the other fields
/// from the tier default for that capability.</para>
///
/// <para>Parser failures (malformed JSON, type mismatches) fall back to pure tier defaults
/// + surface through an out-parameter diagnostic. Callers may log the diagnostic but should
/// NOT fail driver startup — a misconfigured ResilienceConfig should never brick a
/// working driver.</para>
/// </remarks>
public static class DriverResilienceOptionsParser
{
private static readonly JsonSerializerOptions JsonOpts = new()
{
PropertyNameCaseInsensitive = true,
AllowTrailingCommas = true,
ReadCommentHandling = JsonCommentHandling.Skip,
};
/// <summary>
/// Parse the JSON payload layered on <paramref name="tier"/>'s defaults. Returns the
/// effective options; <paramref name="parseDiagnostic"/> is null on success, or a
/// human-readable error message when the JSON was malformed (options still returned
/// = tier defaults).
/// </summary>
public static DriverResilienceOptions ParseOrDefaults(
DriverTier tier,
string? resilienceConfigJson,
out string? parseDiagnostic)
{
parseDiagnostic = null;
var baseDefaults = DriverResilienceOptions.GetTierDefaults(tier);
var baseOptions = new DriverResilienceOptions { Tier = tier, CapabilityPolicies = baseDefaults };
if (string.IsNullOrWhiteSpace(resilienceConfigJson))
return baseOptions;
ResilienceConfigShape? shape;
try
{
shape = JsonSerializer.Deserialize<ResilienceConfigShape>(resilienceConfigJson, JsonOpts);
}
catch (JsonException ex)
{
parseDiagnostic = $"ResilienceConfig JSON malformed; falling back to tier {tier} defaults. Detail: {ex.Message}";
return baseOptions;
}
if (shape is null) return baseOptions;
var merged = new Dictionary<DriverCapability, CapabilityPolicy>(baseDefaults);
if (shape.CapabilityPolicies is not null)
{
foreach (var (capName, overridePolicy) in shape.CapabilityPolicies)
{
if (!Enum.TryParse<DriverCapability>(capName, ignoreCase: true, out var capability))
{
parseDiagnostic ??= $"Unknown capability '{capName}' in ResilienceConfig; skipped.";
continue;
}
var basePolicy = merged[capability];
merged[capability] = new CapabilityPolicy(
TimeoutSeconds: overridePolicy.TimeoutSeconds ?? basePolicy.TimeoutSeconds,
RetryCount: overridePolicy.RetryCount ?? basePolicy.RetryCount,
BreakerFailureThreshold: overridePolicy.BreakerFailureThreshold ?? basePolicy.BreakerFailureThreshold);
}
}
// Scheduled recycle is Tier C only — reject a configured interval on Tier A/B as a
// misconfiguration surface rather than silently honouring it (recycling an in-process
// driver would kill every OPC UA session + every co-hosted driver, per decision #74).
int? recycleIntervalSeconds = null;
if (shape.RecycleIntervalSeconds is int secs)
{
if (secs <= 0)
parseDiagnostic ??= $"RecycleIntervalSeconds must be positive; got {secs} — ignoring.";
else if (tier != DriverTier.C)
parseDiagnostic ??= $"RecycleIntervalSeconds is Tier C only; Tier {tier} driver will not scheduled-recycle.";
else
recycleIntervalSeconds = secs;
}
return new DriverResilienceOptions
{
Tier = tier,
CapabilityPolicies = merged,
BulkheadMaxConcurrent = shape.BulkheadMaxConcurrent ?? baseOptions.BulkheadMaxConcurrent,
BulkheadMaxQueue = shape.BulkheadMaxQueue ?? baseOptions.BulkheadMaxQueue,
RecycleIntervalSeconds = recycleIntervalSeconds,
};
}
private sealed class ResilienceConfigShape
{
public int? BulkheadMaxConcurrent { get; set; }
public int? BulkheadMaxQueue { get; set; }
public int? RecycleIntervalSeconds { get; set; }
public Dictionary<string, CapabilityPolicyShape>? CapabilityPolicies { get; set; }
}
private sealed class CapabilityPolicyShape
{
public int? TimeoutSeconds { get; set; }
public int? RetryCount { get; set; }
public int? BreakerFailureThreshold { get; set; }
}
}

View File

@@ -0,0 +1,157 @@
using System.Collections.Concurrent;
using Polly;
using Polly.CircuitBreaker;
using Polly.Retry;
using Polly.Timeout;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
/// <summary>
/// Builds and caches Polly resilience pipelines keyed on
/// <c>(DriverInstanceId, HostName, DriverCapability)</c>. One dead PLC behind a multi-device
/// driver cannot open the circuit breaker for healthy sibling hosts.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/plan.md</c> decision #144 (per-device isolation). Composition from outside-in:
/// <b>Timeout → Retry (when capability permits) → Circuit Breaker (when tier permits) → Bulkhead</b>.
///
/// <para>Pipeline resolution is lock-free on the hot path: the inner
/// <see cref="ConcurrentDictionary{TKey,TValue}"/> caches a <see cref="ResiliencePipeline"/> per key;
/// first-call cost is one <see cref="ResiliencePipelineBuilder"/>.Build. Thereafter reads are O(1).</para>
/// </remarks>
public sealed class DriverResiliencePipelineBuilder
{
private readonly ConcurrentDictionary<PipelineKey, ResiliencePipeline> _pipelines = new();
private readonly TimeProvider _timeProvider;
private readonly DriverResilienceStatusTracker? _statusTracker;
/// <summary>Construct with the ambient clock (use <see cref="TimeProvider.System"/> in prod).</summary>
/// <param name="timeProvider">Clock source for pipeline timeouts + breaker sampling. Defaults to system.</param>
/// <param name="statusTracker">When non-null, every built pipeline wires Polly telemetry into
/// the tracker — retries increment <c>ConsecutiveFailures</c>, breaker-open stamps
/// <c>LastBreakerOpenUtc</c>, breaker-close resets failures. Feeds Admin <c>/hosts</c> +
/// the Polly bulkhead-depth column. Absent tracker means no telemetry (unit tests +
/// deployments that don't care about resilience observability).</param>
public DriverResiliencePipelineBuilder(
TimeProvider? timeProvider = null,
DriverResilienceStatusTracker? statusTracker = null)
{
_timeProvider = timeProvider ?? TimeProvider.System;
_statusTracker = statusTracker;
}
/// <summary>
/// Get or build the pipeline for a given <c>(driver instance, host, capability)</c> triple.
/// Calls with the same key + same options reuse the same pipeline instance; the first caller
/// wins if a race occurs (both pipelines would be behaviourally identical).
/// </summary>
/// <param name="driverInstanceId">DriverInstance primary key — opaque to this layer.</param>
/// <param name="hostName">
/// Host the call targets. For single-host drivers (Galaxy, some OPC UA Client configs) pass the
/// driver's canonical host string. For multi-host drivers (Modbus with N PLCs), pass the
/// specific PLC so one dead PLC doesn't poison healthy siblings.
/// </param>
/// <param name="capability">Which capability surface is being called.</param>
/// <param name="options">Per-driver-instance options (tier + per-capability overrides).</param>
public ResiliencePipeline GetOrCreate(
string driverInstanceId,
string hostName,
DriverCapability capability,
DriverResilienceOptions options)
{
ArgumentNullException.ThrowIfNull(options);
ArgumentException.ThrowIfNullOrWhiteSpace(hostName);
var key = new PipelineKey(driverInstanceId, hostName, capability);
return _pipelines.GetOrAdd(key, static (k, state) => Build(
k.DriverInstanceId, k.HostName, state.capability, state.options, state.timeProvider, state.tracker),
(capability, options, timeProvider: _timeProvider, tracker: _statusTracker));
}
/// <summary>Drop cached pipelines for one driver instance (e.g. on ResilienceConfig change). Test + Admin-reload use.</summary>
public int Invalidate(string driverInstanceId)
{
var removed = 0;
foreach (var key in _pipelines.Keys)
{
if (key.DriverInstanceId == driverInstanceId && _pipelines.TryRemove(key, out _))
removed++;
}
return removed;
}
/// <summary>Snapshot of the current number of cached pipelines. For diagnostics only.</summary>
public int CachedPipelineCount => _pipelines.Count;
private static ResiliencePipeline Build(
string driverInstanceId,
string hostName,
DriverCapability capability,
DriverResilienceOptions options,
TimeProvider timeProvider,
DriverResilienceStatusTracker? tracker)
{
var policy = options.Resolve(capability);
var builder = new ResiliencePipelineBuilder { TimeProvider = timeProvider };
builder.AddTimeout(new TimeoutStrategyOptions
{
Timeout = TimeSpan.FromSeconds(policy.TimeoutSeconds),
});
if (policy.RetryCount > 0)
{
var retryOptions = new RetryStrategyOptions
{
MaxRetryAttempts = policy.RetryCount,
BackoffType = DelayBackoffType.Exponential,
UseJitter = true,
Delay = TimeSpan.FromMilliseconds(100),
MaxDelay = TimeSpan.FromSeconds(5),
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
};
if (tracker is not null)
{
retryOptions.OnRetry = args =>
{
tracker.RecordFailure(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
}
builder.AddRetry(retryOptions);
}
if (policy.BreakerFailureThreshold > 0)
{
var breakerOptions = new CircuitBreakerStrategyOptions
{
FailureRatio = 1.0,
MinimumThroughput = policy.BreakerFailureThreshold,
SamplingDuration = TimeSpan.FromSeconds(30),
BreakDuration = TimeSpan.FromSeconds(15),
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
};
if (tracker is not null)
{
breakerOptions.OnOpened = args =>
{
tracker.RecordBreakerOpen(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
breakerOptions.OnClosed = args =>
{
// Closing the breaker means the target recovered — reset the consecutive-
// failure counter so Admin UI stops flashing red for this host.
tracker.RecordSuccess(driverInstanceId, hostName, timeProvider.GetUtcNow().UtcDateTime);
return default;
};
}
builder.AddCircuitBreaker(breakerOptions);
}
return builder.Build();
}
private readonly record struct PipelineKey(string DriverInstanceId, string HostName, DriverCapability Capability);
}

View File

@@ -0,0 +1,135 @@
using System.Collections.Concurrent;
namespace ZB.MOM.WW.OtOpcUa.Core.Resilience;
/// <summary>
/// Process-singleton tracker of live resilience counters per
/// <c>(DriverInstanceId, HostName)</c>. Populated by the CapabilityInvoker and the
/// MemoryTracking layer; consumed by a HostedService that periodically persists a
/// snapshot to the <c>DriverInstanceResilienceStatus</c> table for Admin <c>/hosts</c>.
/// </summary>
/// <remarks>
/// Per Phase 6.1 Stream E. No DB dependency here — the tracker is pure in-memory so
/// tests can exercise it without EF Core or SQL Server. The HostedService that writes
/// snapshots lives in the Server project (Stream E.2); the actual SignalR push + Blazor
/// page refresh (E.3) lands in a follow-up visual-review PR.
/// </remarks>
public sealed class DriverResilienceStatusTracker
{
private readonly ConcurrentDictionary<StatusKey, ResilienceStatusSnapshot> _status = new();
/// <summary>Record a Polly pipeline failure for <paramref name="hostName"/>.</summary>
public void RecordFailure(string driverInstanceId, string hostName, DateTime utcNow)
{
var key = new StatusKey(driverInstanceId, hostName);
_status.AddOrUpdate(key,
_ => new ResilienceStatusSnapshot { ConsecutiveFailures = 1, LastSampledUtc = utcNow },
(_, existing) => existing with
{
ConsecutiveFailures = existing.ConsecutiveFailures + 1,
LastSampledUtc = utcNow,
});
}
/// <summary>Reset the consecutive-failure count on a successful pipeline execution.</summary>
public void RecordSuccess(string driverInstanceId, string hostName, DateTime utcNow)
{
var key = new StatusKey(driverInstanceId, hostName);
_status.AddOrUpdate(key,
_ => new ResilienceStatusSnapshot { ConsecutiveFailures = 0, LastSampledUtc = utcNow },
(_, existing) => existing with
{
ConsecutiveFailures = 0,
LastSampledUtc = utcNow,
});
}
/// <summary>Record a circuit-breaker open event.</summary>
public void RecordBreakerOpen(string driverInstanceId, string hostName, DateTime utcNow)
{
var key = new StatusKey(driverInstanceId, hostName);
_status.AddOrUpdate(key,
_ => new ResilienceStatusSnapshot { LastBreakerOpenUtc = utcNow, LastSampledUtc = utcNow },
(_, existing) => existing with { LastBreakerOpenUtc = utcNow, LastSampledUtc = utcNow });
}
/// <summary>Record a process recycle event (Tier C only).</summary>
public void RecordRecycle(string driverInstanceId, string hostName, DateTime utcNow)
{
var key = new StatusKey(driverInstanceId, hostName);
_status.AddOrUpdate(key,
_ => new ResilienceStatusSnapshot { LastRecycleUtc = utcNow, LastSampledUtc = utcNow },
(_, existing) => existing with { LastRecycleUtc = utcNow, LastSampledUtc = utcNow });
}
/// <summary>Capture / update the MemoryTracking-supplied baseline + current footprint.</summary>
public void RecordFootprint(string driverInstanceId, string hostName, long baselineBytes, long currentBytes, DateTime utcNow)
{
var key = new StatusKey(driverInstanceId, hostName);
_status.AddOrUpdate(key,
_ => new ResilienceStatusSnapshot
{
BaselineFootprintBytes = baselineBytes,
CurrentFootprintBytes = currentBytes,
LastSampledUtc = utcNow,
},
(_, existing) => existing with
{
BaselineFootprintBytes = baselineBytes,
CurrentFootprintBytes = currentBytes,
LastSampledUtc = utcNow,
});
}
/// <summary>
/// Record the entry of a capability call for this (instance, host). Increments the
/// in-flight counter used as the <see cref="ResilienceStatusSnapshot.CurrentInFlight"/>
/// surface (a cheap stand-in for Polly bulkhead depth). Paired with
/// <see cref="RecordCallComplete"/>; callers use try/finally.
/// </summary>
public void RecordCallStart(string driverInstanceId, string hostName)
{
var key = new StatusKey(driverInstanceId, hostName);
_status.AddOrUpdate(key,
_ => new ResilienceStatusSnapshot { CurrentInFlight = 1 },
(_, existing) => existing with { CurrentInFlight = existing.CurrentInFlight + 1 });
}
/// <summary>Paired with <see cref="RecordCallStart"/> — decrements the in-flight counter.</summary>
public void RecordCallComplete(string driverInstanceId, string hostName)
{
var key = new StatusKey(driverInstanceId, hostName);
_status.AddOrUpdate(key,
_ => new ResilienceStatusSnapshot { CurrentInFlight = 0 }, // start-without-complete shouldn't happen; clamp to 0
(_, existing) => existing with { CurrentInFlight = Math.Max(0, existing.CurrentInFlight - 1) });
}
/// <summary>Snapshot of a specific (instance, host) pair; null if no counters recorded yet.</summary>
public ResilienceStatusSnapshot? TryGet(string driverInstanceId, string hostName) =>
_status.TryGetValue(new StatusKey(driverInstanceId, hostName), out var snapshot) ? snapshot : null;
/// <summary>Copy of every currently-tracked (instance, host, snapshot) triple. Safe under concurrent writes.</summary>
public IReadOnlyList<(string DriverInstanceId, string HostName, ResilienceStatusSnapshot Snapshot)> Snapshot() =>
_status.Select(kvp => (kvp.Key.DriverInstanceId, kvp.Key.HostName, kvp.Value)).ToList();
private readonly record struct StatusKey(string DriverInstanceId, string HostName);
}
/// <summary>Snapshot of the resilience counters for one <c>(DriverInstanceId, HostName)</c> pair.</summary>
public sealed record ResilienceStatusSnapshot
{
public int ConsecutiveFailures { get; init; }
public DateTime? LastBreakerOpenUtc { get; init; }
public DateTime? LastRecycleUtc { get; init; }
public long BaselineFootprintBytes { get; init; }
public long CurrentFootprintBytes { get; init; }
public DateTime LastSampledUtc { get; init; }
/// <summary>
/// In-flight capability calls against this (instance, host). Bumped on call entry +
/// decremented on completion. Feeds <c>DriverInstanceResilienceStatus.CurrentBulkheadDepth</c>
/// for Admin <c>/hosts</c> — a cheap proxy for the Polly bulkhead depth until the full
/// telemetry observer lands.
/// </summary>
public int CurrentInFlight { get; init; }
}

View File

@@ -0,0 +1,65 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
/// <summary>
/// Tier C only process-recycle companion to <see cref="MemoryTracking"/>. On a
/// <see cref="MemoryTrackingAction.HardBreach"/> signal, invokes the supplied
/// <see cref="IDriverSupervisor"/> to restart the out-of-process Host.
/// </summary>
/// <remarks>
/// Per <c>docs/v2/plan.md</c> decisions #74 and #145. Tier A/B hard-breach on an in-process
/// driver would kill every OPC UA session and every co-hosted driver, so for Tier A/B this
/// class logs a <b>promotion-to-Tier-C recommendation</b> and does NOT invoke any supervisor.
/// A future tier-migration workflow acts on the recommendation.
/// </remarks>
public sealed class MemoryRecycle
{
private readonly DriverTier _tier;
private readonly IDriverSupervisor? _supervisor;
private readonly ILogger<MemoryRecycle> _logger;
public MemoryRecycle(DriverTier tier, IDriverSupervisor? supervisor, ILogger<MemoryRecycle> logger)
{
_tier = tier;
_supervisor = supervisor;
_logger = logger;
}
/// <summary>
/// Handle a <see cref="MemoryTracking"/> classification for the driver. For Tier C with a
/// wired supervisor, <c>HardBreach</c> triggers <see cref="IDriverSupervisor.RecycleAsync"/>.
/// All other combinations are no-ops with respect to process state (soft breaches + Tier A/B
/// hard breaches just log).
/// </summary>
/// <returns>True when a recycle was requested; false otherwise.</returns>
public async Task<bool> HandleAsync(MemoryTrackingAction action, long footprintBytes, CancellationToken cancellationToken)
{
switch (action)
{
case MemoryTrackingAction.SoftBreach:
_logger.LogWarning(
"Memory soft-breach on driver {DriverId}: footprint={Footprint:N0} bytes, tier={Tier}. Surfaced to Admin; no action.",
_supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes, _tier);
return false;
case MemoryTrackingAction.HardBreach when _tier == DriverTier.C && _supervisor is not null:
_logger.LogError(
"Memory hard-breach on Tier C driver {DriverId}: footprint={Footprint:N0} bytes. Requesting supervisor recycle.",
_supervisor.DriverInstanceId, footprintBytes);
await _supervisor.RecycleAsync($"Memory hard-breach: {footprintBytes} bytes", cancellationToken).ConfigureAwait(false);
return true;
case MemoryTrackingAction.HardBreach:
_logger.LogError(
"Memory hard-breach on Tier {Tier} in-process driver {DriverId}: footprint={Footprint:N0} bytes. " +
"Recommending promotion to Tier C; NOT auto-killing (decisions #74, #145).",
_tier, _supervisor?.DriverInstanceId ?? "(unknown)", footprintBytes);
return false;
default:
return false;
}
}
}

View File

@@ -0,0 +1,136 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
/// <summary>
/// Tier-agnostic memory-footprint tracker. Captures the post-initialize <b>baseline</b>
/// from the first samples after <c>IDriver.InitializeAsync</c>, then classifies each
/// subsequent sample against a hybrid soft/hard threshold per
/// <c>docs/v2/plan.md</c> decision #146 — <c>soft = max(multiplier × baseline, baseline + floor)</c>,
/// <c>hard = 2 × soft</c>.
/// </summary>
/// <remarks>
/// <para>Per decision #145, this tracker <b>never kills a process</b>. Soft and hard breaches
/// log + surface to the Admin UI via <c>DriverInstanceResilienceStatus</c>. The matching
/// process-level recycle protection lives in a separate <c>MemoryRecycle</c> that activates
/// for Tier C drivers only (where the driver runs out-of-process behind a supervisor that
/// can safely restart it without tearing down the OPC UA session or co-hosted in-proc
/// drivers).</para>
///
/// <para>Baseline capture: the tracker starts in <see cref="TrackingPhase.WarmingUp"/> for
/// <see cref="BaselineWindow"/> (default 5 min). During that window samples are collected;
/// the baseline is computed as the median once the window elapses. Before that point every
/// classification returns <see cref="MemoryTrackingAction.Warming"/>.</para>
/// </remarks>
public sealed class MemoryTracking
{
private readonly DriverTier _tier;
private readonly TimeSpan _baselineWindow;
private readonly List<long> _warmupSamples = [];
private long _baselineBytes;
private TrackingPhase _phase = TrackingPhase.WarmingUp;
private DateTime? _warmupStartUtc;
/// <summary>Tier-default multiplier/floor constants per decision #146.</summary>
public static (int Multiplier, long FloorBytes) GetTierConstants(DriverTier tier) => tier switch
{
DriverTier.A => (Multiplier: 3, FloorBytes: 50L * 1024 * 1024),
DriverTier.B => (Multiplier: 3, FloorBytes: 100L * 1024 * 1024),
DriverTier.C => (Multiplier: 2, FloorBytes: 500L * 1024 * 1024),
_ => throw new ArgumentOutOfRangeException(nameof(tier), tier, $"No memory-tracking constants defined for tier {tier}."),
};
/// <summary>Window over which post-init samples are collected to compute the baseline.</summary>
public TimeSpan BaselineWindow => _baselineWindow;
/// <summary>Current phase: <see cref="TrackingPhase.WarmingUp"/> or <see cref="TrackingPhase.Steady"/>.</summary>
public TrackingPhase Phase => _phase;
/// <summary>Captured baseline; 0 until warmup completes.</summary>
public long BaselineBytes => _baselineBytes;
/// <summary>Effective soft threshold (zero while warming up).</summary>
public long SoftThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes);
/// <summary>Effective hard threshold = 2 × soft (zero while warming up).</summary>
public long HardThresholdBytes => _baselineBytes == 0 ? 0 : ComputeSoft(_tier, _baselineBytes) * 2;
public MemoryTracking(DriverTier tier, TimeSpan? baselineWindow = null)
{
_tier = tier;
_baselineWindow = baselineWindow ?? TimeSpan.FromMinutes(5);
}
/// <summary>
/// Submit a memory-footprint sample. Returns the action the caller should surface.
/// During warmup, always returns <see cref="MemoryTrackingAction.Warming"/> and accumulates
/// samples; once the window elapses the first steady-phase sample triggers baseline capture
/// (median of warmup samples).
/// </summary>
public MemoryTrackingAction Sample(long footprintBytes, DateTime utcNow)
{
if (_phase == TrackingPhase.WarmingUp)
{
_warmupStartUtc ??= utcNow;
_warmupSamples.Add(footprintBytes);
if (utcNow - _warmupStartUtc.Value >= _baselineWindow && _warmupSamples.Count > 0)
{
_baselineBytes = ComputeMedian(_warmupSamples);
_phase = TrackingPhase.Steady;
}
else
{
return MemoryTrackingAction.Warming;
}
}
if (footprintBytes >= HardThresholdBytes) return MemoryTrackingAction.HardBreach;
if (footprintBytes >= SoftThresholdBytes) return MemoryTrackingAction.SoftBreach;
return MemoryTrackingAction.None;
}
private static long ComputeSoft(DriverTier tier, long baseline)
{
var (multiplier, floor) = GetTierConstants(tier);
return Math.Max(multiplier * baseline, baseline + floor);
}
private static long ComputeMedian(List<long> samples)
{
var sorted = samples.Order().ToArray();
var mid = sorted.Length / 2;
return sorted.Length % 2 == 1
? sorted[mid]
: (sorted[mid - 1] + sorted[mid]) / 2;
}
}
/// <summary>Phase of a <see cref="MemoryTracking"/> lifecycle.</summary>
public enum TrackingPhase
{
/// <summary>Collecting post-init samples; baseline not yet computed.</summary>
WarmingUp,
/// <summary>Baseline captured; every sample classified against soft/hard thresholds.</summary>
Steady,
}
/// <summary>Classification the tracker returns per sample.</summary>
public enum MemoryTrackingAction
{
/// <summary>Baseline not yet captured; sample collected, no threshold check.</summary>
Warming,
/// <summary>Below soft threshold.</summary>
None,
/// <summary>Between soft and hard thresholds — log + surface, no action.</summary>
SoftBreach,
/// <summary>
/// ≥ hard threshold. Log + surface + (Tier C only, via <c>MemoryRecycle</c>) request
/// process recycle via the driver supervisor. Tier A/B breach never invokes any
/// kill path per decisions #145 and #74.
/// </summary>
HardBreach,
}

View File

@@ -0,0 +1,86 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
/// <summary>
/// Tier C opt-in periodic-recycle driver per <c>docs/v2/plan.md</c> decision #67.
/// A tick method advanced by the caller (fed by a background timer in prod; by test clock
/// in unit tests) decides whether the configured interval has elapsed and, if so, drives the
/// supplied <see cref="IDriverSupervisor"/> to recycle the Host.
/// </summary>
/// <remarks>
/// Tier A/B drivers MUST NOT use this class — scheduled recycle for in-process drivers would
/// kill every OPC UA session and every co-hosted driver. The ctor throws when constructed
/// with any tier other than C to make the misuse structurally impossible.
///
/// <para>Keeps no background thread of its own — callers invoke <see cref="TickAsync"/> on
/// their ambient scheduler tick (Phase 6.1 Stream C's health-endpoint host runs one). That
/// decouples the unit under test from wall-clock time and thread-pool scheduling.</para>
/// </remarks>
public sealed class ScheduledRecycleScheduler
{
private readonly TimeSpan _recycleInterval;
private readonly IDriverSupervisor _supervisor;
private readonly ILogger<ScheduledRecycleScheduler> _logger;
private DateTime _nextRecycleUtc;
/// <summary>
/// Construct the scheduler for a Tier C driver. Throws if <paramref name="tier"/> isn't C.
/// </summary>
/// <param name="tier">Driver tier; must be <see cref="DriverTier.C"/>.</param>
/// <param name="recycleInterval">Interval between recycles (e.g. 7 days).</param>
/// <param name="startUtc">Anchor time; next recycle fires at <paramref name="startUtc"/> + <paramref name="recycleInterval"/>.</param>
/// <param name="supervisor">Supervisor that performs the actual recycle.</param>
/// <param name="logger">Diagnostic sink.</param>
public ScheduledRecycleScheduler(
DriverTier tier,
TimeSpan recycleInterval,
DateTime startUtc,
IDriverSupervisor supervisor,
ILogger<ScheduledRecycleScheduler> logger)
{
if (tier != DriverTier.C)
throw new ArgumentException(
$"ScheduledRecycleScheduler is Tier C only (got {tier}). " +
"In-process drivers must not use scheduled recycle; see decisions #74 and #145.",
nameof(tier));
if (recycleInterval <= TimeSpan.Zero)
throw new ArgumentException("RecycleInterval must be positive.", nameof(recycleInterval));
_recycleInterval = recycleInterval;
_supervisor = supervisor;
_logger = logger;
_nextRecycleUtc = startUtc + recycleInterval;
}
/// <summary>Next scheduled recycle UTC. Advances by <see cref="RecycleInterval"/> on each fire.</summary>
public DateTime NextRecycleUtc => _nextRecycleUtc;
/// <summary>Recycle interval this scheduler was constructed with.</summary>
public TimeSpan RecycleInterval => _recycleInterval;
/// <summary>
/// Tick the scheduler forward. If <paramref name="utcNow"/> is past
/// <see cref="NextRecycleUtc"/>, requests a recycle from the supervisor and advances
/// <see cref="NextRecycleUtc"/> by exactly one interval. Returns true when a recycle fired.
/// </summary>
public async Task<bool> TickAsync(DateTime utcNow, CancellationToken cancellationToken)
{
if (utcNow < _nextRecycleUtc)
return false;
_logger.LogInformation(
"Scheduled recycle due for Tier C driver {DriverId} at {Now:o}; advancing next to {Next:o}.",
_supervisor.DriverInstanceId, utcNow, _nextRecycleUtc + _recycleInterval);
await _supervisor.RecycleAsync("Scheduled periodic recycle", cancellationToken).ConfigureAwait(false);
_nextRecycleUtc += _recycleInterval;
return true;
}
/// <summary>Request an immediate recycle outside the schedule (e.g. MemoryRecycle hard-breach escalation).</summary>
public Task RequestRecycleNowAsync(string reason, CancellationToken cancellationToken) =>
_supervisor.RecycleAsync(reason, cancellationToken);
}

View File

@@ -0,0 +1,81 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Core.Stability;
/// <summary>
/// Demand-aware driver-wedge detector per <c>docs/v2/plan.md</c> decision #147.
/// Flips a driver to <see cref="WedgeVerdict.Faulted"/> only when BOTH of the following hold:
/// (a) there is pending work outstanding, AND (b) no progress has been observed for longer
/// than <see cref="Threshold"/>. Idle drivers, write-only burst drivers, and subscription-only
/// drivers whose signals don't arrive regularly all stay Healthy.
/// </summary>
/// <remarks>
/// <para>Pending work signal is supplied by the caller via <see cref="DemandSignal"/>:
/// non-zero Polly bulkhead depth, ≥1 active MonitoredItem, or ≥1 queued historian read
/// each qualifies. The detector itself is state-light: all it remembers is the last
/// <c>LastProgressUtc</c> it saw and the last wedge verdict. No history buffer.</para>
///
/// <para>Default threshold per plan: <c>5 × PublishingInterval</c>, with a minimum of 60 s.
/// Concrete values are driver-agnostic and configured per-instance by the caller.</para>
/// </remarks>
public sealed class WedgeDetector
{
/// <summary>Wedge-detection threshold; pass &lt; 60 s and the detector clamps to 60 s.</summary>
public TimeSpan Threshold { get; }
/// <summary>Whether the driver reported itself <see cref="DriverState.Healthy"/> at construction.</summary>
public WedgeDetector(TimeSpan threshold)
{
Threshold = threshold < TimeSpan.FromSeconds(60) ? TimeSpan.FromSeconds(60) : threshold;
}
/// <summary>
/// Classify the current state against the demand signal. Does not retain state across
/// calls — each call is self-contained; the caller owns the <c>LastProgressUtc</c> clock.
/// </summary>
public WedgeVerdict Classify(DriverState state, DemandSignal demand, DateTime utcNow)
{
if (state != DriverState.Healthy)
return WedgeVerdict.NotApplicable;
if (!demand.HasPendingWork)
return WedgeVerdict.Idle;
var sinceProgress = utcNow - demand.LastProgressUtc;
return sinceProgress > Threshold ? WedgeVerdict.Faulted : WedgeVerdict.Healthy;
}
}
/// <summary>
/// Caller-supplied demand snapshot. All three counters are OR'd — any non-zero means work
/// is outstanding, which is the trigger for checking the <see cref="LastProgressUtc"/> clock.
/// </summary>
/// <param name="BulkheadDepth">Polly bulkhead depth (in-flight capability calls).</param>
/// <param name="ActiveMonitoredItems">Number of live OPC UA MonitoredItems bound to this driver.</param>
/// <param name="QueuedHistoryReads">Pending historian-read requests the driver owes the server.</param>
/// <param name="LastProgressUtc">Last time the driver reported a successful unit of work (read, subscribe-ack, publish).</param>
public readonly record struct DemandSignal(
int BulkheadDepth,
int ActiveMonitoredItems,
int QueuedHistoryReads,
DateTime LastProgressUtc)
{
/// <summary>True when any of the three counters is &gt; 0.</summary>
public bool HasPendingWork => BulkheadDepth > 0 || ActiveMonitoredItems > 0 || QueuedHistoryReads > 0;
}
/// <summary>Outcome of a single <see cref="WedgeDetector.Classify"/> call.</summary>
public enum WedgeVerdict
{
/// <summary>Driver wasn't Healthy to begin with — wedge detection doesn't apply.</summary>
NotApplicable,
/// <summary>Driver claims Healthy + no pending work → stays Healthy.</summary>
Idle,
/// <summary>Driver claims Healthy + has pending work + has made progress within the threshold → stays Healthy.</summary>
Healthy,
/// <summary>Driver claims Healthy + has pending work + has NOT made progress within the threshold → wedged.</summary>
Faulted,
}

View File

@@ -0,0 +1,33 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<LangVersion>latest</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<NoWarn>$(NoWarn);CS1591</NoWarn>
<RootNamespace>ZB.MOM.WW.OtOpcUa.Core</RootNamespace>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Core.Abstractions\ZB.MOM.WW.OtOpcUa.Core.Abstractions.csproj"/>
<ProjectReference Include="..\ZB.MOM.WW.OtOpcUa.Configuration\ZB.MOM.WW.OtOpcUa.Configuration.csproj"/>
</ItemGroup>
<ItemGroup>
<PackageReference Include="Polly.Core" Version="8.6.6"/>
<PackageReference Include="Serilog" Version="4.3.0"/>
</ItemGroup>
<ItemGroup>
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Core.Tests"/>
</ItemGroup>
<ItemGroup>
<NuGetAuditSuppress Include="https://github.com/advisories/GHSA-37gx-xxp4-5rgx"/>
<NuGetAuditSuppress Include="https://github.com/advisories/GHSA-w3x6-4m5h-cxqf"/>
</ItemGroup>
</Project>