chore: organize solution into module folders (Core/Server/Drivers/Client/Tooling)

Group all 69 projects into category subfolders under src/ and tests/ so the
Rider Solution Explorer mirrors the module structure. Folders: Core, Server,
Drivers (with a nested Driver CLIs subfolder), Client, Tooling.

- Move every project folder on disk with git mv (history preserved as renames).
- Recompute relative paths in 57 .csproj files: cross-category ProjectReferences,
  the lib/ HintPath+None refs in Driver.Historian.Wonderware, and the external
  mxaccessgw refs in Driver.Galaxy and its test project.
- Rebuild ZB.MOM.WW.OtOpcUa.slnx with nested solution folders.
- Re-prefix project paths in functional scripts (e2e, compliance, smoke SQL,
  integration, install).

Build green (0 errors); unit tests pass. Docs left for a separate pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-17 01:55:28 -04:00
parent 69f02fed7f
commit a25593a9c6
1044 changed files with 365 additions and 343 deletions

View File

@@ -0,0 +1,51 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Builds <see cref="AlarmConditionInfo"/> rows whose five sub-attribute references
/// follow the Galaxy naming convention. According to the original design notes the
/// server-level <c>AlarmConditionService</c> (PR 2.2) consumes these references to
/// subscribe to live alarm-state attributes and to route ack writes back to the alarm tag.
/// </summary>
/// <remarks>
/// Every alarm-bearing attribute exposes four runtime attributes plus a write-only ack
/// target, each named by appending a fixed suffix to the attribute reference:
/// <list type="bullet">
/// <item><c>&lt;tag&gt;.&lt;attr&gt;.InAlarm</c></item>
/// <item><c>&lt;tag&gt;.&lt;attr&gt;.Priority</c></item>
/// <item><c>&lt;tag&gt;.&lt;attr&gt;.DescAttrName</c></item>
/// <item><c>&lt;tag&gt;.&lt;attr&gt;.Acked</c></item>
/// <item><c>&lt;tag&gt;.&lt;attr&gt;.AckMsg</c></item>
/// </list>
/// The legacy <c>GalaxyAlarmTracker</c> hard-coded the same convention; it is kept in one
/// place here so discovery can hand complete <c>AlarmConditionInfo</c> rows to the server
/// without the server knowing the naming rules.
/// </remarks>
internal static class AlarmRefBuilder
{
    private const string InAlarmSuffix = ".InAlarm";
    private const string PrioritySuffix = ".Priority";
    private const string DescAttrNameSuffix = ".DescAttrName";
    private const string AckedSuffix = ".Acked";
    private const string AckMsgSuffix = ".AckMsg";

    /// <summary>
    /// Creates the <see cref="AlarmConditionInfo"/> for one alarm-bearing attribute with
    /// all five sub-attribute references filled in.
    /// </summary>
    /// <param name="fullReference">
    /// Full reference of the alarm-bearing attribute (e.g. <c>"Tank1.Level.HiHi"</c>). It is
    /// used verbatim as the source name and each convention suffix is appended to it.
    /// </param>
    /// <param name="initialSeverity">Severity reported until a live value arrives; defaults to <see cref="AlarmSeverity.Medium"/>.</param>
    /// <param name="initialDescription">Optional initial description; <c>null</c> when none is known.</param>
    /// <returns>The populated alarm-condition descriptor.</returns>
    public static AlarmConditionInfo Build(
        string fullReference,
        AlarmSeverity initialSeverity = AlarmSeverity.Medium,
        string? initialDescription = null)
    {
        return new AlarmConditionInfo(
            SourceName: fullReference,
            InitialSeverity: initialSeverity,
            InitialDescription: initialDescription,
            InAlarmRef: string.Concat(fullReference, InAlarmSuffix),
            PriorityRef: string.Concat(fullReference, PrioritySuffix),
            DescAttrNameRef: string.Concat(fullReference, DescAttrNameSuffix),
            AckedRef: string.Concat(fullReference, AckedSuffix),
            AckMsgWriteRef: string.Concat(fullReference, AckMsgSuffix));
    }
}

View File

@@ -0,0 +1,23 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Converts Galaxy <c>mx_data_type</c> integer codes into <see cref="DriverDataType"/>
/// values. Ported from the legacy <c>GalaxyProxyDriver.MapDataType</c>, including its
/// fallback to <see cref="DriverDataType.String"/> for codes it does not recognise, which
/// keeps wire compatibility with deployed configs while the mapping is tightened through
/// the parity matrix.
/// </summary>
internal static class DataTypeMap
{
    /// <summary>Maps one <c>mx_data_type</c> code to its driver data type.</summary>
    /// <param name="mxDataType">Galaxy data-type code.</param>
    /// <returns>
    /// The matching <see cref="DriverDataType"/> for codes 0 through 5, otherwise
    /// <see cref="DriverDataType.String"/>.
    /// </returns>
    public static DriverDataType Map(int mxDataType)
    {
        switch (mxDataType)
        {
            case 0:
                return DriverDataType.Boolean;
            case 1:
                return DriverDataType.Int32;
            case 2:
                return DriverDataType.Float32;
            case 3:
                return DriverDataType.Float64;
            case 5:
                return DriverDataType.DateTime;
            default:
                // Code 4 is the native string type; every unknown code shares its mapping.
                return DriverDataType.String;
        }
    }
}

View File

@@ -0,0 +1,232 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using MxGateway.Contracts.Proto.Galaxy;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Long-lived consumer of <see cref="IGalaxyDeployWatchSource"/>. Translates the
/// gateway <see cref="DeployEvent"/> stream into
/// <see cref="IRediscoverable.OnRediscoveryNeeded"/>-shaped events whenever the
/// observed <c>time_of_last_deploy</c> actually changes.
/// </summary>
/// <remarks>
/// <para>
/// The first event the gateway emits on subscribe is the bootstrap snapshot
/// carrying the current cached deploy time — even when the caller passed a
/// <c>lastSeenDeployTime</c>, a different gateway instance / cache invalidation
/// may still re-deliver it. The watcher therefore suppresses the first event
/// it observes (once per watcher instance, not once per subscription),
/// recording its (presence, time) pair as the baseline, and only raises
/// rediscover for subsequent events whose pair differs.
/// </para>
/// <para>
/// When <see cref="IGalaxyDeployWatchSource.WatchAsync"/> throws (transport
/// drop, gateway restart) the loop logs a warning, waits with capped
/// exponential backoff, then re-subscribes using the last-observed deploy time
/// so a reconnect doesn't fan out a redundant rediscover for state we already
/// knew about. A stream that completes normally after delivering at least one
/// event is re-opened immediately; a stream that completes without delivering
/// anything shares the same backoff so a gateway that keeps closing the
/// subscription cannot drive the loop into a tight re-subscribe spin.
/// </para>
/// </remarks>
public sealed class DeployWatcher : IDisposable
{
    private static readonly TimeSpan DefaultInitialBackoff = TimeSpan.FromSeconds(1);
    private static readonly TimeSpan DefaultMaxBackoff = TimeSpan.FromSeconds(30);

    private readonly IGalaxyDeployWatchSource _source;
    private readonly ILogger _logger;
    private readonly TimeSpan _initialBackoff;
    private readonly TimeSpan _maxBackoff;
    private readonly Func<int, TimeSpan>? _jitter;
    private CancellationTokenSource? _cts;
    private Task? _loopTask;
    private int _started; // 0 = not started, 1 = started

    /// <inheritdoc cref="IRediscoverable.OnRediscoveryNeeded"/>
    public event EventHandler<RediscoveryEventArgs>? OnRediscoveryNeeded;

    /// <summary>
    /// Creates a watcher with the default retry policy (1 s initial backoff, 30 s cap,
    /// random jitter).
    /// </summary>
    /// <param name="source">Deploy-event source to consume; must not be <c>null</c>.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>.</exception>
    public DeployWatcher(IGalaxyDeployWatchSource source, ILogger? logger = null)
        : this(source, logger, DefaultInitialBackoff, DefaultMaxBackoff, jitter: null)
    {
    }

    /// <summary>
    /// Test-only ctor lets tests collapse the retry backoff so a fault-injection
    /// scenario doesn't sit in <see cref="Task.Delay(TimeSpan, CancellationToken)"/>.
    /// </summary>
    /// <param name="source">Deploy-event source to consume; must not be <c>null</c>.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when <c>null</c>.</param>
    /// <param name="initialBackoff">Delay before the first retry; also the floor the backoff resets to.</param>
    /// <param name="maxBackoff">Upper bound for the doubled backoff (jitter is added on top).</param>
    /// <param name="jitter">
    /// Optional jitter provider receiving the 1-based retry attempt; when <c>null</c> a
    /// random jitter of up to a quarter of the current backoff is used.
    /// </param>
    internal DeployWatcher(
        IGalaxyDeployWatchSource source,
        ILogger? logger,
        TimeSpan initialBackoff,
        TimeSpan maxBackoff,
        Func<int, TimeSpan>? jitter)
    {
        _source = source ?? throw new ArgumentNullException(nameof(source));
        _logger = logger ?? NullLogger.Instance;
        _initialBackoff = initialBackoff;
        _maxBackoff = maxBackoff;
        _jitter = jitter;
    }

    /// <summary>
    /// Kicks off the background watch loop. Returns immediately once the loop task
    /// has been scheduled — the loop itself runs until <see cref="StopAsync"/> or
    /// the supplied <paramref name="cancellationToken"/> is signaled.
    /// </summary>
    /// <param name="cancellationToken">Token that stops the loop when signaled.</param>
    /// <returns>An already-completed task.</returns>
    /// <exception cref="InvalidOperationException">The watcher was already started.</exception>
    public Task StartAsync(CancellationToken cancellationToken)
    {
        if (Interlocked.Exchange(ref _started, 1) != 0)
        {
            throw new InvalidOperationException(
                "DeployWatcher.StartAsync has already been called. Construct a new instance to restart.");
        }

        var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);

        // Capture the token before scheduling so the thread-pool callback never reads
        // the mutable _cts field, which StopAsync clears.
        CancellationToken loopToken = cts.Token;
        _cts = cts;
        _loopTask = Task.Run(() => RunLoopAsync(loopToken), CancellationToken.None);
        return Task.CompletedTask;
    }

    /// <summary>Cancels the loop and waits for it to exit cleanly.</summary>
    /// <returns>A task that completes once the loop has exited; a no-op if the watcher is not running.</returns>
    public async Task StopAsync()
    {
        var cts = _cts;
        var loop = _loopTask;
        if (cts is null || loop is null) return;

        try { cts.Cancel(); } catch (ObjectDisposedException) { }

        try
        {
            await loop.ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Expected: cancellation propagated up from the source enumerator.
        }
        finally
        {
            cts.Dispose();
            _cts = null;
            _loopTask = null;
        }
    }

    /// <summary>
    /// Stops the loop if it is running, blocking the calling thread until it has exited.
    /// </summary>
    public void Dispose()
    {
        if (_loopTask is null) return;
        StopAsync().GetAwaiter().GetResult();
    }

    /// <summary>
    /// Subscribe / consume / re-subscribe loop. Runs until <paramref name="cancellationToken"/>
    /// is signaled; source failures are logged and retried, never propagated.
    /// </summary>
    private async Task RunLoopAsync(CancellationToken cancellationToken)
    {
        DateTimeOffset? lastSeenDeployTime = null;
        bool? lastSeenPresent = null;
        bool baselineCaptured = false;
        TimeSpan backoff = _initialBackoff;
        int attempt = 0;

        while (!cancellationToken.IsCancellationRequested)
        {
            bool receivedAny = false;
            Exception? failure = null;

            try
            {
                await foreach (DeployEvent ev in _source
                    .WatchAsync(lastSeenDeployTime, cancellationToken)
                    .WithCancellation(cancellationToken)
                    .ConfigureAwait(false))
                {
                    // Successful read — reset retry state.
                    receivedAny = true;
                    backoff = _initialBackoff;
                    attempt = 0;

                    DateTimeOffset? observedTime = ev.TimeOfLastDeployPresent && ev.TimeOfLastDeploy is not null
                        ? ev.TimeOfLastDeploy.ToDateTimeOffset()
                        : null;
                    bool observedPresent = ev.TimeOfLastDeployPresent;

                    if (!baselineCaptured)
                    {
                        // Bootstrap event — record state and suppress.
                        baselineCaptured = true;
                        lastSeenDeployTime = observedTime;
                        lastSeenPresent = observedPresent;
                        _logger.LogDebug(
                            "DeployWatcher bootstrap event sequence={Sequence} present={Present} time={Time} suppressed.",
                            ev.Sequence, observedPresent, observedTime);
                        continue;
                    }

                    bool presenceFlipped = lastSeenPresent != observedPresent;
                    bool timeChanged = observedPresent && lastSeenDeployTime != observedTime;
                    if (!presenceFlipped && !timeChanged)
                    {
                        _logger.LogDebug(
                            "DeployWatcher event sequence={Sequence} matches last-seen state; skipping rediscover.",
                            ev.Sequence);
                        continue;
                    }

                    lastSeenDeployTime = observedTime;
                    lastSeenPresent = observedPresent;

                    string? scopeHint = observedTime?.ToString("O");
                    var args = new RediscoveryEventArgs("deploy-time-changed", scopeHint);
                    _logger.LogInformation(
                        "DeployWatcher raising rediscover sequence={Sequence} reason={Reason} scopeHint={ScopeHint}.",
                        ev.Sequence, args.Reason, args.ScopeHint);
                    RaiseRediscoveryNeeded(args);
                }

                if (receivedAny || cancellationToken.IsCancellationRequested)
                {
                    // Stream completed normally after doing useful work (or we are shutting
                    // down) — re-open immediately; the loop condition handles shutdown.
                    _logger.LogDebug("DeployWatcher stream completed; re-subscribing.");
                    continue;
                }

                // Stream completed without yielding anything: fall through to the shared
                // backoff so a gateway that keeps closing the stream is not hammered.
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                failure = ex;
            }

            attempt++;
            TimeSpan jitterAmount = _jitter?.Invoke(attempt) ?? RandomJitter(backoff);
            TimeSpan delay = backoff + jitterAmount;

            if (failure is not null)
            {
                _logger.LogWarning(failure,
                    "DeployWatcher source threw; retrying in {Delay} (attempt {Attempt}, last-seen time {LastSeen}).",
                    delay, attempt, lastSeenDeployTime);
            }
            else
            {
                _logger.LogDebug(
                    "DeployWatcher stream completed without delivering an event; re-subscribing in {Delay} (attempt {Attempt}).",
                    delay, attempt);
            }

            try
            {
                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                break;
            }

            backoff = NextBackoff(backoff);
        }
    }

    /// <summary>
    /// Invokes every <see cref="OnRediscoveryNeeded"/> subscriber individually so one
    /// throwing handler is logged without preventing the remaining handlers from running.
    /// </summary>
    private void RaiseRediscoveryNeeded(RediscoveryEventArgs args)
    {
        var handlers = OnRediscoveryNeeded;
        if (handlers is null) return;

        foreach (EventHandler<RediscoveryEventArgs> handler in handlers.GetInvocationList())
        {
            try
            {
                handler(this, args);
            }
            catch (Exception handlerEx)
            {
                _logger.LogError(handlerEx,
                    "DeployWatcher subscriber threw while handling rediscover; continuing.");
            }
        }
    }

    /// <summary>Doubles the backoff, capped at the configured maximum and floored at the initial value.</summary>
    private TimeSpan NextBackoff(TimeSpan current)
    {
        var doubled = TimeSpan.FromTicks(Math.Min(_maxBackoff.Ticks, current.Ticks * 2));
        return doubled < _initialBackoff ? _initialBackoff : doubled;
    }

    /// <summary>
    /// Returns a non-negative random jitter strictly below 25% of <paramref name="baseDelay"/>
    /// (zero when the base delay is under four ticks).
    /// </summary>
    private static TimeSpan RandomJitter(TimeSpan baseDelay)
    {
        // Additive only: the jitter lengthens the delay, it never shortens it.
        long maxTicks = Math.Max(1L, baseDelay.Ticks / 4);
        long ticks = Random.Shared.NextInt64(0, maxTicks);
        return TimeSpan.FromTicks(ticks);
    }
}

View File

@@ -0,0 +1,91 @@
using MxGateway.Contracts.Proto.Galaxy;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Walks the Galaxy object hierarchy supplied by an <see cref="IGalaxyHierarchySource"/>
/// and replays it as <see cref="IAddressSpaceBuilder"/> calls: one folder per gobject and
/// one variable per attribute. Attributes flagged as alarms additionally receive the full
/// sub-attribute reference set from <see cref="AlarmRefBuilder"/>, which per the design
/// notes lets the server-level alarm subsystem (PR 2.2) subscribe and ack without further
/// help from the driver.
/// </summary>
/// <remarks>
/// Materialisation rules (mirroring the legacy <c>MxAccessGalaxyBackend.DiscoverAsync</c>):
/// <list type="bullet">
/// <item>Browse name is <c>contained_name</c> when non-empty, otherwise <c>tag_name</c>;
/// objects with neither are skipped.</item>
/// <item>Each gobject becomes a folder and its variables are placed inside that folder.</item>
/// <item>A variable's full reference is the gateway-supplied <c>full_tag_reference</c>
/// (with a trailing <c>[]</c> removed) when present, otherwise
/// <c>tag_name.attribute_name</c> — the addressing form MXAccess expects for read/write.</item>
/// <item>The hierarchy is rendered flat (every folder directly under the builder it is
/// given) for this PR. PR 4.W's address-space wiring revisits whether to nest under
/// <c>parent_gobject_id</c> for a true tree shape.</item>
/// </list>
/// </remarks>
public sealed class GalaxyDiscoverer
{
    private const string ArraySuffix = "[]";

    private readonly IGalaxyHierarchySource _source;

    /// <summary>Creates a discoverer that reads the hierarchy from <paramref name="source"/>.</summary>
    /// <param name="source">Hierarchy provider; must not be <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>.</exception>
    public GalaxyDiscoverer(IGalaxyHierarchySource source)
    {
        ArgumentNullException.ThrowIfNull(source);
        _source = source;
    }

    /// <summary>
    /// Feeds the supplied builder one folder plus N variables for every Galaxy object the
    /// source returns. The original notes describe this as idempotent, so a caller may
    /// re-invoke it after a redeploy event.
    /// </summary>
    /// <param name="builder">Address-space builder to populate; must not be <c>null</c>.</param>
    /// <param name="cancellationToken">Forwarded to the hierarchy fetch.</param>
    /// <exception cref="ArgumentNullException"><paramref name="builder"/> is <c>null</c>.</exception>
    public async Task DiscoverAsync(IAddressSpaceBuilder builder, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(builder);

        var hierarchy = await _source.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);
        foreach (var galaxyObject in hierarchy)
        {
            // Prefer the contained name; fall back to the tag name.
            var folderName = galaxyObject.ContainedName;
            if (string.IsNullOrEmpty(folderName))
            {
                folderName = galaxyObject.TagName;
            }

            if (string.IsNullOrEmpty(folderName))
            {
                // No usable identity — nothing to browse under.
                continue;
            }

            var objectFolder = builder.Folder(folderName, folderName);
            foreach (var attribute in galaxyObject.Attributes)
            {
                var attributeName = attribute.AttributeName;
                if (string.IsNullOrEmpty(attributeName))
                {
                    continue;
                }

                string reference;
                if (string.IsNullOrEmpty(attribute.FullTagReference))
                {
                    reference = $"{galaxyObject.TagName}.{attributeName}";
                }
                else
                {
                    reference = StripArraySuffix(attribute.FullTagReference);
                }

                // Only a positive, explicitly-present dimension on an array attribute counts.
                uint? arrayDimension = null;
                if (attribute.IsArray && attribute.ArrayDimensionPresent && attribute.ArrayDimension > 0)
                {
                    arrayDimension = (uint)attribute.ArrayDimension;
                }

                var descriptor = new DriverAttributeInfo(
                    FullName: reference,
                    DriverDataType: DataTypeMap.Map(attribute.MxDataType),
                    IsArray: attribute.IsArray,
                    ArrayDim: arrayDimension,
                    SecurityClass: SecurityMap.Map(attribute.SecurityClassification),
                    IsHistorized: attribute.IsHistorized,
                    IsAlarm: attribute.IsAlarm);

                var variable = objectFolder.Variable(attributeName, attributeName, descriptor);
                if (!attribute.IsAlarm)
                {
                    continue;
                }

                // Alarm-bearing attributes carry the complete sub-attribute ref set so the
                // server's AlarmConditionService need not re-derive the names.
                variable.MarkAsAlarmCondition(AlarmRefBuilder.Build(reference));
            }
        }
    }

    // PR 5.W workaround for mxaccessgw GalaxyRepository.cs:173-175 — the gateway's
    // SQL appends `[]` to array-typed `full_tag_reference` values, but MxAccess COM
    // `IInstance.AddItem` doesn't accept `[]`-suffixed addresses (so any downstream
    // Subscribe/Read/Write through the worker would fail with the suffixed form).
    // Strip defensively here so the parity matrix can run today; remove once the
    // gw fix (mxaccessgw/requirements-array-suffix-fix.md) lands.
    private static string StripArraySuffix(string fullReference)
    {
        if (!fullReference.EndsWith(ArraySuffix, StringComparison.Ordinal))
        {
            return fullReference;
        }

        return fullReference.Substring(0, fullReference.Length - ArraySuffix.Length);
    }
}

View File

@@ -0,0 +1,26 @@
using MxGateway.Client;
using MxGateway.Contracts.Proto.Galaxy;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Production <see cref="IGalaxyDeployWatchSource"/> backed by the gateway's
/// <see cref="GalaxyRepositoryClient"/>. It passes
/// <c>WatchDeployEventsAsync(lastSeenDeployTime, ct)</c> straight through — paging /
/// bootstrap suppression policy lives on the gateway, while
/// <see cref="DeployWatcher"/> owns the change-detection and reconnect-loop
/// concerns above this seam.
/// </summary>
/// <param name="client">Gateway repository client to delegate to; must not be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="client"/> is <c>null</c>.</exception>
public sealed class GatewayGalaxyDeployWatchSource(GalaxyRepositoryClient client) : IGalaxyDeployWatchSource
{
    private readonly GalaxyRepositoryClient _client =
        client ?? throw new ArgumentNullException(nameof(client));

    /// <inheritdoc />
    public IAsyncEnumerable<DeployEvent> WatchAsync(
        DateTimeOffset? lastSeenDeployTime, CancellationToken cancellationToken)
    {
        return _client.WatchDeployEventsAsync(lastSeenDeployTime, cancellationToken);
    }
}

View File

@@ -0,0 +1,21 @@
using MxGateway.Client;
using MxGateway.Contracts.Proto.Galaxy;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Production <see cref="IGalaxyHierarchySource"/> backed by the gateway's
/// <see cref="GalaxyRepositoryClient"/>. Paging is handled inside the client overload
/// this class calls.
/// </summary>
/// <param name="client">Gateway repository client to delegate to; must not be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="client"/> is <c>null</c>.</exception>
public sealed class GatewayGalaxyHierarchySource(GalaxyRepositoryClient client) : IGalaxyHierarchySource
{
    private readonly GalaxyRepositoryClient _client =
        client ?? throw new ArgumentNullException(nameof(client));

    /// <inheritdoc />
    public Task<IReadOnlyList<GalaxyObject>> GetHierarchyAsync(CancellationToken cancellationToken)
    {
        return _client.DiscoverHierarchyAsync(cancellationToken);
    }
}

View File

@@ -0,0 +1,24 @@
using MxGateway.Contracts.Proto.Galaxy;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Driver-side seam between <see cref="DeployWatcher"/> and the gateway. Production
/// wraps <c>GalaxyRepositoryClient.WatchDeployEventsAsync</c>; tests substitute a fake
/// yielding controlled <see cref="DeployEvent"/> instances so the watcher's bootstrap
/// suppression, change detection, reconnect, and shutdown semantics can be exercised
/// without a real gRPC stream.
/// </summary>
public interface IGalaxyDeployWatchSource
{
/// <summary>
/// Subscribe to Galaxy deploy events. The server emits a bootstrap event with the
/// current cached state on subscribe, then one event per new
/// <c>time_of_last_deploy</c>. Pass <paramref name="lastSeenDeployTime"/> to ask the
/// gateway to suppress its bootstrap when the caller already has the current value;
/// <see cref="DeployWatcher"/> still suppresses the first event it observes locally
/// so a transport reconnect doesn't re-fire on identical state.
/// </summary>
/// <param name="lastSeenDeployTime">
/// Deploy time the caller last observed, or <c>null</c> when it has none yet.
/// <see cref="DeployWatcher"/> passes <c>null</c> on its first subscription and its
/// last-observed value on every re-subscription.
/// </param>
/// <param name="cancellationToken">Token used to end the subscription.</param>
/// <returns>
/// An asynchronous stream of deploy events. <see cref="DeployWatcher"/> tolerates the
/// stream both completing normally and throwing, re-subscribing in either case.
/// </returns>
IAsyncEnumerable<DeployEvent> WatchAsync(
DateTimeOffset? lastSeenDeployTime, CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,19 @@
using MxGateway.Contracts.Proto.Galaxy;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Driver-side seam between <see cref="GalaxyDiscoverer"/> and the gateway. Production
/// wraps <c>GalaxyRepositoryClient</c>; tests substitute a fake returning canned
/// <see cref="GalaxyObject"/> rows so the discoverer's translation logic can be exercised
/// without a real gRPC channel.
/// </summary>
public interface IGalaxyHierarchySource
{
/// <summary>
/// Returns the full materialised Galaxy hierarchy. The gateway client pages
/// internally; this interface deliberately exposes only the post-paging shape so
/// callers don't reimplement paging.
/// </summary>
/// <param name="cancellationToken">Token used to abandon the fetch.</param>
/// <returns>
/// Every Galaxy object in a single list. <see cref="GalaxyDiscoverer"/> enumerates the
/// result once and reads each object's names and attribute collection.
/// </returns>
Task<IReadOnlyList<GalaxyObject>> GetHierarchyAsync(CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,25 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// Converts Galaxy <c>security_classification</c> integer codes into
/// <see cref="SecurityClassification"/> values. Ported from the legacy
/// <c>GalaxyProxyDriver.MapSecurity</c>; codes it does not recognise fall back to
/// <see cref="SecurityClassification.FreeAccess"/> so a forward-compatible Galaxy
/// deployment with new classifications doesn't break discovery.
/// </summary>
/// <remarks>
/// NOTE(review): the fallback for unknown codes is <c>FreeAccess</c>; confirm that
/// treating an unrecognised classification this way is intended from a security standpoint.
/// </remarks>
internal static class SecurityMap
{
    /// <summary>Maps one <c>security_classification</c> code to its classification.</summary>
    /// <param name="mxSec">Galaxy security-classification code.</param>
    /// <returns>
    /// The matching <see cref="SecurityClassification"/> for codes 0 through 6, otherwise
    /// <see cref="SecurityClassification.FreeAccess"/>.
    /// </returns>
    public static SecurityClassification Map(int mxSec)
    {
        switch (mxSec)
        {
            case 1:
                return SecurityClassification.Operate;
            case 2:
                return SecurityClassification.SecuredWrite;
            case 3:
                return SecurityClassification.VerifiedWrite;
            case 4:
                return SecurityClassification.Tune;
            case 5:
                return SecurityClassification.Configure;
            case 6:
                return SecurityClassification.ViewOnly;
            default:
                // Code 0 is FreeAccess; every unknown code shares its mapping.
                return SecurityClassification.FreeAccess;
        }
    }
}

View File

@@ -0,0 +1,30 @@
using MxGateway.Contracts.Proto.Galaxy;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
/// <summary>
/// PR 6.1 — Decorator that emits one <see cref="System.Diagnostics.Activity"/> span
/// per <c>GetHierarchy</c> RPC. <c>galaxy.object_count</c> on the span lets ops
/// correlate slow Discover passes with Galaxy size without instrumenting the
/// discoverer's translation step.
/// </summary>
/// <param name="inner">Hierarchy source whose call is wrapped in the span.</param>
/// <param name="clientName">Value recorded on the span as the <c>galaxy.client</c> tag.</param>
internal sealed class TracedGalaxyHierarchySource(IGalaxyHierarchySource inner, string clientName) : IGalaxyHierarchySource
{
    /// <summary>
    /// Delegates to the wrapped source inside a <c>galaxy.get_hierarchy</c> activity,
    /// tagging the span with the client name and, on success, the returned object count.
    /// Failures are recorded on the span and rethrown unchanged.
    /// </summary>
    /// <param name="cancellationToken">Forwarded to the wrapped source.</param>
    /// <returns>The hierarchy returned by the wrapped source.</returns>
    public async Task<IReadOnlyList<GalaxyObject>> GetHierarchyAsync(CancellationToken cancellationToken)
    {
        // StartActivity yields null when nothing is listening to the ActivitySource, so
        // every use of the activity below must be null-conditional.
        using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.get_hierarchy");
        activity?.SetTag("galaxy.client", clientName);

        try
        {
            var hierarchy = await inner.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);
            activity?.SetTag("galaxy.object_count", hierarchy.Count);
            return hierarchy;
        }
        catch (Exception ex)
        {
            // Null-conditional like the tag calls above: with tracing disabled there is no
            // span to annotate, and the original exception must still surface untouched.
            activity?.RecordError(ex);
            throw;
        }
    }
}

View File

@@ -0,0 +1,86 @@
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
/// <summary>
/// Driver-instance options for the in-process .NET 10 Galaxy driver. Maps to the
/// <c>DriverConfig</c> JSON column on the central config DB. Decomposed into nested
/// records so the JSON structure mirrors the runtime shape and operators can target
/// individual sections (gateway endpoint, mxaccess client identity, reconnect policy)
/// without touching the rest.
/// </summary>
/// <remarks>
/// All four sections are required positional members of this record. The
/// <see cref="GalaxyRepositoryOptions"/> and <see cref="GalaxyReconnectOptions"/> records
/// default every one of their own members, whereas <see cref="GalaxyGatewayOptions"/> and
/// <see cref="GalaxyMxAccessOptions"/> each have members without defaults.
/// </remarks>
/// <param name="Gateway">Connection details for the MxAccess gateway (mxaccessgw repo).</param>
/// <param name="MxAccess">MXAccess-specific knobs surfaced through gw — client name, publishing interval, write-user.</param>
/// <param name="Repository">Galaxy Repository browse options consumed by the discoverer.</param>
/// <param name="Reconnect">Backoff knobs for the in-driver reconnect supervisor (PR 4.5).</param>
public sealed record GalaxyDriverOptions(
GalaxyGatewayOptions Gateway,
GalaxyMxAccessOptions MxAccess,
GalaxyRepositoryOptions Repository,
GalaxyReconnectOptions Reconnect);
/// <summary>
/// Connection details for the MxAccess gateway. <see cref="ApiKeySecretRef"/> resolves
/// through the server-side secret store (DPAPI for production, environment override for
/// dev) — the API key never appears in cleartext config.
/// </summary>
/// <param name="Endpoint">Gateway endpoint to connect to. Required; the expected address format is not defined here.</param>
/// <param name="ApiKeySecretRef">Reference to the API key in the secret store (not the key itself). Required.</param>
/// <param name="UseTls">Whether TLS is used; defaults to <c>true</c>. Presumably applies to the gateway channel — confirm where the client options are built.</param>
/// <param name="CaCertificatePath">Optional CA certificate path; <c>null</c> by default. Presumably only relevant when <paramref name="UseTls"/> is set — TODO confirm.</param>
/// <param name="ConnectTimeoutSeconds">Connect timeout in seconds; defaults to 10 (see the tuning notes on this record).</param>
/// <param name="DefaultCallTimeoutSeconds">Per-call timeout in seconds; defaults to 30 (see the tuning notes on this record).</param>
/// <param name="StreamTimeoutSeconds">Stream timeout in seconds; the default 0 means unlimited (see the tuning notes on this record).</param>
// PR 6.5 tuning notes:
// ConnectTimeoutSeconds = 10 — cold-start network path comfort margin; soak runs
// never saw a successful connect take >2s, so 10s is generous without being lax.
// DefaultCallTimeoutSeconds = 30 — bumped from 5s because a 50k-tag SubscribeBulk
// can exceed 5s under MxAccess COM contention (the worker walks the gw item list
// serially under the apartment lock). 30s leaves comfortable headroom for the
// legitimate worst case while still failing fast on a wedged worker.
// StreamTimeoutSeconds = 0 — unlimited; the StreamEvents RPC must run for the
// lifetime of the driver. Set a finite value only for diagnostic runs.
public sealed record GalaxyGatewayOptions(
string Endpoint,
string ApiKeySecretRef,
bool UseTls = true,
string? CaCertificatePath = null,
int ConnectTimeoutSeconds = 10,
int DefaultCallTimeoutSeconds = 30,
int StreamTimeoutSeconds = 0);
/// <summary>
/// MXAccess-specific knobs the gateway forwards to the worker process.
/// </summary>
/// <remarks>
/// Only <paramref name="ClientName"/> is required; every other member has a default.
/// </remarks>
/// <param name="ClientName">
/// Wonderware client identity. MUST be unique per OtOpcUa instance — when two instances
/// share a name, the older session loses subscription state. Redundancy pairs (decision
/// #149) enforce uniqueness via install scripts.
/// </param>
/// <param name="PublishingIntervalMs">
/// Hint forwarded as <c>buffered_update_interval_ms</c> on subscribe; lets the worker
/// coalesce updates at the OPC UA publishing cadence rather than every COM tick.
/// Defaults to 1000 ms.
/// </param>
/// <param name="WriteUserId">
/// Reserved for ArchestrA secured-write user mapping; PR 4.3 wires <c>WriteSecured</c>
/// routing against this id. 0 = anonymous (the default).
/// </param>
/// <param name="EventPumpChannelCapacity">
/// Bounded-channel size between the EventPump's network-read loop and its listener
/// fan-out loop (PR 6.2). Default 50_000 = one second of headroom at 50k tags / 1Hz;
/// raise it when <c>galaxy.events.dropped</c> shows up under transient consumer
/// slowness, lower it on a memory-tight host where the headroom isn't needed.
/// </param>
public sealed record GalaxyMxAccessOptions(
string ClientName,
int PublishingIntervalMs = 1000,
int WriteUserId = 0,
int EventPumpChannelCapacity = 50_000);
/// <summary>
/// Galaxy Repository browse-side knobs consumed by PR 4.1's <c>GalaxyDiscoverer</c>.
/// </summary>
/// <param name="DiscoverPageSize">
/// Page size for hierarchy discovery; defaults to 5000. Presumably forwarded to the
/// gateway client's paged discover call — TODO confirm at the call site.
/// </param>
/// <param name="WatchDeployEvents">
/// Defaults to <c>true</c>. Presumably controls whether the driver starts its deploy
/// watcher — TODO confirm where the watcher is started.
/// </param>
public sealed record GalaxyRepositoryOptions(
int DiscoverPageSize = 5000,
bool WatchDeployEvents = true);
/// <summary>
/// Backoff knobs for the in-driver reconnect supervisor (PR 4.5). Replay-on-session-lost
/// calls the gw's <c>ReplaySubscriptions</c> RPC after reconnect rather than re-issuing
/// subscribe-bulk for every tag.
/// </summary>
/// <param name="InitialBackoffMs">Initial reconnect backoff in milliseconds; defaults to 500.</param>
/// <param name="MaxBackoffMs">Upper bound for the reconnect backoff in milliseconds; defaults to 30 000.</param>
/// <param name="ReplayOnSessionLost">
/// Enables the replay behaviour described in the summary; defaults to <c>true</c>.
/// </param>
public sealed record GalaxyReconnectOptions(
int InitialBackoffMs = 500,
int MaxBackoffMs = 30_000,
bool ReplayOnSessionLost = true);

View File

@@ -0,0 +1,960 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using MxGateway.Client;
using MxGateway.Contracts.Proto;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
/// In-process .NET 10 Galaxy driver — the v2 replacement for the Galaxy.Host /
/// Galaxy.Proxy pair. PR 4.0 ships the project skeleton with <see cref="IDriver"/>
/// bodies that wire to a future <c>IGalaxyGatewayClient</c> abstraction. Capability
/// interfaces (browse, read, write, subscribe, history routing, host probes) land in
/// PRs 4.1–4.7; the wiring sequence keeps every intermediate state buildable so the
/// <c>Galaxy:Backend</c> flag (PR 4.W) can flip between legacy-host and mxgateway
/// for parity testing.
/// </summary>
/// <remarks>
/// This driver is registered as a Tier A in-process driver alongside Modbus / S7 / etc.
/// The legacy <c>GalaxyProxyDriver</c> (Driver.Galaxy.Proxy) coexists until PR 7.2;
/// <see cref="GalaxyDriverFactoryExtensions"/> registers under driver-type name
/// "GalaxyMxGateway" so both paths can be live simultaneously during parity testing.
/// </remarks>
public sealed class GalaxyDriver
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IRediscoverable, IHostConnectivityProbe, IAlarmSource, IDisposable
{
private readonly string _driverInstanceId;
private readonly GalaxyDriverOptions _options;
private readonly ILogger<GalaxyDriver> _logger;
// PR 4.1 — IGalaxyHierarchySource is the test seam for browse. When null, the driver
// lazily builds a GatewayGalaxyHierarchySource around a GalaxyRepositoryClient on
// first DiscoverAsync. Tests inject a fake source via the internal ctor to exercise
// GalaxyDiscoverer's translation logic without a real gRPC channel.
private IGalaxyHierarchySource? _hierarchySource;
private GalaxyRepositoryClient? _ownedRepositoryClient;
// PR 4.2 — IGalaxyDataReader is the test seam for IReadable. PR 4.4 supplies the
// production implementation that wraps GalaxyMxSession's SubscribeBulk + StreamEvents
// pump; until then ReadAsync throws NotSupportedException when the reader is null
// (legacy-host backend handles reads in production via DriverNodeManager's
// capability-routing).
private IGalaxyDataReader? _dataReader;
// PR 4.3 — IGalaxyDataWriter is the test seam for IWritable. Production wraps
// GalaxyMxSession via GatewayGalaxyDataWriter (Write / WriteSecured routing). The
// per-tag SecurityClassification map is populated during ITagDiscovery and consumed
// here at write time.
private IGalaxyDataWriter? _dataWriter;
private readonly System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification>
_securityByFullRef = new(StringComparer.OrdinalIgnoreCase);
// PR 4.4 — subscription lifecycle. The pump consumes the gw event stream and fans
// out OnDataChange events to every registered driver subscription via the registry's
// reverse map. The subscriber is the test seam — production uses
// GatewayGalaxySubscriber over a connected GalaxyMxSession.
private IGalaxySubscriber? _subscriber;
private readonly SubscriptionRegistry _subscriptions = new();
private EventPump? _eventPump;
private readonly Lock _pumpLock = new();
// PR B.2 — IAlarmSource implementation. Production-side acks route through
// GatewayGalaxyAlarmAcknowledger which calls MxGatewayClient.AcknowledgeAlarmAsync
// (PR E.2 SDK). Tests inject IGalaxyAlarmAcknowledger via the internal ctor to
// exercise the wiring without a running gateway. The alarm event stream is
// delivered by EventPump.OnAlarmTransition (PR B.1) — this driver is the
// consumer that bridges it onto IAlarmSource.OnAlarmEvent.
private IGalaxyAlarmAcknowledger? _alarmAcknowledger;
private readonly Lock _alarmHandlersLock = new();
private readonly HashSet<GalaxyAlarmSubscriptionHandle> _alarmSubscriptions = new();
// PR 4.W — production runtime owned by InitializeAsync. The driver builds these
// when it opens a real gw session; tests bypass them by injecting seams via the
// internal ctor.
private GalaxyMxSession? _ownedMxSession;
private MxGatewayClient? _ownedMxClient;
// PR 4.5 — reconnect supervisor. Reflects in DriverState.Degraded while not Healthy.
private ReconnectSupervisor? _supervisor;
// PR 4.6 — IRediscoverable plumbing.
private DeployWatcher? _deployWatcher;
// PR 4.7 — IHostConnectivityProbe plumbing. The aggregator owns the merged
// transport+per-platform view; the forwarder is fed from the supervisor on
// transport state transitions; the probe watcher subscribes ScanState attributes
// for every discovered platform and pushes value changes to the aggregator.
private readonly HostStatusAggregator _hostStatuses = new();
private HostConnectivityForwarder? _transportForwarder;
private PerPlatformProbeWatcher? _probeWatcher;
private DriverHealth _health = new(DriverState.Unknown, null, null);
private bool _disposed;
/// <summary>
/// Server-pushed data-change notification. Fires from the
/// <see cref="EventPump"/>'s background loop; handlers should be cheap (or queue
/// onto another thread) to avoid blocking the gw event stream.
/// </summary>
public event EventHandler<DataChangeEventArgs>? OnDataChange;
/// <summary>Fires when the gateway signals a deploy-time change (PR 4.6 DeployWatcher).</summary>
public event EventHandler<RediscoveryEventArgs>? OnRediscoveryNeeded;
/// <summary>Fires when a host transitions Running ↔ Stopped (PR 4.7 HostStatusAggregator).</summary>
public event EventHandler<HostStatusChangedEventArgs>? OnHostStatusChanged;
/// <inheritdoc />
public event EventHandler<AlarmEventArgs>? OnAlarmEvent;
/// <summary>
/// Production constructor. Delegates to the seam-injecting constructor with every
/// seam left <c>null</c>.
/// </summary>
/// <param name="driverInstanceId">Identifier of this driver instance; must not be blank.</param>
/// <param name="options">Driver options; must not be <c>null</c>.</param>
/// <param name="logger">Optional logger; a no-op logger is used when <c>null</c>.</param>
public GalaxyDriver(
    string driverInstanceId,
    GalaxyDriverOptions options,
    ILogger<GalaxyDriver>? logger = null)
    : this(
        driverInstanceId: driverInstanceId,
        options: options,
        hierarchySource: null,
        dataReader: null,
        dataWriter: null,
        subscriber: null,
        alarmAcknowledger: null,
        logger: logger)
{
}
/// <summary>
/// Test-visible ctor — lets tests supply their own seams so <see cref="DiscoverAsync"/>,
/// <see cref="ReadAsync"/>, <see cref="WriteAsync"/>, and
/// <see cref="SubscribeAsync"/> can run against canned data without
/// building real gRPC channels.
/// </summary>
/// <exception cref="ArgumentException"><paramref name="driverInstanceId"/> is null, empty, or whitespace.</exception>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
internal GalaxyDriver(
    string driverInstanceId,
    GalaxyDriverOptions options,
    IGalaxyHierarchySource? hierarchySource,
    IGalaxyDataReader? dataReader = null,
    IGalaxyDataWriter? dataWriter = null,
    IGalaxySubscriber? subscriber = null,
    IGalaxyAlarmAcknowledger? alarmAcknowledger = null,
    ILogger<GalaxyDriver>? logger = null)
{
    if (string.IsNullOrWhiteSpace(driverInstanceId))
    {
        throw new ArgumentException("Driver instance id required.", nameof(driverInstanceId));
    }

    ArgumentNullException.ThrowIfNull(options);

    _driverInstanceId = driverInstanceId;
    _options = options;
    _logger = logger ?? NullLogger<GalaxyDriver>.Instance;

    // Optional seams; any that stay null are left for the driver to resolve later.
    _hierarchySource = hierarchySource;
    _dataReader = dataReader;
    _dataWriter = dataWriter;
    _subscriber = subscriber;
    _alarmAcknowledger = alarmAcknowledger;

    // Re-raise the aggregator's transitions as this driver's own event, with the
    // driver (not the aggregator) as sender.
    _hostStatuses.OnHostStatusChanged += (_, statusArgs) =>
    {
        OnHostStatusChanged?.Invoke(this, statusArgs);
    };
}
/// <inheritdoc />
/// <remarks>The value supplied at construction; the constructor rejects blank ids.</remarks>
public string DriverInstanceId => _driverInstanceId;
/// <inheritdoc />
/// <remarks>Always <see cref="GalaxyDriverFactoryExtensions.DriverTypeName"/>.</remarks>
public string DriverType => GalaxyDriverFactoryExtensions.DriverTypeName;
/// <summary>Test-visible options snapshot.</summary>
/// <remarks>Returns the options instance supplied at construction.</remarks>
internal GalaxyDriverOptions Options => _options;
/// <inheritdoc />
/// <remarks>
/// Builds the production gateway runtime unless a subscriber, data-writer, or
/// hierarchy-source seam was injected through the internal constructor. The deploy
/// watcher is then started and health is set to Healthy.
/// </remarks>
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // Tests hand seams to the internal ctor; in that case the wired surface is
    // exercised as-is and no real gateway round-trip happens. Production callers
    // arrive with no seams, so the client + session + per-capability components
    // are built from GalaxyDriverOptions here.
    var seamsInjected = _subscriber is not null
        || _dataWriter is not null
        || _hierarchySource is not null;

    if (seamsInjected)
    {
        _logger.LogDebug(
            "GalaxyDriver {InstanceId} initializing with pre-injected seams — production runtime build skipped",
            _driverInstanceId);
    }
    else
    {
        await BuildProductionRuntimeAsync(cancellationToken).ConfigureAwait(false);
    }

    StartDeployWatcher();

    _logger.LogInformation(
        "GalaxyDriver {InstanceId} initialized — endpoint={Endpoint} clientName={ClientName}",
        _driverInstanceId, _options.Gateway.Endpoint, _options.MxAccess.ClientName);

    _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
}
/// <summary>
/// Constructs the production runtime from <c>_options</c>: gateway client, connected
/// session, traced subscriber and writer, reconnect supervisor, transport forwarder,
/// per-platform probe watcher, and (unless one was injected) the alarm acknowledger.
/// The supervisor is handed <see cref="ReopenAsync"/> and <see cref="ReplayAsync"/> so
/// a transport drop re-registers the session and replays every active subscription.
/// </summary>
private async Task BuildProductionRuntimeAsync(CancellationToken cancellationToken)
{
    var gatewayClientOptions = BuildClientOptions(_options.Gateway);

    // Fields are assigned before ConnectAsync so Dispose can still clean up if the
    // connect attempt throws.
    var gatewayClient = MxGatewayClient.Create(gatewayClientOptions);
    _ownedMxClient = gatewayClient;

    var session = new GalaxyMxSession(_options.MxAccess, _logger);
    _ownedMxSession = session;
    await session.ConnectAsync(gatewayClientOptions, cancellationToken).ConfigureAwait(false);

    var clientName = _options.MxAccess.ClientName;

    // PR 6.1 — the gw-facing seams are wrapped in tracing decorators so Subscribe /
    // Unsubscribe / Write / StreamEvents calls emit spans on the
    // "ZB.MOM.WW.OtOpcUa.Driver.Galaxy" ActivitySource. Whatever tracing listener the
    // host process runs (OTLP exporter, dotnet-trace, etc.) picks these up; the driver
    // itself takes no OpenTelemetry package dependency.
    var gatewaySubscriber = new GatewayGalaxySubscriber(session);
    _subscriber = new TracedGalaxySubscriber(gatewaySubscriber, clientName);

    var gatewayWriter = new GatewayGalaxyDataWriter(session, _options.MxAccess.WriteUserId, _logger);
    _dataWriter = new TracedGalaxyDataWriter(gatewayWriter, clientName);

    var reconnectOptions = new ReconnectOptions(
        InitialBackoffOverride: TimeSpan.FromMilliseconds(_options.Reconnect.InitialBackoffMs),
        MaxBackoffOverride: TimeSpan.FromMilliseconds(_options.Reconnect.MaxBackoffMs));
    _supervisor = new ReconnectSupervisor(
        reopen: ReopenAsync,
        replay: ReplayAsync,
        options: reconnectOptions,
        logger: _logger);

    _transportForwarder = new HostConnectivityForwarder(clientName, _hostStatuses, _logger);
    // The session has just connected, so the transport starts out Running.
    _transportForwarder.SetTransport(HostState.Running);
    _supervisor.StateChanged += OnSupervisorStateChanged;

    _probeWatcher = new PerPlatformProbeWatcher(
        _subscriber, _hostStatuses, _logger,
        bufferedUpdateIntervalMs: _options.MxAccess.PublishingIntervalMs);

    // PR B.2 — bind the alarm acknowledger to the live gateway client, keeping any
    // acknowledger that was injected up front.
    _alarmAcknowledger ??= new GatewayGalaxyAlarmAcknowledger(gatewayClient, session, _logger);
}
/// <summary>
/// Reopen callback handed to <see cref="ReconnectSupervisor"/>. Re-registers the gw
/// session by calling <c>ConnectAsync</c> on the owned session with freshly built
/// client options. Does nothing when no owned session exists.
/// </summary>
private async Task ReopenAsync(CancellationToken cancellationToken)
{
    if (_ownedMxSession is not { } session)
    {
        return;
    }

    var gatewayClientOptions = BuildClientOptions(_options.Gateway);
    await session.ConnectAsync(gatewayClientOptions, cancellationToken).ConfigureAwait(false);
}
/// <summary>
/// Replay callback handed to <see cref="ReconnectSupervisor"/>. Snapshots every
/// active binding, de-duplicates the tag references case-insensitively, and re-issues
/// a single SubscribeBulk at the configured publishing interval. PR 6.x can swap this
/// for the gw's batched <c>ReplaySubscriptionsCommand</c> once it ships.
/// </summary>
private async Task ReplayAsync(CancellationToken cancellationToken)
{
    if (_subscriber is not { } subscriber)
    {
        return;
    }

    var activeBindings = _subscriptions.SnapshotAllBindings();
    if (activeBindings.Count == 0)
    {
        return;
    }

    var tagRefs = activeBindings
        .Select(binding => binding.FullReference)
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToArray();

    await subscriber
        .SubscribeBulkAsync(tagRefs, _options.MxAccess.PublishingIntervalMs, cancellationToken)
        .ConfigureAwait(false);

    _logger.LogInformation(
        "GalaxyDriver {InstanceId} replay completed — {Count} tags re-subscribed",
        _driverInstanceId, tagRefs.Length);
}
/// <summary>
/// Mirrors a reconnect-supervisor transition into <c>_health</c> and the transport
/// forwarder: Healthy maps to Healthy / Running, every other state maps to
/// Degraded / Stopped (keeping the last successful read time and recording the cause).
/// </summary>
private void OnSupervisorStateChanged(object? sender, StateTransition transition)
{
    var isHealthy = transition.Next == ReconnectSupervisor.State.Healthy;

    _health = isHealthy
        ? new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null)
        : new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, transition.Cause);

    // The forwarder only exists on the production runtime path.
    _transportForwarder?.SetTransport(isHealthy ? HostState.Running : HostState.Stopped);
}
/// <summary>
/// Resolves <c>Gateway.ApiKeySecretRef</c> to the actual API key. Three forms are
/// supported, evaluated in order:
/// <list type="number">
/// <item><c>env:NAME</c> — reads <c>Environment.GetEnvironmentVariable(NAME)</c> and
/// trims surrounding whitespace. Throws when the variable is unset or blank, so
/// a misconfigured deployment fails fast at InitializeAsync rather than silently
/// sending an empty key.</item>
/// <item><c>file:PATH</c> — reads UTF-8 text from <c>PATH</c>, trimming
/// whitespace. Lets operators stash the key in an ACL'd file outside the
/// repo (the same pattern as the legacy <c>.local/galaxy-host-secret.txt</c>).</item>
/// <item>Anything else — used as the literal API key. Convenient for dev,
/// and avoids breaking existing configs that pre-date this resolver.</item>
/// </list>
/// A future PR can swap any of these arms for a DPAPI-backed lookup without
/// changing the call site.
/// </summary>
/// <param name="secretRef">The configured secret reference; must not be null, empty, or whitespace.</param>
/// <returns>The resolved, non-empty API key.</returns>
/// <exception cref="ArgumentException"><paramref name="secretRef"/> is null, empty, or whitespace.</exception>
/// <exception cref="InvalidOperationException">
/// The referenced environment variable is unset or blank, or the referenced file is
/// missing or empty.
/// </exception>
internal static string ResolveApiKey(string secretRef)
{
    // A whitespace-only reference would otherwise fall through to the literal arm and
    // be sent to the gateway as the key.
    ArgumentException.ThrowIfNullOrWhiteSpace(secretRef);

    const string EnvPrefix = "env:";
    const string FilePrefix = "file:";

    if (secretRef.StartsWith(EnvPrefix, StringComparison.OrdinalIgnoreCase))
    {
        var name = secretRef[EnvPrefix.Length..];

        // Trim like the file arm does: values exported from shell scripts often carry
        // a trailing newline that must not become part of the key.
        var value = string.IsNullOrWhiteSpace(name)
            ? null
            : Environment.GetEnvironmentVariable(name)?.Trim();

        return !string.IsNullOrEmpty(value)
            ? value
            : throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' resolves to env var '{name}', but it is unset.");
    }

    if (secretRef.StartsWith(FilePrefix, StringComparison.OrdinalIgnoreCase))
    {
        var path = secretRef[FilePrefix.Length..];
        var missingMessage =
            $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' points at '{path}', which doesn't exist.";

        if (!File.Exists(path))
        {
            throw new InvalidOperationException(missingMessage);
        }

        string contents;
        try
        {
            contents = File.ReadAllText(path).Trim();
        }
        catch (Exception ex) when (ex is FileNotFoundException or DirectoryNotFoundException)
        {
            // The file vanished between the existence check and the read; report it the
            // same way as a file that was never there.
            throw new InvalidOperationException(missingMessage, ex);
        }

        return !string.IsNullOrEmpty(contents)
            ? contents
            : throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' file '{path}' is empty.");
    }

    // No recognised prefix: the reference is the key itself.
    return secretRef;
}
/// <summary>
/// Maps the driver's gateway options onto <see cref="MxGatewayClientOptions"/>. The
/// API key is resolved through <see cref="ResolveApiKey"/>; a non-positive
/// <c>StreamTimeoutSeconds</c> is mapped to a <c>null</c> stream timeout.
/// </summary>
private static MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw)
{
    return new MxGatewayClientOptions
    {
        Endpoint = new Uri(gw.Endpoint, UriKind.Absolute),
        ApiKey = ResolveApiKey(gw.ApiKeySecretRef),
        UseTls = gw.UseTls,
        CaCertificatePath = gw.CaCertificatePath,
        ConnectTimeout = TimeSpan.FromSeconds(gw.ConnectTimeoutSeconds),
        DefaultCallTimeout = TimeSpan.FromSeconds(gw.DefaultCallTimeoutSeconds),
        StreamTimeout = gw.StreamTimeoutSeconds > 0
            ? TimeSpan.FromSeconds(gw.StreamTimeoutSeconds)
            : null,
    };
}
/// <summary>
/// Starts the deploy watcher when <c>Repository.WatchDeployEvents</c> is enabled and
/// a repository client or hierarchy source is already present. The watcher's
/// rediscovery signal is re-raised through <see cref="OnRediscoveryNeeded"/>.
/// Calling this again while a watcher exists is a no-op.
/// </summary>
private void StartDeployWatcher()
{
    if (!_options.Repository.WatchDeployEvents) return;

    // Idempotence guard: a second InitializeAsync must not orphan the running watcher
    // (it would never be disposed) or double-raise OnRediscoveryNeeded.
    if (_deployWatcher is not null) return;

    // NOTE(review): when neither a repository client nor a hierarchy source exists yet
    // (which appears to be the state right after the production runtime build), this
    // returns without starting a watcher — confirm that is intended.
    if (_ownedRepositoryClient is null && _hierarchySource is null) return;

    // Reuse the lazily-built repository client (DiscoverAsync constructs it on demand).
    // If discovery hasn't run yet, build the client here so the watcher has a target.
    _ownedRepositoryClient ??= MxGateway.Client.GalaxyRepositoryClient.Create(
        BuildClientOptions(_options.Gateway));

    var source = new GatewayGalaxyDeployWatchSource(_ownedRepositoryClient);
    var watcher = new DeployWatcher(source, _logger);
    watcher.OnRediscoveryNeeded += (_, args) => OnRediscoveryNeeded?.Invoke(this, args);
    _deployWatcher = watcher;

    // Fire-and-forget: the returned task is intentionally not awaited here.
    _ = watcher.StartAsync(CancellationToken.None);
}
/// <inheritdoc />
/// <remarks>
/// Skeleton implementation: the supplied config is not applied; only the cached
/// health is reset to Healthy with the current UTC time.
/// </remarks>
public Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // In-place config reapply is planned for PR 4.5, where the reconnect supervisor
    // swaps the gateway-client options under the lock. Until then only health is refreshed.
    var refreshed = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
    _health = refreshed;

    return Task.CompletedTask;
}
/// <inheritdoc />
/// <remarks>
/// Logs the shutdown and marks health as Unknown (keeping the last successful read
/// time). A no-op once the driver is disposed. No resources are released here.
/// </remarks>
public Task ShutdownAsync(CancellationToken cancellationToken)
{
    if (!_disposed)
    {
        _logger.LogInformation("GalaxyDriver {InstanceId} shutting down", _driverInstanceId);
        _health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
    }

    return Task.CompletedTask;
}
/// <inheritdoc />
/// <remarks>
/// While the reconnect supervisor reports itself degraded, a Degraded health carrying
/// the supervisor's last error is returned; otherwise the cached health is returned.
/// </remarks>
public DriverHealth GetHealth()
{
    // The cached _health only reflects the last successful operation; an in-flight
    // recovery must take precedence and surface as Degraded.
    var supervisor = _supervisor;

    return supervisor is { IsDegraded: true }
        ? new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, supervisor.LastError)
        : _health;
}
// ===== IHostConnectivityProbe (PR 4.7 wire-up) =====
/// <inheritdoc />
/// <remarks>Returns the host-status aggregator's current snapshot.</remarks>
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses() => _hostStatuses.Snapshot();
/// <inheritdoc />
/// <remarks>Placeholder: currently always reports 0.</remarks>
public long GetMemoryFootprint() => 0; // PR 4.4 sets this from SubscriptionRegistry size.
/// <inheritdoc />
/// <remarks>No-op: returns an already-completed task without flushing anything.</remarks>
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken) => Task.CompletedTask;
// ===== ITagDiscovery (PR 4.1) =====
/// <inheritdoc />
/// <remarks>
/// Runs discovery through a wrapper that records each variable's security
/// classification, then (when a probe watcher exists) re-syncs the watcher with the
/// tag names of every object whose template chain contains <c>$WinPlatform</c> or
/// <c>$AppEngine</c>.
/// </remarks>
public async Task DiscoverAsync(IAddressSpaceBuilder builder, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(builder);

    // PR 4.3 — the capturing wrapper records SecurityClassification per attribute as
    // the discoverer emits variables.
    var capturingBuilder = new SecurityCapturingBuilder(builder, _securityByFullRef);
    var source = _hierarchySource ??= BuildDefaultHierarchySource();

    await new GalaxyDiscoverer(source)
        .DiscoverAsync(capturingBuilder, cancellationToken)
        .ConfigureAwait(false);

    if (_probeWatcher is not { } probeWatcher)
    {
        return;
    }

    // PR 4.W — refresh the per-platform probe watcher's membership after discovery so
    // newly-added $WinPlatform / $AppEngine objects start advising their ScanState.
    var hierarchy = await source.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);
    var platformTagNames = hierarchy
        .Where(galaxyObject => galaxyObject.TemplateChain.Any(template => IsPlatformTemplate(template)))
        .Select(galaxyObject => galaxyObject.TagName)
        .Where(tagName => !string.IsNullOrEmpty(tagName));

    await probeWatcher.SyncPlatformsAsync(platformTagNames, cancellationToken).ConfigureAwait(false);

    static bool IsPlatformTemplate(string? template) =>
        string.Equals(template, "$WinPlatform", StringComparison.OrdinalIgnoreCase)
        || string.Equals(template, "$AppEngine", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Looks up the security classification captured for <paramref name="fullReference"/>
/// during discovery, falling back to <c>FreeAccess</c> when none was recorded.
/// </summary>
private SecurityClassification ResolveSecurity(string fullReference)
{
    if (_securityByFullRef.TryGetValue(fullReference, out var captured))
    {
        return captured;
    }

    return SecurityClassification.FreeAccess;
}
// ===== IReadable =====
/// <inheritdoc />
/// <remarks>
/// An empty request returns an empty result. An injected data reader takes
/// precedence; otherwise the read is synthesised through a one-shot subscription,
/// which requires a subscriber.
/// </remarks>
/// <exception cref="NotSupportedException">Neither a data reader nor a subscriber is available.</exception>
public Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
    IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(fullReferences);

    if (fullReferences.Count == 0)
    {
        return Task.FromResult<IReadOnlyList<DataValueSnapshot>>([]);
    }

    // Test-only path — a canned reader injected via the internal ctor wins.
    if (_dataReader is { } cannedReader)
    {
        return cannedReader.ReadAsync(fullReferences, cancellationToken);
    }

    if (_subscriber is not null)
    {
        return ReadViaSubscribeOnceAsync(fullReferences, cancellationToken);
    }

    throw new NotSupportedException(
        "GalaxyDriver.ReadAsync requires a connected GalaxyMxSession (production runtime not built). " +
        "Either inject a test seam via the internal ctor or call InitializeAsync against a real gateway.");
}
/// <summary>
/// Production read path. MxAccess has no one-shot Read RPC — every value comes
/// through the event stream. We synthesise a Read by:
/// <list type="number">
/// <item>Subscribing the requested tags through the existing
/// <see cref="SubscriptionRegistry"/> + <see cref="EventPump"/>.</item>
/// <item>Waiting for the first <c>OnDataChange</c> per requested reference (the gateway
/// pushes the current value as the initial event after a SubscribeBulk).</item>
/// <item>Unsubscribing.</item>
/// </list>
/// Tags the gw rejects at SubscribeBulk time complete with a <c>Bad</c> snapshot; tags
/// that have not published when the caller's cancellation token fires complete with
/// a <c>BadTimeout</c> snapshot. The caller always sees one snapshot per requested
/// reference, in input order.
/// </summary>
/// <param name="fullReferences">Tag references to read; duplicates share one pending value.</param>
/// <param name="cancellationToken">Bounds the wait; firing it yields partial results rather than an exception from the wait.</param>
/// <returns>One snapshot per entry of <paramref name="fullReferences"/>, in input order.</returns>
private async Task<IReadOnlyList<DataValueSnapshot>> ReadViaSubscribeOnceAsync(
    IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
{
    // OPC UA status codes used for synthesised snapshots.
    const uint StatusBad = 0x80000000u;
    const uint StatusBadTimeout = 0x800A0000u; // 0x800B0000 would be BadServiceUnsupported

    var pump = EnsureEventPumpStarted();
    var subscriptionId = _subscriptions.NextSubscriptionId();

    // Pre-allocate one TaskCompletionSource per full-reference so the OnDataChange
    // handler can complete them out-of-order as events arrive. Wired BEFORE the
    // SubscribeBulk call so we don't race with the first event the gw pushes.
    var pendingByRef = new Dictionary<string, TaskCompletionSource<DataValueSnapshot>>(
        StringComparer.OrdinalIgnoreCase);
    foreach (var fullRef in fullReferences.Distinct(StringComparer.OrdinalIgnoreCase))
    {
        pendingByRef[fullRef] = new TaskCompletionSource<DataValueSnapshot>(
            TaskCreationOptions.RunContinuationsAsynchronously);
    }

    EventHandler<DataChangeEventArgs> handler = (_, args) =>
    {
        // Filter to OUR subscription — the pump's OnDataChange fans out across all
        // subscriptions on the driver, and we don't want a parallel ISubscribable
        // caller's events to leak into our read.
        if (args.SubscriptionHandle is GalaxySubscriptionHandle gsh
            && gsh.SubscriptionId == subscriptionId
            && pendingByRef.TryGetValue(args.FullReference, out var tcs))
        {
            tcs.TrySetResult(args.Snapshot);
        }
    };

    // Bindings are declared outside the try so the finally block can always release
    // whatever was bound so far, even if binding or registration throws part-way.
    var bindings = new List<TagBinding>(fullReferences.Count);
    pump.OnDataChange += handler;
    try
    {
        IReadOnlyList<SubscribeResult> results = await _subscriber!
            .SubscribeBulkAsync(fullReferences, _options.MxAccess.PublishingIntervalMs, cancellationToken)
            .ConfigureAwait(false);

        // Index results by tag address once (first result wins) instead of scanning
        // the whole result list for every requested reference.
        var resultByTag = new Dictionary<string, SubscribeResult>(StringComparer.OrdinalIgnoreCase);
        foreach (var result in results)
        {
            if (result.TagAddress is not null)
            {
                resultByTag.TryAdd(result.TagAddress, result);
            }
        }

        // Register bindings so the pump knows to dispatch events for these handles.
        foreach (var fullRef in fullReferences)
        {
            resultByTag.TryGetValue(fullRef, out var match);
            var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
            bindings.Add(new TagBinding(fullRef, itemHandle));

            // Tags the gw rejected up front — complete with Bad status now so the
            // wait below doesn't hang on them.
            if (itemHandle <= 0 && pendingByRef.TryGetValue(fullRef, out var rejectedTcs))
            {
                rejectedTcs.TrySetResult(BadSnapshot(StatusBad));
            }
        }

        _subscriptions.Register(subscriptionId, bindings);

        // Wait for every pending TCS to complete or the caller's CT to fire. When the
        // CT fires before all values arrive, fill the still-pending entries with a
        // BadTimeout snapshot rather than throwing — Read semantics let callers see
        // partial results.
        using var registration = cancellationToken.Register(() =>
        {
            foreach (var tcs in pendingByRef.Values)
            {
                tcs.TrySetResult(BadSnapshot(StatusBadTimeout));
            }
        });

        var snapshots = new DataValueSnapshot[fullReferences.Count];
        for (var i = 0; i < fullReferences.Count; i++)
        {
            snapshots[i] = await pendingByRef[fullReferences[i]].Task.ConfigureAwait(false);
        }

        return snapshots;
    }
    finally
    {
        pump.OnDataChange -= handler;

        // Drop the bindings + unsubscribe the live handles. Removing an id that was
        // never registered is harmless. UnsubscribeBulkAsync's failure isn't fatal —
        // the registry is already cleared, so any straggling event from the gw would
        // be a no-op fan-out.
        _subscriptions.Remove(subscriptionId);
        var liveHandles = bindings.Where(b => b.ItemHandle > 0).Select(b => b.ItemHandle).ToArray();
        if (liveHandles.Length > 0)
        {
            try
            {
                await _subscriber!.UnsubscribeBulkAsync(liveHandles, CancellationToken.None)
                    .ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "GalaxyDriver.ReadViaSubscribeOnceAsync UnsubscribeBulk failed for {Count} handle(s) — registry already cleared.",
                    liveHandles.Length);
            }
        }
    }

    // Builds a value-less snapshot carrying only a status code and the current server time.
    static DataValueSnapshot BadSnapshot(uint statusCode) => new(
        Value: null,
        StatusCode: statusCode,
        SourceTimestampUtc: null,
        ServerTimestampUtc: DateTime.UtcNow);
}
// ===== IWritable (PR 4.3) =====
/// <inheritdoc />
/// <remarks>
/// An empty request returns an empty result. Non-empty requests are handed to the
/// data writer together with <see cref="ResolveSecurity"/> for per-tag classification.
/// </remarks>
/// <exception cref="NotSupportedException">No data writer is available.</exception>
public Task<IReadOnlyList<WriteResult>> WriteAsync(
    IReadOnlyList<WriteRequest> writes, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(writes);

    if (writes.Count == 0)
    {
        return Task.FromResult<IReadOnlyList<WriteResult>>([]);
    }

    if (_dataWriter is { } writer)
    {
        return writer.WriteAsync(writes, ResolveSecurity, cancellationToken);
    }

    // Same fallback shape as IReadable: without a writer bound to a GalaxyMxSession
    // there is no production write path, so the caller is told how to proceed.
    throw new NotSupportedException(
        "GalaxyDriver.WriteAsync requires GatewayGalaxyDataWriter wired against a connected " +
        "GalaxyMxSession (PR 4.4). Until that lands, route writes through the legacy-host " +
        "backend (Galaxy:Backend=legacy-host).");
}
// ===== ISubscribable (PR 4.4) =====
/// <inheritdoc />
/// <remarks>
/// Starts the shared event pump if needed, subscribes the tags in one SubscribeBulk
/// call, and registers one binding per requested reference in input order. Tags the
/// gateway rejects are logged and recorded with a non-positive item handle. An empty
/// request registers an empty subscription so Unsubscribe stays symmetric.
/// </remarks>
/// <exception cref="NotSupportedException">No subscriber is available.</exception>
public async Task<ISubscriptionHandle> SubscribeAsync(
    IReadOnlyList<string> fullReferences, TimeSpan publishingInterval, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(fullReferences);
    if (_subscriber is null)
    {
        throw new NotSupportedException(
            "GalaxyDriver.SubscribeAsync requires a connected GalaxyMxSession + GatewayGalaxySubscriber. " +
            "PR 4.W wires the production session; until then route subscriptions through the legacy-host backend.");
    }

    // The pump is owned by the driver (released in Dispose), so no reference needs
    // to be held here beyond making sure it has been started.
    EnsureEventPumpStarted();
    var subscriptionId = _subscriptions.NextSubscriptionId();

    if (fullReferences.Count == 0)
    {
        // Empty subscriptions register but never bind anything — keeps Unsubscribe
        // symmetric for callers that conditionally add tags later.
        _subscriptions.Register(subscriptionId, []);
        return new GalaxySubscriptionHandle(subscriptionId);
    }

    // PR 6.3 — when the caller doesn't set a publishing interval (zero, negative, or
    // under one millisecond), fall back to the configured MxAccess.PublishingIntervalMs.
    // The server's UA subscription publishingInterval drives this in production; tests
    // and infrastructure callers (probe watcher, deploy watcher) hit the fallback.
    // The value is clamped to int.MaxValue before the cast: converting an out-of-range
    // double to int is unspecified in an unchecked context.
    var requestedMs = publishingInterval.TotalMilliseconds;
    var bufferedIntervalMs = requestedMs >= 1
        ? (int)Math.Min(requestedMs, int.MaxValue)
        : _options.MxAccess.PublishingIntervalMs;

    var results = await _subscriber
        .SubscribeBulkAsync(fullReferences, bufferedIntervalMs, cancellationToken)
        .ConfigureAwait(false);

    // Index results by tag address once (first result wins) instead of scanning the
    // whole result list for every requested reference.
    var resultByTag = new Dictionary<string, SubscribeResult>(StringComparer.OrdinalIgnoreCase);
    foreach (var result in results)
    {
        if (result.TagAddress is not null)
        {
            resultByTag.TryAdd(result.TagAddress, result);
        }
    }

    // Build the binding list in input order. Failed entries (gw rejected the tag) are
    // recorded with a non-positive ItemHandle so the caller can detect partial failure
    // by inspecting the returned handle's diagnostic context — full per-tag error
    // surface lands in PR 5.3's parity tests.
    var bindings = new List<TagBinding>(fullReferences.Count);
    foreach (var fullRef in fullReferences)
    {
        resultByTag.TryGetValue(fullRef, out var match);
        var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
        bindings.Add(new TagBinding(fullRef, itemHandle));

        if (match is null || !match.WasSuccessful)
        {
            _logger.LogWarning(
                "Galaxy subscribe for {FullRef} failed: {Error}",
                fullRef, match?.ErrorMessage ?? "<no result returned>");
        }
    }

    _subscriptions.Register(subscriptionId, bindings);
    return new GalaxySubscriptionHandle(subscriptionId);
}
/// <inheritdoc />
/// <remarks>
/// Removes the subscription from the registry first, then asks the gateway to drop
/// its live item handles. A gateway failure is logged and swallowed because the
/// driver-side registry is already cleared.
/// </remarks>
/// <exception cref="ArgumentException"><paramref name="handle"/> was not issued by this driver.</exception>
public async Task UnsubscribeAsync(ISubscriptionHandle handle, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(handle);

    if (handle is not GalaxySubscriptionHandle galaxyHandle)
    {
        throw new ArgumentException(
            $"Subscription handle was not issued by this driver (expected GalaxySubscriptionHandle, got {handle.GetType().Name}).",
            nameof(handle));
    }

    var removedBindings = _subscriptions.Remove(galaxyHandle.SubscriptionId);
    if (removedBindings is null)
    {
        // Already removed or never registered.
        return;
    }

    var liveItemHandles = removedBindings
        .Where(binding => binding.ItemHandle > 0)
        .Select(binding => binding.ItemHandle)
        .ToArray();
    if (liveItemHandles.Length == 0 || _subscriber is null)
    {
        return;
    }

    try
    {
        await _subscriber.UnsubscribeBulkAsync(liveItemHandles, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex,
            "Galaxy UnsubscribeBulk failed for subscription {SubscriptionId} — registry already cleared on driver side.",
            galaxyHandle.SubscriptionId);
    }
}
/// <summary>
/// Returns the driver's single <see cref="EventPump"/>, creating and starting it on
/// first use under <c>_pumpLock</c>. Every subscription shares this pump — fan-out
/// goes through the <see cref="SubscriptionRegistry"/> reverse map rather than one
/// pump per subscription.
/// </summary>
private EventPump EnsureEventPumpStarted()
{
    lock (_pumpLock)
    {
        if (_eventPump is null)
        {
            var pump = new EventPump(
                _subscriber!, _subscriptions, _logger,
                channelCapacity: _options.MxAccess.EventPumpChannelCapacity,
                clientName: _options.MxAccess.ClientName);

            // The field is published before the handlers are wired and the pump is started.
            _eventPump = pump;
            pump.OnDataChange += OnPumpDataChange;
            pump.OnAlarmTransition += OnPumpAlarmTransition;
            pump.Start();
        }

        return _eventPump;
    }
}
// ===== IAlarmSource (PR B.2) =====
/// <inheritdoc />
/// <remarks>
/// Starts the event pump if needed and records a freshly generated sentinel handle.
/// The requested source node ids are validated for null but not otherwise used.
/// </remarks>
public Task<IAlarmSubscriptionHandle> SubscribeAlarmsAsync(
    IReadOnlyList<string> sourceNodeIds, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(sourceNodeIds);

    // Alarm subscriptions aren't multiplexed per source-node-id today: alarm events
    // share the gateway StreamEvents channel with data-change events once the gateway
    // emits the new family (PRs A.2 + A.3). The handle is only a sentinel that lets
    // the server Unsubscribe symmetrically; each active handle sees every transition
    // and the server filters by source node before raising Part 9 conditions. AbCip
    // uses the same shape.
    EnsureEventPumpStarted();

    var sentinel = new GalaxyAlarmSubscriptionHandle(Guid.NewGuid().ToString("N"));
    lock (_alarmHandlersLock)
    {
        _alarmSubscriptions.Add(sentinel);
    }

    return Task.FromResult<IAlarmSubscriptionHandle>(sentinel);
}
/// <inheritdoc />
/// <remarks>Removes the sentinel handle from the active alarm-subscription set.</remarks>
/// <exception cref="ArgumentException"><paramref name="handle"/> was not issued by this driver.</exception>
public Task UnsubscribeAlarmsAsync(IAlarmSubscriptionHandle handle, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(handle);

    if (handle is GalaxyAlarmSubscriptionHandle alarmHandle)
    {
        lock (_alarmHandlersLock)
        {
            _alarmSubscriptions.Remove(alarmHandle);
        }

        return Task.CompletedTask;
    }

    throw new ArgumentException(
        $"Subscription handle was not issued by this driver (expected GalaxyAlarmSubscriptionHandle, got {handle.GetType().Name}).",
        nameof(handle));
}
/// <inheritdoc />
/// <remarks>
/// Acknowledgements are sent to the alarm acknowledger one at a time, in input
/// order. Each request is addressed by its <c>ConditionId</c>, falling back to
/// <c>SourceNodeId</c> when the condition id is null or empty.
/// </remarks>
/// <exception cref="NotSupportedException">No alarm acknowledger is available.</exception>
public async Task AcknowledgeAsync(
    IReadOnlyList<AlarmAcknowledgeRequest> acknowledgements, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(acknowledgements);

    if (acknowledgements.Count == 0)
    {
        return;
    }

    if (_alarmAcknowledger is not { } acknowledger)
    {
        throw new NotSupportedException(
            "GalaxyDriver.AcknowledgeAsync requires GatewayGalaxyAlarmAcknowledger wired against a connected " +
            "GalaxyMxSession (PR B.2). InitializeAsync must run before alarm acknowledgements can flow.");
    }

    // The gateway RPC takes a single alarm reference per call, so acks go out in
    // series. The server-side ACL layer's per-condition Acknowledge already acts as
    // the natural rate-limit; serial issue keeps operator-comment ordering
    // deterministic and avoids bursting the worker's STA queue.
    foreach (var request in acknowledgements)
    {
        // For the Galaxy driver ConditionId holds the alarm full reference. SourceNodeId
        // is the OPC UA browse path, which the gateway can't address; the two are paired
        // server-side through AlarmConditionService.
        var alarmFullReference = string.IsNullOrEmpty(request.ConditionId)
            ? request.SourceNodeId
            : request.ConditionId;

        await acknowledger.AcknowledgeAsync(
            alarmFullReference,
            request.Comment ?? string.Empty,
            operatorUser: string.Empty, // server-side ACL fills this from the OPC UA session
            cancellationToken).ConfigureAwait(false);
    }
}
/// <summary>
/// Handles a <see cref="GalaxyAlarmTransition"/> from the EventPump by reshaping it
/// into <see cref="AlarmEventArgs"/> and raising <see cref="OnAlarmEvent"/>. Nothing
/// is raised unless at least one alarm subscription is active, so a server that has
/// not yet called <see cref="SubscribeAlarmsAsync"/> never sees untracked transitions.
/// Exceptions thrown by event handlers are logged and swallowed.
/// </summary>
private void OnPumpAlarmTransition(object? sender, GalaxyAlarmTransition transition)
{
    GalaxyAlarmSubscriptionHandle? owner;
    lock (_alarmHandlersLock)
    {
        // Any active handle serves as the nominal owner of the event. The server-side
        // state machine doesn't multiplex by handle today, so with several alarm
        // subscriptions active the event still fires exactly once and the
        // AlarmConditionService dispatches per source node downstream.
        owner = _alarmSubscriptions.Count == 0
            ? null
            : _alarmSubscriptions.First();
    }

    if (owner is null)
    {
        return;
    }

    var alarmArgs = new AlarmEventArgs(
        SubscriptionHandle: owner,
        SourceNodeId: transition.SourceObjectReference,
        ConditionId: transition.AlarmFullReference,
        AlarmType: transition.AlarmTypeName,
        Message: transition.Description,
        Severity: transition.SeverityBucket,
        SourceTimestampUtc: transition.TransitionTimestampUtc,
        OperatorComment: NullIfEmpty(transition.OperatorComment),
        OriginalRaiseTimestampUtc: transition.OriginalRaiseTimestampUtc,
        AlarmCategory: NullIfEmpty(transition.Category));

    try
    {
        OnAlarmEvent?.Invoke(this, alarmArgs);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex,
            "GalaxyDriver OnAlarmEvent handler threw for {AlarmRef} — continuing.",
            transition.AlarmFullReference);
    }

    // Optional text fields are surfaced as null rather than as empty strings.
    static string? NullIfEmpty(string? text) => string.IsNullOrEmpty(text) ? null : text;
}
/// <summary>
/// Re-raises every fan-out event on the public <see cref="OnDataChange"/> for
/// ISubscribable consumers, then — for references ending in the probe suffix —
/// passes the value to the per-platform probe watcher (PR 4.7), so platform health
/// entries update without the watcher consuming the event stream itself.
/// </summary>
private void OnPumpDataChange(object? sender, DataChangeEventArgs args)
{
    OnDataChange?.Invoke(this, args);

    if (_probeWatcher is not { } probeWatcher)
    {
        return;
    }

    if (!args.FullReference.EndsWith(PerPlatformProbeWatcher.ProbeSuffix, StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    // The probe decoder expects a raw quality byte. It is rebuilt from the top two
    // bits of the StatusCode: 0 (Good) → 192, 1 (Uncertain) → 64, anything else (Bad) → 0.
    var severityBits = (byte)((args.Snapshot.StatusCode >> 30) & 0x3);
    byte qualityByte;
    if (severityBits == 0)
    {
        qualityByte = 192;
    }
    else if (severityBits == 1)
    {
        qualityByte = 64;
    }
    else
    {
        qualityByte = 0;
    }

    probeWatcher.OnProbeValueChanged(args.FullReference, args.Snapshot.Value, qualityByte);
}
/// <summary>
/// Lazily builds the default <see cref="IGalaxyHierarchySource"/> from
/// <c>_options.Gateway</c>, wrapped in a tracing decorator. An already-created
/// repository client (for example one built for the deploy watcher) is reused
/// rather than replaced; the owned <see cref="GalaxyRepositoryClient"/> is disposed
/// in <see cref="Dispose"/>. Tests bypass this by injecting their own source via the
/// internal ctor.
/// </summary>
/// <returns>A traced hierarchy source backed by the owned repository client.</returns>
private IGalaxyHierarchySource BuildDefaultHierarchySource()
{
    // Client options come from the shared BuildClientOptions helper so every gateway
    // client is configured identically. `??=` keeps an existing client instead of
    // overwriting (and thereby leaking) it.
    _ownedRepositoryClient ??= GalaxyRepositoryClient.Create(BuildClientOptions(_options.Gateway));

    return new TracedGalaxyHierarchySource(
        new GatewayGalaxyHierarchySource(_ownedRepositoryClient), _options.MxAccess.ClientName);
}
/// <summary>
/// Releases every component the driver owns. Idempotent: only the first call does
/// any work. Each step is best-effort — a failure is logged and the remaining steps
/// still run, so this method does not throw because of a faulted component.
/// </summary>
public void Dispose()
{
    if (_disposed) return;
    _disposed = true;

    // Order: stop deploy watcher, supervisor, probe watcher, pump, then sessions and
    // clients. Each step is best-effort — disposal during a faulted state shouldn't
    // throw and prevent the rest of the cleanup.
    try { _deployWatcher?.Dispose(); } catch (Exception ex) { _logger.LogWarning(ex, "DeployWatcher dispose failed"); }
    try { _supervisor?.Dispose(); } catch (Exception ex) { _logger.LogWarning(ex, "ReconnectSupervisor dispose failed"); }
    try { _probeWatcher?.Dispose(); } catch (Exception ex) { _logger.LogWarning(ex, "ProbeWatcher dispose failed"); }
    try { _transportForwarder?.Dispose(); } catch (Exception ex) { _logger.LogWarning(ex, "Transport forwarder dispose failed"); }

    EventPump? pump;
    lock (_pumpLock) { pump = _eventPump; _eventPump = null; }

    // The async disposals below are blocked on because this is the synchronous
    // Dispose path. Each one is guarded so a failure cannot skip the later ones.
    try { pump?.DisposeAsync().AsTask().GetAwaiter().GetResult(); }
    catch (Exception ex) { _logger.LogWarning(ex, "EventPump dispose failed"); }

    try { _ownedMxSession?.DisposeAsync().AsTask().GetAwaiter().GetResult(); }
    catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession dispose failed"); }
    _ownedMxSession = null;

    try { _ownedMxClient?.DisposeAsync().AsTask().GetAwaiter().GetResult(); }
    catch (Exception ex) { _logger.LogWarning(ex, "MxGatewayClient dispose failed"); }
    _ownedMxClient = null;

    try { _ownedRepositoryClient?.DisposeAsync().AsTask().GetAwaiter().GetResult(); }
    catch (Exception ex) { _logger.LogWarning(ex, "GalaxyRepositoryClient dispose failed"); }
    _ownedRepositoryClient = null;

    _hierarchySource = null;
}
/// <summary>
/// Decorator over an <see cref="IAddressSpaceBuilder"/> that writes each variable's
/// <see cref="DriverAttributeInfo.SecurityClass"/> into the supplied dictionary, keyed
/// by the attribute's full name, before passing the call to the inner builder.
/// <see cref="DiscoverAsync"/> uses it to capture per-tag classifications for the
/// IWritable routing decision — PR 4.3 needs the data, but the discoverer itself
/// doesn't (and shouldn't) know about the driver's internal state.
/// </summary>
private sealed class SecurityCapturingBuilder(
    IAddressSpaceBuilder inner,
    System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification> map)
    : IAddressSpaceBuilder
{
    /// <summary>Creates the folder on the inner builder and wraps the result so nested variables are captured too.</summary>
    public IAddressSpaceBuilder Folder(string browseName, string displayName)
    {
        var innerFolder = inner.Folder(browseName, displayName);
        return new SecurityCapturingBuilder(innerFolder, map);
    }

    /// <summary>Records the attribute's security class (last write wins), then delegates to the inner builder.</summary>
    public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
    {
        map[attributeInfo.FullName] = attributeInfo.SecurityClass;
        return inner.Variable(browseName, displayName, attributeInfo);
    }

    /// <summary>Pure pass-through to the inner builder.</summary>
    public void AddProperty(string browseName, DriverDataType dataType, object? value)
    {
        inner.AddProperty(browseName, dataType, value);
    }
}
}

View File

@@ -0,0 +1,123 @@
using System.Text.Json;
using System.Text.Json.Serialization;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
/// Factory registration helper for <see cref="GalaxyDriver"/>, following the same shape as
/// <c>GalaxyProxyDriverFactoryExtensions</c> / <c>ModbusDriverFactoryExtensions</c>.
/// The server's <c>Program.cs</c> calls <see cref="Register"/> once at startup; the driver
/// bootstrap pipeline then turns DriverInstance rows whose <c>DriverType</c> equals
/// <see cref="DriverTypeName"/> into live <see cref="GalaxyDriver"/> instances.
/// </summary>
/// <remarks>
/// The type name <c>"GalaxyMxGateway"</c> deliberately differs from the legacy proxy's
/// <c>"Galaxy"</c> so both factories can be registered side by side during parity testing
/// (Phase 5). PR 4.W will add a server-side <c>Galaxy:Backend</c> switch that materialises
/// a Galaxy DriverInstance under one or the other type name.
/// </remarks>
public static class GalaxyDriverFactoryExtensions
{
    /// <summary>Driver-type key this factory is registered under.</summary>
    public const string DriverTypeName = "GalaxyMxGateway";

    // Lenient reader settings: case-insensitive property names, comments skipped,
    // trailing commas tolerated.
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
        ReadCommentHandling = JsonCommentHandling.Skip,
        AllowTrailingCommas = true,
    };

    /// <summary>
    /// Registers the Galaxy factory delegate with <paramref name="registry"/> under
    /// <see cref="DriverTypeName"/>.
    /// </summary>
    /// <param name="registry">Registry to add the factory to; must not be null.</param>
    /// <param name="loggerFactory">Optional logger factory handed to every created driver.</param>
    /// <exception cref="ArgumentNullException"><paramref name="registry"/> is null.</exception>
    public static void Register(DriverFactoryRegistry registry, ILoggerFactory? loggerFactory = null)
    {
        ArgumentNullException.ThrowIfNull(registry);

        registry.Register(
            DriverTypeName,
            (instanceId, configJson) => CreateInstance(instanceId, configJson, loggerFactory));
    }

    /// <summary>Convenience for tests + standalone callers: creates a driver without logging.</summary>
    public static GalaxyDriver CreateInstance(string driverInstanceId, string driverConfigJson)
    {
        return CreateInstance(driverInstanceId, driverConfigJson, loggerFactory: null);
    }

    /// <summary>
    /// Parses <paramref name="driverConfigJson"/> and builds a <see cref="GalaxyDriver"/>,
    /// filling in defaults for every optional setting.
    /// </summary>
    /// <param name="driverInstanceId">Driver instance id; used in error messages and passed to the driver.</param>
    /// <param name="driverConfigJson">JSON config with Gateway / MxAccess / Repository / Reconnect sections.</param>
    /// <param name="loggerFactory">Optional logger factory; when null the driver receives a null logger.</param>
    /// <exception cref="ArgumentException">Either string argument is null, empty or whitespace.</exception>
    /// <exception cref="InvalidOperationException">
    /// The JSON deserialises to null, or <c>Gateway.Endpoint</c>, <c>Gateway.ApiKeySecretRef</c>
    /// or <c>MxAccess.ClientName</c> is absent (checked in that order).
    /// </exception>
    public static GalaxyDriver CreateInstance(
        string driverInstanceId, string driverConfigJson, ILoggerFactory? loggerFactory)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(driverInstanceId);
        ArgumentException.ThrowIfNullOrWhiteSpace(driverConfigJson);

        var dto = JsonSerializer.Deserialize<GalaxyDriverConfigDto>(driverConfigJson, JsonOptions);
        if (dto is null)
        {
            throw new InvalidOperationException(
                $"Galaxy driver config for '{driverInstanceId}' deserialised to null");
        }

        // Sections are built in declaration order so the first missing required value
        // reported is the same as when everything was constructed inline.
        var options = new GalaxyDriverOptions(
            Gateway: BuildGatewayOptions(driverInstanceId, dto.Gateway),
            MxAccess: BuildMxAccessOptions(driverInstanceId, dto.MxAccess),
            Repository: BuildRepositoryOptions(dto.Repository),
            Reconnect: BuildReconnectOptions(dto.Reconnect));

        var driverLogger = loggerFactory?.CreateLogger<GalaxyDriver>();
        return new GalaxyDriver(driverInstanceId, options, driverLogger);
    }

    // Gateway section: Endpoint and ApiKeySecretRef are required, the rest default.
    private static GalaxyGatewayOptions BuildGatewayOptions(string driverInstanceId, GatewayDto? gateway)
    {
        var endpoint = gateway?.Endpoint
            ?? throw new InvalidOperationException(
                $"Galaxy driver '{driverInstanceId}' missing required Gateway.Endpoint");
        var apiKeySecretRef = gateway.ApiKeySecretRef
            ?? throw new InvalidOperationException(
                $"Galaxy driver '{driverInstanceId}' missing required Gateway.ApiKeySecretRef");

        return new GalaxyGatewayOptions(
            Endpoint: endpoint,
            ApiKeySecretRef: apiKeySecretRef,
            UseTls: gateway.UseTls ?? true,
            CaCertificatePath: gateway.CaCertificatePath,
            ConnectTimeoutSeconds: gateway.ConnectTimeoutSeconds ?? 10,
            DefaultCallTimeoutSeconds: gateway.DefaultCallTimeoutSeconds ?? 30,
            StreamTimeoutSeconds: gateway.StreamTimeoutSeconds ?? 0);
    }

    // MxAccess section: ClientName is required, the rest default.
    private static GalaxyMxAccessOptions BuildMxAccessOptions(string driverInstanceId, MxAccessDto? mxAccess)
    {
        var clientName = mxAccess?.ClientName
            ?? throw new InvalidOperationException(
                $"Galaxy driver '{driverInstanceId}' missing required MxAccess.ClientName");

        return new GalaxyMxAccessOptions(
            ClientName: clientName,
            PublishingIntervalMs: mxAccess.PublishingIntervalMs ?? 1000,
            WriteUserId: mxAccess.WriteUserId ?? 0,
            EventPumpChannelCapacity: mxAccess.EventPumpChannelCapacity ?? 50_000);
    }

    // Repository section: entirely optional.
    private static GalaxyRepositoryOptions BuildRepositoryOptions(RepositoryDto? repository)
    {
        return new GalaxyRepositoryOptions(
            DiscoverPageSize: repository?.DiscoverPageSize ?? 5000,
            WatchDeployEvents: repository?.WatchDeployEvents ?? true);
    }

    // Reconnect section: entirely optional.
    private static GalaxyReconnectOptions BuildReconnectOptions(ReconnectDto? reconnect)
    {
        return new GalaxyReconnectOptions(
            InitialBackoffMs: reconnect?.InitialBackoffMs ?? 500,
            MaxBackoffMs: reconnect?.MaxBackoffMs ?? 30_000,
            ReplayOnSessionLost: reconnect?.ReplayOnSessionLost ?? true);
    }

    /// <summary>Root of the JSON config document; every section may be omitted.</summary>
    internal sealed class GalaxyDriverConfigDto
    {
        public GatewayDto? Gateway { get; init; }
        public MxAccessDto? MxAccess { get; init; }
        public RepositoryDto? Repository { get; init; }
        public ReconnectDto? Reconnect { get; init; }
    }

    /// <summary>JSON shape of the <c>Gateway</c> section; null members mean "not supplied".</summary>
    internal sealed class GatewayDto
    {
        public string? Endpoint { get; init; }
        public string? ApiKeySecretRef { get; init; }
        public bool? UseTls { get; init; }
        public string? CaCertificatePath { get; init; }
        public int? ConnectTimeoutSeconds { get; init; }
        public int? DefaultCallTimeoutSeconds { get; init; }
        public int? StreamTimeoutSeconds { get; init; }
    }

    /// <summary>JSON shape of the <c>MxAccess</c> section; null members mean "not supplied".</summary>
    internal sealed class MxAccessDto
    {
        public string? ClientName { get; init; }
        public int? PublishingIntervalMs { get; init; }
        public int? WriteUserId { get; init; }
        public int? EventPumpChannelCapacity { get; init; }
    }

    /// <summary>JSON shape of the <c>Repository</c> section; null members mean "not supplied".</summary>
    internal sealed class RepositoryDto
    {
        public int? DiscoverPageSize { get; init; }
        public bool? WatchDeployEvents { get; init; }
    }

    /// <summary>JSON shape of the <c>Reconnect</c> section; null members mean "not supplied".</summary>
    internal sealed class ReconnectDto
    {
        public int? InitialBackoffMs { get; init; }
        public int? MaxBackoffMs { get; init; }
        public bool? ReplayOnSessionLost { get; init; }
    }
}

View File

@@ -0,0 +1,58 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
/// <summary>
/// Feeds the synthetic top-level transport-health entry into the
/// <see cref="HostStatusAggregator"/>. Every driver instance owns one entry, keyed by its
/// <c>MxAccess.ClientName</c>, that mirrors the gateway transport state — a single
/// "Galaxy is up" signal for dashboards, independent of any individual platform's
/// ScanState.
/// </summary>
/// <remarks>
/// The eventual production source for this signal is the gateway's <c>StreamSessionHealth</c>
/// RPC (mxaccessgw issue gw-6). Until that ships, the driver-side reconnect supervisor
/// (PR 4.5) calls <see cref="SetTransport"/> on transport state transitions:
/// <see cref="HostState.Running"/> when the gw session re-Registers, <see cref="HostState.Stopped"/>
/// when the supervisor moves to <c>TransportLost</c>. The forwarder keeps no state of its
/// own beyond the client name and a disposed flag, so the supervisor can drive it without
/// any back-pressure plumbing.
/// </remarks>
public sealed class HostConnectivityForwarder : IDisposable
{
    private readonly string _clientName;
    private readonly HostStatusAggregator _aggregator;
    private readonly ILogger _logger;
    private bool _disposed;

    /// <summary>Creates a forwarder that reports under <paramref name="clientName"/>.</summary>
    /// <param name="clientName">Host name of the transport entry; must not be blank.</param>
    /// <param name="aggregator">Aggregator receiving the transport status; must not be null.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when null.</param>
    /// <exception cref="ArgumentException"><paramref name="clientName"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="aggregator"/> is null.</exception>
    public HostConnectivityForwarder(string clientName, HostStatusAggregator aggregator, ILogger? logger = null)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(clientName);
        ArgumentNullException.ThrowIfNull(aggregator);

        _clientName = clientName;
        _aggregator = aggregator;
        _logger = logger ?? NullLogger.Instance;
    }

    /// <summary>
    /// Reports <paramref name="state"/> as the current transport state, stamped with the
    /// current UTC time. De-duplication happens in the aggregator — repeating the same
    /// state does not fan out duplicate transitions.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The forwarder has been disposed.</exception>
    public void SetTransport(HostState state)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        _aggregator.Update(new HostConnectivityStatus(_clientName, state, DateTime.UtcNow));

        _logger.LogDebug(
            "GalaxyDriver transport state for {ClientName}: {State}",
            _clientName, state);
    }

    /// <summary>
    /// Marks the forwarder disposed so later <see cref="SetTransport"/> calls throw.
    /// Nothing else is released today; the method is reserved for the eventual gw-6
    /// StreamSessionHealth consumer, which will own a long-running task torn down here.
    /// </summary>
    public void Dispose() => _disposed = true;
}

View File

@@ -0,0 +1,98 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
/// <summary>
/// Pure-logic merger for the per-host connectivity entries surfaced through
/// <see cref="IHostConnectivityProbe"/>. It keeps the current set of host statuses
/// (one synthetic top-level transport entry plus one entry per
/// <c>$WinPlatform</c>/<c>$AppEngine</c> probe) and raises
/// <see cref="OnHostStatusChanged"/> only when an upsert really changes a host's
/// <see cref="HostState"/>. Re-asserting an unchanged state raises nothing, so a stable
/// <c>ScanState=Running</c> burst doesn't fan out duplicate transitions.
/// </summary>
/// <remarks>
/// The de-dup + diff logic here used to live in
/// <c>GalaxyProxyDriver.OnHostConnectivityUpdate</c> in v1. Both the watcher
/// (<see cref="PerPlatformProbeWatcher"/>) and the transport forwarder
/// (<see cref="HostConnectivityForwarder"/>) feed this aggregator; the
/// <see cref="GalaxyDriver"/> reads <see cref="Snapshot"/> from
/// <c>IHostConnectivityProbe.GetHostStatuses()</c> and re-raises
/// <see cref="OnHostStatusChanged"/> as the driver-level event (wired in PR 4.W).
/// </remarks>
public sealed class HostStatusAggregator
{
    private readonly object _lock = new();

    // Latest status per host; host names compare case-insensitively.
    private readonly Dictionary<string, HostConnectivityStatus> _byHost =
        new(StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Raised when an <see cref="Update"/> call introduces a new host or moves an
    /// existing host to a different <see cref="HostState"/>. Handlers are invoked
    /// after the internal lock has been released, so they may safely call back into
    /// the aggregator (e.g. the driver re-broadcasting through
    /// <c>IHostConnectivityProbe</c>).
    /// </summary>
    public event EventHandler<HostStatusChangedEventArgs>? OnHostStatusChanged;

    /// <summary>
    /// Returns a copy of the currently tracked statuses. Suitable as the body of
    /// <c>IHostConnectivityProbe.GetHostStatuses()</c>.
    /// </summary>
    public IReadOnlyList<HostConnectivityStatus> Snapshot()
    {
        IReadOnlyList<HostConnectivityStatus> copy;
        lock (_lock)
        {
            copy = [.. _byHost.Values];
        }

        return copy;
    }

    /// <summary>
    /// Upserts <paramref name="status"/> by <see cref="HostConnectivityStatus.HostName"/>.
    /// The stored entry is always replaced; <see cref="OnHostStatusChanged"/> fires when
    /// the host was not tracked before (previous state reported as
    /// <see cref="HostState.Unknown"/>) or when its state differs from the cached one.
    /// Re-asserting the same state is silent.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="status"/> is null.</exception>
    public void Update(HostConnectivityStatus status)
    {
        ArgumentNullException.ThrowIfNull(status);

        // Defaults describe the "first sighting" case; overwritten when the host is known.
        var priorState = HostState.Unknown;
        var shouldNotify = true;

        lock (_lock)
        {
            if (_byHost.TryGetValue(status.HostName, out var cached))
            {
                priorState = cached.State;
                shouldNotify = priorState != status.State;
            }

            _byHost[status.HostName] = status;
        }

        if (!shouldNotify)
        {
            return;
        }

        // Raised outside the lock so handlers can re-enter the aggregator.
        OnHostStatusChanged?.Invoke(
            this,
            new HostStatusChangedEventArgs(status.HostName, priorState, status.State));
    }

    /// <summary>
    /// Forgets a host entirely (e.g. after a redeploy removes a Platform). No event is
    /// raised — observers only react to live transitions, not topology reductions.
    /// </summary>
    /// <returns><c>true</c> when the host was tracked; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentException"><paramref name="hostName"/> is null, empty or whitespace.</exception>
    public bool Remove(string hostName)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(hostName);

        bool wasTracked;
        lock (_lock)
        {
            wasTracked = _byHost.Remove(hostName);
        }

        return wasTracked;
    }
}

View File

@@ -0,0 +1,200 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
/// <summary>
/// Subscribes the <c>ScanState</c> attribute of every <c>$WinPlatform</c> /
/// <c>$AppEngine</c> object the discoverer surfaced and translates ScanState
/// value-changes into per-host <see cref="HostConnectivityStatus"/> updates.
/// Ports the state machine in
/// <c>Driver.Galaxy.Host/Backend/Stability/GalaxyRuntimeProbeManager.cs</c> onto the
/// gateway subscription path.
/// </summary>
/// <remarks>
/// Address grammar: each platform tag's probe address is
/// <c>{platformTagName}.ScanState</c>. The watcher subscribes that address through
/// <see cref="IGalaxySubscriber"/>; the EventPump (PR 4.4) routes inbound
/// OnDataChange events back via <see cref="OnProbeValueChanged"/>. State decoding:
/// <list type="bullet">
/// <item>Quality &lt; <c>192</c> (Good) → <see cref="HostState.Unknown"/>.</item>
/// <item>Value <c>1</c>, <c>true</c>, or "Running" → <see cref="HostState.Running"/>.</item>
/// <item>Value <c>0</c>, <c>false</c>, or "Stopped" → <see cref="HostState.Stopped"/>.</item>
/// <item>Anything else with Good quality → <see cref="HostState.Faulted"/>.</item>
/// </list>
/// <see cref="SyncPlatformsAsync"/> is idempotent — call it after every
/// Discover / Rediscover. Newly-added platforms are subscribed; removed ones are
/// unsubscribed (when they hold a live gateway handle) and dropped from both the
/// watched set and the aggregator.
/// </remarks>
public sealed class PerPlatformProbeWatcher : IDisposable
{
    /// <summary>Suffix appended to a platform tag name to form its probe address.</summary>
    public const string ProbeSuffix = ".ScanState";

    private readonly IGalaxySubscriber _subscriber;
    private readonly HostStatusAggregator _aggregator;
    private readonly ILogger _logger;
    private readonly int _bufferedUpdateIntervalMs;

    // Tracked platform → gw item handle. Item handle 0 means the gw rejected the subscribe;
    // we keep the entry (while the platform is still desired) so SyncPlatformsAsync doesn't
    // try to subscribe it again on every call.
    private readonly ConcurrentDictionary<string, int> _itemHandlesByPlatform =
        new(StringComparer.OrdinalIgnoreCase);

    private readonly Lock _syncLock = new();
    private bool _disposed;

    /// <summary>Creates a watcher that subscribes probes through <paramref name="subscriber"/>.</summary>
    /// <param name="subscriber">Gateway subscription surface; must not be null.</param>
    /// <param name="aggregator">Receives the decoded per-platform host states; must not be null.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when null.</param>
    /// <param name="bufferedUpdateIntervalMs">
    /// Update interval passed to every probe subscribe; 0 means use the gw's default cadence.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="subscriber"/> or <paramref name="aggregator"/> is null.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="bufferedUpdateIntervalMs"/> is negative.
    /// </exception>
    public PerPlatformProbeWatcher(
        IGalaxySubscriber subscriber,
        HostStatusAggregator aggregator,
        ILogger? logger = null,
        int bufferedUpdateIntervalMs = 0)
    {
        _subscriber = subscriber ?? throw new ArgumentNullException(nameof(subscriber));
        _aggregator = aggregator ?? throw new ArgumentNullException(nameof(aggregator));
        _logger = logger ?? NullLogger.Instance;
        if (bufferedUpdateIntervalMs < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(bufferedUpdateIntervalMs),
                "bufferedUpdateIntervalMs must be >= 0; 0 means use the gw's default cadence.");
        }
        _bufferedUpdateIntervalMs = bufferedUpdateIntervalMs;
    }

    /// <summary>Snapshot of platform tag names currently watched.</summary>
    public IReadOnlyCollection<string> WatchedPlatforms => [.. _itemHandlesByPlatform.Keys];

    /// <summary>
    /// Reconcile the watched platform set against <paramref name="platformTagNames"/>.
    /// Subscribes new entries and drops entries that are no longer desired, unsubscribing
    /// those that hold a live gateway handle. Calling with the same set is a no-op.
    /// </summary>
    /// <param name="platformTagNames">The complete desired set of platform tag names.</param>
    /// <param name="cancellationToken">Flows into the gateway subscribe / unsubscribe calls.</param>
    /// <remarks>
    /// An unsubscribe failure is logged and swallowed (the local entries are already
    /// gone); an exception from the subscribe call propagates to the caller, and the
    /// affected platforms stay untracked so a later call retries them.
    /// </remarks>
    /// <exception cref="ObjectDisposedException">The watcher has been disposed.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="platformTagNames"/> is null.</exception>
    public async Task SyncPlatformsAsync(
        IEnumerable<string> platformTagNames, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(platformTagNames);

        var desired = new HashSet<string>(platformTagNames, StringComparer.OrdinalIgnoreCase);

        // Compute deltas under the lock so the add/remove lists are derived from one
        // consistent view of the membership map.
        List<string> toAdd;
        int[] handlesToUnsubscribe;
        lock (_syncLock)
        {
            toAdd = [.. desired.Where(p => !_itemHandlesByPlatform.ContainsKey(p))];

            // Every entry that is no longer desired leaves the map — including entries
            // whose subscribe was rejected (handle 0). Leaving those behind would keep
            // them in WatchedPlatforms forever and would block a re-subscribe if the
            // platform is deployed again later.
            List<KeyValuePair<string, int>> dropped =
                [.. _itemHandlesByPlatform.Where(kvp => !desired.Contains(kvp.Key))];

            // Drop removed entries from the membership map up-front so a concurrent
            // OnProbeValueChanged for them is silently ignored. The unsubscribe RPC
            // runs outside the lock.
            foreach (var (platform, _) in dropped)
            {
                _itemHandlesByPlatform.TryRemove(platform, out _);
                _aggregator.Remove(platform);
            }

            // Only live handles exist on the gateway side; handle 0 has nothing to undo.
            handlesToUnsubscribe = [.. dropped.Where(kvp => kvp.Value > 0).Select(kvp => kvp.Value)];
        }

        if (handlesToUnsubscribe.Length > 0)
        {
            try
            {
                await _subscriber.UnsubscribeBulkAsync(
                    handlesToUnsubscribe, cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "PerPlatformProbeWatcher unsubscribe failed for {Count} probe(s); aggregator entries already cleared.",
                    handlesToUnsubscribe.Length);
            }
        }

        if (toAdd.Count == 0) return;

        var probeAddresses = toAdd.Select(p => p + ProbeSuffix).ToArray();

        // PR 6.3 — use the configured bufferedUpdateIntervalMs (defaults to 0 = gw cadence
        // when the driver hasn't overridden MxAccess.PublishingIntervalMs). Probe ScanState
        // changes are rare so a coarser interval is usually fine; deployments that need
        // tighter health visibility can dial it down through GalaxyDriverOptions.
        var results = await _subscriber.SubscribeBulkAsync(
            probeAddresses, _bufferedUpdateIntervalMs, cancellationToken).ConfigureAwait(false);

        for (var i = 0; i < toAdd.Count; i++)
        {
            var platform = toAdd[i];

            // Results are matched by address rather than by position; a missing or
            // unsuccessful result is recorded as handle 0.
            var match = results.FirstOrDefault(r => string.Equals(
                r.TagAddress, probeAddresses[i], StringComparison.OrdinalIgnoreCase));
            var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
            _itemHandlesByPlatform[platform] = itemHandle;

            if (itemHandle <= 0)
            {
                _logger.LogWarning(
                    "PerPlatformProbeWatcher subscribe failed for {Platform}: {Error}",
                    platform, match?.ErrorMessage ?? "<no result returned>");
            }
        }
    }

    /// <summary>
    /// Route an OnDataChange for a probe address into the aggregator. The EventPump
    /// (PR 4.4) calls this; tests can drive it directly to exercise the state machine
    /// without spinning a real gw. Foreign references (anything not ending in
    /// <see cref="ProbeSuffix"/>, or a probe for a platform we're not tracking) are
    /// silently dropped, as is every call made after disposal.
    /// </summary>
    /// <param name="fullReference">Full tag reference the value change belongs to.</param>
    /// <param name="value">New value; decoded by <see cref="DecodeState"/>.</param>
    /// <param name="qualityByte">Raw quality byte; decoded by <see cref="DecodeState"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="fullReference"/> is null.</exception>
    public void OnProbeValueChanged(string fullReference, object? value, byte qualityByte)
    {
        if (_disposed) return;
        ArgumentNullException.ThrowIfNull(fullReference);
        if (!fullReference.EndsWith(ProbeSuffix, StringComparison.OrdinalIgnoreCase)) return;

        // Strip the suffix to recover the platform tag name.
        var platform = fullReference[..^ProbeSuffix.Length];
        if (!_itemHandlesByPlatform.ContainsKey(platform)) return;

        var state = DecodeState(value, qualityByte);
        _aggregator.Update(new HostConnectivityStatus(platform, state, DateTime.UtcNow));
    }

    /// <summary>
    /// Decode a ScanState value + raw quality byte to a <see cref="HostState"/>.
    /// Public for tests that want to pin the decoding table.
    /// </summary>
    /// <returns>
    /// <see cref="HostState.Unknown"/> when <paramref name="qualityByte"/> is below 192;
    /// otherwise Running / Stopped for the recognised bool, int, short, long and string
    /// encodings, and <see cref="HostState.Faulted"/> for anything else (including null).
    /// </returns>
    public static HostState DecodeState(object? value, byte qualityByte)
    {
        if (qualityByte < 192) return HostState.Unknown;

        return value switch
        {
            bool b => b ? HostState.Running : HostState.Stopped,
            int i => i == 1 ? HostState.Running : i == 0 ? HostState.Stopped : HostState.Faulted,
            short s => s == 1 ? HostState.Running : s == 0 ? HostState.Stopped : HostState.Faulted,
            long l => l == 1 ? HostState.Running : l == 0 ? HostState.Stopped : HostState.Faulted,
            string str when string.Equals(str, "Running", StringComparison.OrdinalIgnoreCase) => HostState.Running,
            string str when string.Equals(str, "Stopped", StringComparison.OrdinalIgnoreCase) => HostState.Stopped,
            _ => HostState.Faulted,
        };
    }

    /// <summary>
    /// Clears the watched set and makes a best-effort attempt to unsubscribe every live
    /// probe handle. Safe to call more than once; unsubscribe errors are logged, not thrown.
    /// </summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;

        // Best-effort unsubscribe everything we know about. Run synchronously through
        // GetAwaiter().GetResult() since Dispose is sync; transport errors are swallowed.
        var liveHandles = _itemHandlesByPlatform.Values.Where(h => h > 0).ToArray();
        _itemHandlesByPlatform.Clear();
        if (liveHandles.Length > 0)
        {
            try { _subscriber.UnsubscribeBulkAsync(liveHandles, CancellationToken.None).GetAwaiter().GetResult(); }
            catch (Exception ex) { _logger.LogWarning(ex, "PerPlatformProbeWatcher dispose unsubscribe failed"); }
        }
    }
}

View File

@@ -0,0 +1,313 @@
using System.Diagnostics.Metrics;
using System.Threading.Channels;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using MxGateway.Contracts.Proto;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Long-running consumer of <see cref="IGalaxySubscriber.StreamEventsAsync"/>. Translates
/// each <see cref="MxEvent"/> with family <see cref="MxEventFamily.OnDataChange"/> into
/// <see cref="DataChangeEventArgs"/> and dispatches one event per registered driver
/// subscription that includes the changed item handle (fan-out via
/// <see cref="SubscriptionRegistry.ResolveSubscribers"/>).
/// <see cref="MxEventFamily.OnAlarmTransition"/> events are decoded and raised through
/// <see cref="OnAlarmTransition"/>; every other family is ignored.
/// </summary>
/// <remarks>
/// <para>
/// One pump per connected <see cref="GalaxyMxSession"/>. Reconnect lives in PR 4.5's
/// supervisor; a transport failure in the read loop is logged and ends that loop,
/// which in turn completes the channel so the dispatch loop drains and exits.
/// </para>
/// <para>
/// PR 6.2 — the network-read loop and the listener-fanout loop are decoupled by a
/// bounded <see cref="Channel{T}"/>. When a listener is slow enough to fill the
/// channel, new events are dropped (newest-dropped semantics: producer's
/// <c>TryWrite</c> fails) rather than back-pressuring the gw stream. Three counters
/// on the <c>ZB.MOM.WW.OtOpcUa.Driver.Galaxy</c> meter expose received / dispatched
/// / dropped totals so ops sees pressure before it manifests as user-visible loss.
/// </para>
/// <para>
/// A failure while decoding or dispatching a single event is logged and that event is
/// skipped; the dispatch loop keeps running so one malformed event cannot silence
/// every subscription until restart.
/// </para>
/// </remarks>
internal sealed class EventPump : IAsyncDisposable
{
    /// <summary>Name of the meter all pump counters are published on.</summary>
    public const string MeterName = "ZB.MOM.WW.OtOpcUa.Driver.Galaxy";

    private const int DefaultChannelCapacity = 50_000;

    // Single static meter so a host-level MeterListener catches all pump instances.
    private static readonly Meter Meter = new(MeterName);
    private static readonly Counter<long> EventsReceived =
        Meter.CreateCounter<long>("galaxy.events.received", unit: "{event}",
            description: "MxEvents read from the gateway StreamEvents stream.");
    private static readonly Counter<long> EventsDispatched =
        Meter.CreateCounter<long>("galaxy.events.dispatched", unit: "{event}",
            description: "MxEvents passed through the bounded channel and into OnDataChange.");
    private static readonly Counter<long> EventsDropped =
        Meter.CreateCounter<long>("galaxy.events.dropped", unit: "{event}",
            description: "MxEvents dropped because the bounded channel was full (newest-dropped).");
    private static readonly Counter<long> AlarmTransitionsReceived =
        Meter.CreateCounter<long>("galaxy.alarm_transitions.received", unit: "{event}",
            description: "OnAlarmTransition events decoded and forwarded to driver-level handlers.");
    private static readonly Counter<long> AlarmTransitionsDecodingFailures =
        Meter.CreateCounter<long>("galaxy.alarm_transitions.decoding_failures", unit: "{event}",
            description: "OnAlarmTransition events that arrived without a populated body or with an unspecified transition kind.");

    private readonly IGalaxySubscriber _subscriber;
    private readonly SubscriptionRegistry _registry;
    private readonly ILogger _logger;
    private readonly Func<long, ISubscriptionHandle> _handleFactory;
    private readonly Channel<MxEvent> _channel;

    // Tag attached to every counter measurement so totals can be split per client.
    private readonly KeyValuePair<string, object?> _clientTag;

    private readonly CancellationTokenSource _cts = new();
    private Task? _loop;
    private Task? _dispatchLoop;
    private bool _disposed;

    /// <summary>
    /// Raised once per (subscription, changed item) pair for every OnDataChange event
    /// that still has registered subscribers.
    /// </summary>
    public event EventHandler<DataChangeEventArgs>? OnDataChange;

    /// <summary>
    /// Fires for every <see cref="MxEventFamily.OnAlarmTransition"/> event the
    /// gateway forwards. Decoded into a <see cref="GalaxyAlarmTransition"/> with
    /// the OPC UA severity bucket already mapped via
    /// <see cref="MxAccessSeverityMapper"/>. The driver wraps this onto
    /// <c>IAlarmSource.OnAlarmEvent</c> in PR B.2.
    /// </summary>
    internal event EventHandler<GalaxyAlarmTransition>? OnAlarmTransition;

    /// <summary>Creates a pump; nothing is consumed until <see cref="Start"/> is called.</summary>
    /// <param name="subscriber">Source of the gateway event stream; must not be null.</param>
    /// <param name="registry">Maps item handles to the driver subscriptions watching them; must not be null.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when null.</param>
    /// <param name="handleFactory">
    /// Optional factory turning a subscription id into the handle placed on
    /// <see cref="DataChangeEventArgs"/>; defaults to creating a <see cref="GalaxySubscriptionHandle"/>.
    /// </param>
    /// <param name="channelCapacity">Capacity of the bounded hand-off channel; must be at least 1.</param>
    /// <param name="clientName">Value of the <c>galaxy.client</c> metric tag; <c>&lt;unknown&gt;</c> when null.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="subscriber"/> or <paramref name="registry"/> is null.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="channelCapacity"/> is below 1.</exception>
    public EventPump(
        IGalaxySubscriber subscriber,
        SubscriptionRegistry registry,
        ILogger? logger = null,
        Func<long, ISubscriptionHandle>? handleFactory = null,
        int channelCapacity = DefaultChannelCapacity,
        string? clientName = null)
    {
        _subscriber = subscriber ?? throw new ArgumentNullException(nameof(subscriber));
        _registry = registry ?? throw new ArgumentNullException(nameof(registry));
        _logger = logger ?? NullLogger.Instance;
        _handleFactory = handleFactory ?? (id => new GalaxySubscriptionHandle(id));

        if (channelCapacity < 1)
        {
            throw new ArgumentOutOfRangeException(nameof(channelCapacity),
                "channelCapacity must be >= 1; recommended 50_000 for 50k-tag deployments.");
        }

        _channel = Channel.CreateBounded<MxEvent>(new BoundedChannelOptions(channelCapacity)
        {
            // Newest-dropped policy: when full, the producer's TryWrite returns false
            // and we account for the drop. We do this manually rather than relying on
            // BoundedChannelFullMode.DropWrite so we can count drops without polling.
            FullMode = BoundedChannelFullMode.Wait,
            SingleReader = true,
            SingleWriter = true,
        });
        _clientTag = new KeyValuePair<string, object?>("galaxy.client", clientName ?? "<unknown>");
    }

    /// <summary>
    /// Start consuming the event stream on a background task. Idempotent — second
    /// calls are no-ops once the loops have been created.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The pump has been disposed.</exception>
    public void Start()
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        if (_loop is not null) return;
        _loop = Task.Run(() => RunAsync(_cts.Token));
        _dispatchLoop = Task.Run(() => DispatchLoopAsync(_cts.Token));
    }

    // Producer: reads the gateway stream and hands events to the channel without ever waiting on it.
    private async Task RunAsync(CancellationToken ct)
    {
        try
        {
            await foreach (var ev in _subscriber.StreamEventsAsync(ct).WithCancellation(ct).ConfigureAwait(false))
            {
                if (ct.IsCancellationRequested) break;

                EventsReceived.Add(1, _clientTag);

                // Newest-dropped: TryWrite fast-paths the common case (channel has room).
                // When full we count the drop and continue reading the gw stream so
                // back-pressure doesn't propagate upstream.
                if (!_channel.Writer.TryWrite(ev))
                {
                    EventsDropped.Add(1, _clientTag);
                }
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Clean shutdown — no log.
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "Galaxy EventPump loop ended with an exception — reconnect supervisor (PR 4.5) handles restart.");
        }
        finally
        {
            // Tell the dispatch loop the producer is done so it drains and exits.
            _channel.Writer.TryComplete();
        }
    }

    // Consumer: drains the channel and fans each event out to listeners.
    private async Task DispatchLoopAsync(CancellationToken ct)
    {
        try
        {
            await foreach (var ev in _channel.Reader.ReadAllAsync(ct).ConfigureAwait(false))
            {
                try
                {
                    Dispatch(ev);
                }
                catch (Exception ex)
                {
                    // Listener exceptions are already contained inside the Dispatch* methods;
                    // what reaches this point is a decode / mapping failure for this one
                    // event. Skip it instead of letting it end the loop — otherwise the
                    // producer would keep filling the channel and every later event would
                    // be dropped until restart.
                    _logger.LogWarning(ex,
                        "Galaxy EventPump failed to dispatch {Family} event (sequence={Sequence}) — skipping it and continuing.",
                        ev.Family, ev.WorkerSequence);
                    continue;
                }

                EventsDispatched.Add(1, _clientTag);
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Clean shutdown.
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "Galaxy EventPump dispatch loop ended with an exception — events past this point will be lost until restart.");
        }
    }

    // Routes an event to the handler for its family.
    private void Dispatch(MxEvent ev)
    {
        switch (ev.Family)
        {
            case MxEventFamily.OnDataChange:
                DispatchDataChange(ev);
                break;
            case MxEventFamily.OnAlarmTransition:
                DispatchAlarmTransition(ev);
                break;
            default:
                // OnWriteComplete / OperationComplete / OnBufferedDataChange are filtered
                // out — write callers get their reply via the InvokeAsync round-trip, not
                // via the event stream.
                return;
        }
    }

    // Raises OnDataChange once per subscription registered for the event's item handle.
    private void DispatchDataChange(MxEvent ev)
    {
        var subscribers = _registry.ResolveSubscribers(ev.ItemHandle);
        if (subscribers.Count == 0) return; // stale event after unsubscribe — drop quietly

        // Decode once and share the snapshot across every subscriber.
        var snapshot = ToSnapshot(ev);
        foreach (var (subscriptionId, fullReference) in subscribers)
        {
            var handle = _handleFactory(subscriptionId);
            try
            {
                OnDataChange?.Invoke(this, new DataChangeEventArgs(handle, fullReference, snapshot));
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "Galaxy OnDataChange handler threw for {FullRef} subscription {SubscriptionId} — continuing fan-out.",
                    fullReference, subscriptionId);
            }
        }
    }

    // Decodes an alarm-transition event and raises OnAlarmTransition; undecodable events are counted and dropped.
    private void DispatchAlarmTransition(MxEvent ev)
    {
        // Body absent (e.g. malformed gateway event or worker version skew) — count and
        // drop. The Part 9 sub-attribute fallback path keeps an alarm functional even
        // when the rich payload disappears.
        if (ev.OnAlarmTransition is not { } body)
        {
            AlarmTransitionsDecodingFailures.Add(1, _clientTag);
            _logger.LogDebug(
                "Galaxy OnAlarmTransition event arrived without a populated body (sequence={Sequence}); ignoring.",
                ev.WorkerSequence);
            return;
        }

        if (body.TransitionKind == AlarmTransitionKind.Unspecified)
        {
            AlarmTransitionsDecodingFailures.Add(1, _clientTag);
            _logger.LogDebug(
                "Galaxy OnAlarmTransition for {AlarmRef} has unspecified transition kind; ignoring.",
                body.AlarmFullReference);
            return;
        }

        var (bucket, opcUaSeverity) = MxAccessSeverityMapper.Map(body.Severity);

        // A missing transition timestamp falls back to "now"; a missing original raise
        // timestamp stays null.
        var transitionTimestamp = body.TransitionTimestamp is { } tts
            ? tts.ToDateTime()
            : DateTime.UtcNow;
        DateTime? originalRaiseTimestamp = body.OriginalRaiseTimestamp is { } orts
            ? orts.ToDateTime()
            : null;

        var transition = new GalaxyAlarmTransition(
            AlarmFullReference: body.AlarmFullReference,
            SourceObjectReference: body.SourceObjectReference,
            AlarmTypeName: body.AlarmTypeName,
            TransitionKind: MapTransitionKind(body.TransitionKind),
            SeverityBucket: bucket,
            OpcUaSeverity: opcUaSeverity,
            RawMxAccessSeverity: body.Severity,
            OriginalRaiseTimestampUtc: originalRaiseTimestamp,
            TransitionTimestampUtc: transitionTimestamp,
            OperatorUser: body.OperatorUser,
            OperatorComment: body.OperatorComment,
            Category: body.Category,
            Description: body.Description);

        AlarmTransitionsReceived.Add(1, _clientTag);

        try
        {
            OnAlarmTransition?.Invoke(this, transition);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "Galaxy OnAlarmTransition handler threw for {AlarmRef} — continuing.",
                transition.AlarmFullReference);
        }
    }

    // Gateway transition kind → driver-side kind; unmapped values become Unspecified.
    private static GalaxyAlarmTransitionKind MapTransitionKind(AlarmTransitionKind kind) => kind switch
    {
        AlarmTransitionKind.Raise => GalaxyAlarmTransitionKind.Raise,
        AlarmTransitionKind.Acknowledge => GalaxyAlarmTransitionKind.Acknowledge,
        AlarmTransitionKind.Clear => GalaxyAlarmTransitionKind.Clear,
        AlarmTransitionKind.Retrigger => GalaxyAlarmTransitionKind.Retrigger,
        _ => GalaxyAlarmTransitionKind.Unspecified,
    };

    // Builds the value snapshot for a data-change event.
    private DataValueSnapshot ToSnapshot(MxEvent ev)
    {
        var value = MxValueDecoder.Decode(ev.Value);

        // Prefer the first status entry when present; otherwise derive the status
        // from the low byte of the quality field.
        var statusCode = ev.Statuses.Count > 0
            ? StatusCodeMap.FromMxStatus(ev.Statuses[0], _logger)
            : StatusCodeMap.FromQualityByte((byte)(ev.Quality & 0xFF), _logger);
        DateTime? sourceTimestamp = ev.SourceTimestamp is { } ts ? ts.ToDateTime() : null;

        return new DataValueSnapshot(
            Value: value,
            StatusCode: statusCode,
            SourceTimestampUtc: sourceTimestamp,
            ServerTimestampUtc: DateTime.UtcNow);
    }

    /// <summary>
    /// Cancels both loops, completes the channel and waits for the loops to finish.
    /// Safe to call more than once; exceptions from the loops are swallowed.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        if (_disposed) return;
        _disposed = true;
        _cts.Cancel();
        _channel.Writer.TryComplete();

        if (_loop is not null)
        {
            try { await _loop.ConfigureAwait(false); } catch { /* shutdown */ }
        }
        if (_dispatchLoop is not null)
        {
            try { await _dispatchLoop.ConfigureAwait(false); } catch { /* shutdown */ }
        }
        _cts.Dispose();
    }
}

View File

@@ -0,0 +1,21 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Handle the driver hands back from <see cref="GalaxyDriver.SubscribeAlarmsAsync"/>.
/// Alarm transitions are not multiplexed per handle — every active handle observes the
/// gateway's alarm-event stream — but a handle is still required for a symmetric
/// Unsubscribe and so the server-side AlarmConditionService can correlate transitions
/// with the subscription that requested them.
/// </summary>
internal sealed class GalaxyAlarmSubscriptionHandle(string diagnosticId) : IAlarmSubscriptionHandle
{
    /// <inheritdoc />
    public string DiagnosticId { get; } = diagnosticId;
}

View File

@@ -0,0 +1,36 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Decoded MXAccess alarm transition surfaced by <see cref="EventPump"/>.
/// The driver wraps this into <see cref="AlarmEventArgs"/> on the
/// <see cref="IAlarmSource.OnAlarmEvent"/> path; the richer fields
/// (operator user/comment, original raise time, category) become available
/// on the OPC UA Part 9 condition once <c>AlarmEventArgs</c> is extended in
/// the client-surface refresh PR (E.7).
/// </summary>
/// <param name="AlarmFullReference">Full reference of the alarm, copied from the gateway event body.</param>
/// <param name="SourceObjectReference">Reference of the object that owns the alarm, copied from the gateway event body.</param>
/// <param name="AlarmTypeName">Alarm type name, copied from the gateway event body.</param>
/// <param name="TransitionKind">Kind of state change this transition represents.</param>
/// <param name="SeverityBucket">Severity bucket produced by <c>MxAccessSeverityMapper.Map</c> from the raw severity.</param>
/// <param name="OpcUaSeverity">OPC UA severity value produced by <c>MxAccessSeverityMapper.Map</c> from the raw severity.</param>
/// <param name="RawMxAccessSeverity">Severity exactly as reported by the gateway, before mapping.</param>
/// <param name="OriginalRaiseTimestampUtc">Original raise time; <c>null</c> when the gateway event carries none.</param>
/// <param name="TransitionTimestampUtc">
/// Time of the transition. <see cref="EventPump"/> substitutes the current UTC time when
/// the gateway event carries no transition timestamp.
/// </param>
/// <param name="OperatorUser">Operator user field, copied from the gateway event body.</param>
/// <param name="OperatorComment">Operator comment field, copied from the gateway event body.</param>
/// <param name="Category">Category field, copied from the gateway event body.</param>
/// <param name="Description">Description field, copied from the gateway event body.</param>
internal sealed record GalaxyAlarmTransition(
string AlarmFullReference,
string SourceObjectReference,
string AlarmTypeName,
GalaxyAlarmTransitionKind TransitionKind,
AlarmSeverity SeverityBucket,
int OpcUaSeverity,
int RawMxAccessSeverity,
DateTime? OriginalRaiseTimestampUtc,
DateTime TransitionTimestampUtc,
string OperatorUser,
string OperatorComment,
string Category,
string Description);
/// <summary>Kind of alarm state change observed by <see cref="EventPump"/>.</summary>
/// <remarks>
/// <see cref="EventPump"/> maps the gateway's <c>AlarmTransitionKind</c> onto this enum
/// member-for-member by name.
/// </remarks>
internal enum GalaxyAlarmTransitionKind
{
// Fallback for gateway kinds with no mapping. EventPump discards gateway events whose
// kind is unspecified before raising a transition.
Unspecified = 0,
// Mapped from the gateway's Raise kind.
Raise = 1,
// Mapped from the gateway's Acknowledge kind.
Acknowledge = 2,
// Mapped from the gateway's Clear kind.
Clear = 3,
// Mapped from the gateway's Retrigger kind.
Retrigger = 4,
}

View File

@@ -0,0 +1,102 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using MxGateway.Client;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Driver-side wrapper around the gateway's <see cref="MxGatewaySession"/>. Holds the
/// gateway client created by <see cref="ConnectAsync"/>, the open session, and the server
/// handle returned by MXAccess <c>Register</c>, and disposes the session and client in
/// <see cref="DisposeAsync"/>. PRs 4.2-4.5 fill this in incrementally:
/// <list type="bullet">
///   <item>PR 4.2 (this PR) — skeleton + lifecycle wiring.</item>
///   <item>PR 4.3 — write path.</item>
///   <item>PR 4.4 — subscription registry + event pump + the production
///     <see cref="IGalaxyDataReader"/> implementation that drives the read path.</item>
///   <item>PR 4.5 — reconnect supervisor.</item>
/// </list>
/// </summary>
/// <remarks>
/// No synchronisation is performed inside this type. NOTE(review): presumably callers
/// serialise <see cref="ConnectAsync"/> and <see cref="DisposeAsync"/> — confirm.
/// </remarks>
public sealed class GalaxyMxSession : IAsyncDisposable
{
    private readonly GalaxyMxAccessOptions _options;
    private readonly ILogger _logger;

    // Owned gateway client + session — populated when ConnectAsync completes successfully.
    // Tests can leave them null and exercise the surface via injected IGalaxyDataReader fakes.
    private MxGatewayClient? _ownedClient;
    private MxGatewaySession? _session;
    private int _serverHandle;
    private bool _disposed;

    /// <summary>Create a disconnected session wrapper.</summary>
    /// <param name="options">MXAccess options; <c>ClientName</c> is used for <c>Register</c>.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    public GalaxyMxSession(GalaxyMxAccessOptions options, ILogger? logger = null)
    {
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? NullLogger.Instance;
    }

    /// <summary>True once a gateway session has been opened or attached and not yet disposed.</summary>
    public bool IsConnected => _session is not null;

    /// <summary>
    /// Server-side handle returned by MXAccess <c>Register</c>. Zero before
    /// <see cref="ConnectAsync"/> opens the session.
    /// </summary>
    public int ServerHandle => _serverHandle;

    /// <summary>
    /// Connect the underlying gateway client + open an MXAccess session + register the
    /// configured client name. Idempotent — second calls are no-ops while
    /// <see cref="IsConnected"/> is true.
    /// </summary>
    /// <remarks>
    /// The client, session and server handle are published to the instance only after
    /// every step succeeded. If opening the session or registering fails, whatever was
    /// created so far is disposed (best-effort) and the exception is rethrown, leaving this
    /// instance disconnected so a later call can retry from scratch.
    /// </remarks>
    /// <param name="clientOptions">Options used to create the gateway client.</param>
    /// <param name="cancellationToken">Cancels the open / register calls.</param>
    /// <exception cref="ObjectDisposedException">The instance has been disposed.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="clientOptions"/> is null.</exception>
    public async Task ConnectAsync(MxGatewayClientOptions clientOptions, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        if (_session is not null) return;
        ArgumentNullException.ThrowIfNull(clientOptions);

        var client = MxGatewayClient.Create(clientOptions);
        MxGatewaySession? session = null;
        int serverHandle;
        try
        {
            session = await client.OpenSessionAsync(cancellationToken: cancellationToken).ConfigureAwait(false);
            serverHandle = await session.RegisterAsync(_options.ClientName, cancellationToken).ConfigureAwait(false);
        }
        catch
        {
            // Roll back partial state: without this a failed Register would leave the
            // instance reporting IsConnected with handle 0 and turn every retry into a no-op,
            // and a failed open would leak the freshly created client.
            if (session is not null)
            {
                try { await session.DisposeAsync().ConfigureAwait(false); }
                catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession session dispose after failed connect failed (best-effort)"); }
            }

            try { await client.DisposeAsync().ConfigureAwait(false); }
            catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession client dispose after failed connect failed (best-effort)"); }

            throw;
        }

        _ownedClient = client;
        _session = session;
        _serverHandle = serverHandle;

        _logger.LogInformation(
            "GalaxyMxSession connected — clientName={ClientName} serverHandle={Handle}",
            _options.ClientName, _serverHandle);
    }

    /// <summary>
    /// Test seam — attach a session opened externally (e.g. against an in-process gw
    /// fake). Skips the gateway-client construction so tests can drive the session
    /// surface without spinning a real gRPC channel. Caller retains client ownership.
    /// </summary>
    /// <remarks>
    /// The attached session itself is still disposed by <see cref="DisposeAsync"/>; only
    /// the client (which this path never sets) is left to the caller.
    /// </remarks>
    internal void AttachForTests(MxGatewaySession session, int serverHandle)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        _session = session ?? throw new ArgumentNullException(nameof(session));
        _serverHandle = serverHandle;
    }

    /// <summary>
    /// Returns the underlying gateway session. Null until <see cref="ConnectAsync"/> or
    /// <see cref="AttachForTests"/> runs. PR 4.3 / 4.4 use this to issue commands.
    /// </summary>
    public MxGatewaySession? Session => _session;

    /// <summary>
    /// Dispose the session first, then the owned client. Each step is best-effort: a
    /// failure is logged as a warning and does not prevent the next step. Safe to call
    /// more than once — later calls return immediately.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        if (_disposed) return;
        _disposed = true;

        if (_session is not null)
        {
            try { await _session.DisposeAsync().ConfigureAwait(false); }
            catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession session dispose failed (best-effort)"); }
        }
        _session = null;

        if (_ownedClient is not null)
        {
            try { await _ownedClient.DisposeAsync().ConfigureAwait(false); }
            catch (Exception ex) { _logger.LogWarning(ex, "GalaxyMxSession client dispose failed (best-effort)"); }
        }
        _ownedClient = null;
    }
}

View File

@@ -0,0 +1,12 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Subscription identity used inside the Galaxy driver. Carries the numeric subscription
/// id and exposes the same id with a <c>galaxy-sub-</c> prefix for log cross-referencing.
/// </summary>
/// <param name="SubscriptionId">Numeric id supplied by whoever creates the handle.</param>
internal sealed record GalaxySubscriptionHandle(long SubscriptionId) : ISubscriptionHandle
{
    /// <summary>The subscription id rendered as <c>galaxy-sub-&lt;id&gt;</c>.</summary>
    public string DiagnosticId
    {
        get { return $"galaxy-sub-{SubscriptionId}"; }
    }
}

View File

@@ -0,0 +1,35 @@
using System.Diagnostics;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// PR 6.1 — holder for the driver's <see cref="ActivitySource"/> plus a small helper for
/// marking a span as failed. The intent is one span per gateway call
/// (Subscribe/Unsubscribe, Write/WriteSecured, GetHierarchy), tagged with
/// <c>galaxy.tag_count</c>, <c>galaxy.success_count</c> and <c>galaxy.client</c>; the
/// code that creates and tags those spans is outside this type.
/// <para>
///   Only <c>System.Diagnostics</c> from the BCL is used here, so the driver needs no
///   OpenTelemetry package reference. The host process chooses which listener (OTLP
///   exporter, Application Insights, dotnet-trace) subscribes to
///   <see cref="ActivitySourceName"/>.
/// </para>
/// </summary>
internal static class GalaxyTelemetry
{
    /// <summary>Name listeners subscribe to in order to receive the driver's spans.</summary>
    public const string ActivitySourceName = "ZB.MOM.WW.OtOpcUa.Driver.Galaxy";

    /// <summary>Process-wide source created with <see cref="ActivitySourceName"/>.</summary>
    public static readonly ActivitySource ActivitySource = new ActivitySource(ActivitySourceName);

    /// <summary>
    /// Mark <paramref name="activity"/> as failed: status becomes
    /// <see cref="ActivityStatusCode.Error"/> with the exception message as description,
    /// and the <c>exception.type</c> / <c>exception.message</c> tags are set. Does
    /// nothing when <paramref name="activity"/> is null, so callers need no null check.
    /// </summary>
    /// <param name="activity">Span to annotate; may be null.</param>
    /// <param name="ex">Exception whose type name and message are recorded.</param>
    public static void RecordError(this Activity? activity, Exception ex)
    {
        if (activity is not null)
        {
            // SetStatus and SetTag each return the activity, so the calls chain in order.
            activity
                .SetStatus(ActivityStatusCode.Error, ex.Message)
                .SetTag("exception.type", ex.GetType().FullName)
                .SetTag("exception.message", ex.Message);
        }
    }
}

View File

@@ -0,0 +1,65 @@
using Microsoft.Extensions.Logging;
using MxGateway.Client;
using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Production <see cref="IGalaxyAlarmAcknowledger"/> that forwards acknowledgements
/// through <c>MxGatewayClient.AcknowledgeAlarmAsync</c> (PR E.2). A native MxStatus
/// failure carried in the reply is logged as a warning and not thrown, so an operator
/// workflow is not blocked by a transient MxAccess hiccup.
/// </summary>
/// <remarks>
/// NOTE(review): this class does not inspect a protocol-level status itself. Presumably
/// the gateway client raises an exception for non-OK protocol conditions — confirm.
/// </remarks>
internal sealed class GatewayGalaxyAlarmAcknowledger : IGalaxyAlarmAcknowledger
{
    private readonly MxGatewayClient _client;
    private readonly GalaxyMxSession _session;
    private readonly ILogger _logger;

    /// <summary>Wire the acknowledger to a gateway client, a driver session and a logger.</summary>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public GatewayGalaxyAlarmAcknowledger(
        MxGatewayClient client,
        GalaxyMxSession session,
        ILogger logger)
    {
        ArgumentNullException.ThrowIfNull(client);
        ArgumentNullException.ThrowIfNull(session);
        ArgumentNullException.ThrowIfNull(logger);

        _client = client;
        _session = session;
        _logger = logger;
    }

    /// <summary>
    /// Send one acknowledgement for <paramref name="alarmFullReference"/> using the
    /// session id of the currently connected gateway session.
    /// </summary>
    /// <param name="alarmFullReference">Alarm reference; must be non-null and non-empty.</param>
    /// <param name="comment">Operator comment; null is sent as an empty string.</param>
    /// <param name="operatorUser">Operator name; null is sent as an empty string.</param>
    /// <param name="cancellationToken">Cancels the gateway call.</param>
    /// <exception cref="ArgumentException"><paramref name="alarmFullReference"/> is null or empty.</exception>
    /// <exception cref="InvalidOperationException">The driver session has no gateway session.</exception>
    public async Task AcknowledgeAsync(
        string alarmFullReference,
        string comment,
        string operatorUser,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrEmpty(alarmFullReference);

        var gatewaySession = _session.Session;
        if (gatewaySession is null)
        {
            throw new InvalidOperationException(
                "GatewayGalaxyAlarmAcknowledger requires a connected GalaxyMxSession; underlying gateway session is null.");
        }

        var request = new AcknowledgeAlarmRequest
        {
            SessionId = gatewaySession.SessionId,
            ClientCorrelationId = Guid.NewGuid().ToString("N"),
            AlarmFullReference = alarmFullReference,
            Comment = comment ?? string.Empty,
            OperatorUser = operatorUser ?? string.Empty,
        };

        var reply = await _client.AcknowledgeAlarmAsync(request, cancellationToken).ConfigureAwait(false);

        // Only a present status with Success == 0 counts as a native rejection.
        if (reply.Status is not { Success: 0 } rejected)
        {
            return;
        }

        // Best-effort by design: log the rejection and let the operator retry through
        // the OPC UA session instead of failing the call.
        _logger.LogWarning(
            "Galaxy AcknowledgeAlarm for {AlarmRef} returned MxStatus failure: category={Category} detail={Detail} text={Text}",
            alarmFullReference, rejected.Category, rejected.Detail, rejected.DiagnosticText);
    }
}

View File

@@ -0,0 +1,162 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using MxGateway.Client;
using MxGateway.Contracts.Proto;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Production <see cref="IGalaxyDataWriter"/> over <see cref="GalaxyMxSession"/>.
/// For each batch entry: resolve the tag's <see cref="SecurityClassification"/>,
/// lazy-AddItem to obtain the MXAccess item handle, encode the value via
/// <see cref="MxValueEncoder"/>, route through Write or WriteSecured, and translate the
/// reply into an OPC UA <see cref="WriteResult"/>.
/// </summary>
/// <remarks>
/// Item handle cache survives across writes — repeated writes to the same tag avoid
/// re-AddItem. Per-tag failures are isolated: one bad write (including a failing
/// classification lookup) doesn't fail the batch; only cancellation aborts it.
/// PR 4.4 will share this cache with the subscription registry; for now it lives
/// here so the writer is independently testable.
/// </remarks>
public sealed class GatewayGalaxyDataWriter : IGalaxyDataWriter
{
    private readonly GalaxyMxSession _session;
    private readonly int _writeUserId;
    private readonly ILogger _logger;

    // Full reference → item handle returned by AddItem. Keys compare case-insensitively.
    private readonly ConcurrentDictionary<string, int> _itemHandles =
        new(StringComparer.OrdinalIgnoreCase);

    /// <summary>Create a writer bound to <paramref name="session"/>.</summary>
    /// <param name="session">Driver session whose gateway session carries the writes.</param>
    /// <param name="writeUserId">User id passed on non-secured writes.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="session"/> is null.</exception>
    public GatewayGalaxyDataWriter(GalaxyMxSession session, int writeUserId, ILogger? logger = null)
    {
        _session = session ?? throw new ArgumentNullException(nameof(session));
        _writeUserId = writeUserId;
        _logger = logger ?? NullLogger.Instance;
    }

    /// <summary>
    /// Write every entry of <paramref name="writes"/> sequentially and return one
    /// <see cref="WriteResult"/> per entry, in input order.
    /// </summary>
    /// <param name="writes">Batch of writes.</param>
    /// <param name="securityResolver">Maps a full reference to its security classification.</param>
    /// <param name="cancellationToken">Aborts the batch; cancellation propagates as an exception.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="InvalidOperationException">The session is not connected.</exception>
    public async Task<IReadOnlyList<WriteResult>> WriteAsync(
        IReadOnlyList<WriteRequest> writes,
        Func<string, SecurityClassification> securityResolver,
        CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(writes);
        ArgumentNullException.ThrowIfNull(securityResolver);

        var session = _session.Session
            ?? throw new InvalidOperationException(
                "GalaxyMxSession is not connected. Call ConnectAsync before issuing writes.");
        var serverHandle = _session.ServerHandle;

        var results = new WriteResult[writes.Count];
        for (var i = 0; i < writes.Count; i++)
        {
            // The resolver is invoked inside WriteOneAsync so that a throwing resolver
            // fails only this entry instead of the whole batch.
            results[i] = await WriteOneAsync(session, serverHandle, writes[i], securityResolver, cancellationToken)
                .ConfigureAwait(false);
        }
        return results;
    }

    /// <summary>
    /// Perform a single write and convert every non-cancellation failure into a
    /// Bad-status <see cref="WriteResult"/>.
    /// </summary>
    private async Task<WriteResult> WriteOneAsync(
        MxGatewaySession session, int serverHandle, WriteRequest request,
        Func<string, SecurityClassification> securityResolver, CancellationToken ct)
    {
        SecurityClassification classification;
        try
        {
            classification = securityResolver(request.FullReference);
        }
        catch (Exception ex)
        {
            // Fail closed: without a classification we cannot tell whether the tag needs
            // the secured path, so the write is refused rather than sent unsecured.
            _logger.LogWarning(ex,
                "GalaxyDriver write rejected — security classification lookup failed for {FullRef}",
                request.FullReference);
            return new WriteResult(StatusCodeMap.BadInternalError);
        }

        try
        {
            var itemHandle = await EnsureItemHandleAsync(session, serverHandle, request.FullReference, ct)
                .ConfigureAwait(false);
            var mxValue = MxValueEncoder.Encode(request.Value);

            var reply = NeedsSecuredWrite(classification)
                ? await InvokeWriteSecuredAsync(session, serverHandle, itemHandle, mxValue, ct).ConfigureAwait(false)
                : await session.WriteRawAsync(serverHandle, itemHandle, mxValue, _writeUserId, ct).ConfigureAwait(false);

            return TranslateReply(reply, request.FullReference);
        }
        catch (ArgumentException ex)
        {
            // Bad value type — caller passed a CLR type the encoder can't render.
            _logger.LogWarning(ex,
                "GalaxyDriver write rejected — unsupported value type for {FullRef}", request.FullReference);
            return new WriteResult(StatusCodeMap.BadInternalError);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested) { throw; }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "GalaxyDriver write failed for {FullRef}", request.FullReference);
            return new WriteResult(StatusCodeMap.BadCommunicationError);
        }
    }

    /// <summary>True for the classifications that are routed through WriteSecured.</summary>
    private static bool NeedsSecuredWrite(SecurityClassification classification) =>
        classification is SecurityClassification.SecuredWrite or SecurityClassification.VerifiedWrite;

    /// <summary>
    /// Return the cached item handle for <paramref name="fullRef"/>, calling AddItem and
    /// caching the result on first use.
    /// </summary>
    private async Task<int> EnsureItemHandleAsync(
        MxGatewaySession session, int serverHandle, string fullRef, CancellationToken ct)
    {
        if (_itemHandles.TryGetValue(fullRef, out var existing)) return existing;

        var handle = await session.AddItemAsync(serverHandle, fullRef, ct).ConfigureAwait(false);
        _itemHandles[fullRef] = handle;
        return handle;
    }

    /// <summary>
    /// Issue a WriteSecured command. The high-level session client doesn't expose
    /// <c>WriteSecuredAsync</c> as a typed method — we build the <see cref="MxCommand"/>
    /// directly and route through <c>InvokeAsync</c>. Verifier user is left at zero
    /// for SecuredWrite; VerifiedWrite uses the same path because the gw's worker
    /// interprets the underlying MXAccess command kind.
    /// </summary>
    private static Task<MxCommandReply> InvokeWriteSecuredAsync(
        MxGatewaySession session, int serverHandle, int itemHandle, MxValue value, CancellationToken ct)
    {
        var command = new MxCommand
        {
            Kind = MxCommandKind.WriteSecured,
            WriteSecured = new WriteSecuredCommand
            {
                ServerHandle = serverHandle,
                ItemHandle = itemHandle,
                Value = value,
                // NOTE(review): the non-secured path passes the configured write user id,
                // but this path sends 0 for the current user as well as the verifier.
                // Confirm that is intended before relying on secured writes.
                CurrentUserId = 0,
                VerifierUserId = 0,
            },
        };

        var request = new MxCommandRequest
        {
            SessionId = session.SessionId,
            ClientCorrelationId = Guid.NewGuid().ToString("N"),
            Command = command,
        };
        return session.InvokeAsync(request, ct);
    }

    /// <summary>
    /// Translate a gateway <see cref="MxCommandReply"/> into an OPC UA
    /// <see cref="WriteResult"/>. Honours the protocol-level Status field first
    /// (transport / dispatch failures), then the first MXAccess status row. A reply
    /// with neither a protocol failure nor any status row is reported as Good.
    /// </summary>
    private WriteResult TranslateReply(MxCommandReply reply, string fullRef)
    {
        // Protocol status — wraps transport / worker-side failures that happen before
        // MXAccess saw the command.
        if (reply.ProtocolStatus is { } proto && proto.Code != ProtocolStatusCode.Ok)
        {
            _logger.LogWarning(
                "GalaxyDriver write protocol failure {Code} for {FullRef}: {Message}",
                proto.Code, fullRef, proto.Message);
            return new WriteResult(StatusCodeMap.BadCommunicationError);
        }

        // MX-side status — the worker's WriteCompleteEvent rolls into the reply's
        // statuses array. Use the first row (single-write contract).
        if (reply.Statuses.Count > 0)
        {
            var status = reply.Statuses[0];
            return new WriteResult(StatusCodeMap.FromMxStatus(status, _logger));
        }

        return new WriteResult(StatusCodeMap.Good);
    }
}

View File

@@ -0,0 +1,116 @@
using MxGateway.Client;
using MxGateway.Contracts.Proto;
// Use the generated nested status enum for the SetBufferedUpdateInterval reply check.
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Production <see cref="IGalaxySubscriber"/> over a connected
/// <see cref="GalaxyMxSession"/>. Forwards SubscribeBulk / UnsubscribeBulk to the
/// gateway session and exposes the session's event stream.
/// </summary>
/// <remarks>
/// PR 6.3 wired the per-call <c>buffered_update_interval_ms</c> through
/// <see cref="SubscribeBulkAsync"/>. The gw's contract is session-level
/// (<c>SetBufferedUpdateInterval</c> applies to all buffered subscriptions on the
/// server handle), so we cache the last-applied (server handle, interval) pair and
/// skip redundant calls.
/// </remarks>
public sealed class GatewayGalaxySubscriber : IGalaxySubscriber
{
    private readonly GalaxyMxSession _session;
    private readonly Lock _intervalLock = new();

    // Guarded by _intervalLock. The interval is cached together with the server handle it
    // was applied to, because the setting is per server handle: a different handle must
    // get the command again even when the requested interval is unchanged.
    private int _lastAppliedIntervalMs = -1; // -1 = never applied; 0 = explicit "use gw default"
    private int _lastAppliedServerHandle;

    /// <summary>Create a subscriber bound to <paramref name="session"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="session"/> is null.</exception>
    public GatewayGalaxySubscriber(GalaxyMxSession session)
    {
        _session = session ?? throw new ArgumentNullException(nameof(session));
    }

    /// <summary>
    /// Subscribe <paramref name="fullReferences"/> on the current server handle. When
    /// <paramref name="bufferedUpdateIntervalMs"/> is positive the session-level buffered
    /// interval is applied first; zero or negative leaves the interval untouched.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="fullReferences"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The session is not connected.</exception>
    public async Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
        IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(fullReferences);

        var session = _session.Session
            ?? throw new InvalidOperationException(
                "GalaxyMxSession is not connected. Call ConnectAsync before subscribing.");
        var serverHandle = _session.ServerHandle;

        // The gw's SubscribeBulk RPC doesn't carry a per-call interval — buffered cadence
        // is session-level, set via SetBufferedUpdateInterval. Apply it before the
        // SubscribeBulk so the very first events on the new handles publish at the
        // requested cadence. Skip when the last-applied value already matches.
        if (bufferedUpdateIntervalMs > 0)
        {
            await EnsureSessionIntervalAsync(session, serverHandle, bufferedUpdateIntervalMs, cancellationToken)
                .ConfigureAwait(false);
        }

        return await session.SubscribeBulkAsync(serverHandle, fullReferences, cancellationToken)
            .ConfigureAwait(false);
    }

    /// <summary>
    /// Apply the gateway's session-level <c>SetBufferedUpdateInterval</c> command. The
    /// gw's contract is "for this server handle, every buffered subscription publishes
    /// at this cadence" — there's no per-handle granularity, so we cache the last
    /// applied value per server handle and skip redundant calls.
    /// </summary>
    /// <remarks>
    /// The check and the update of the cache are two separate critical sections around
    /// the await, so concurrent callers may each send the command once; the last writer
    /// wins the cache.
    /// </remarks>
    private async Task EnsureSessionIntervalAsync(
        MxGatewaySession session, int serverHandle, int intervalMs, CancellationToken cancellationToken)
    {
        lock (_intervalLock)
        {
            if (_lastAppliedServerHandle == serverHandle && _lastAppliedIntervalMs == intervalMs) return;
        }

        var reply = await session.InvokeAsync(
            new MxCommandRequest
            {
                SessionId = session.SessionId,
                ClientCorrelationId = Guid.NewGuid().ToString("N"),
                Command = new MxCommand
                {
                    Kind = MxCommandKind.SetBufferedUpdateInterval,
                    SetBufferedUpdateInterval = new SetBufferedUpdateIntervalCommand
                    {
                        ServerHandle = serverHandle,
                        UpdateIntervalMilliseconds = intervalMs,
                    },
                },
            },
            cancellationToken).ConfigureAwait(false);

        if (reply.ProtocolStatus?.Code is not (ProtocolStatusCode.Ok or ProtocolStatusCode.MxaccessFailure))
        {
            // Don't throw on a soft failure — the SubscribeBulk will still succeed at the
            // gw's default cadence, which is functional just not the requested cadence.
            // Nothing is cached, so the next subscribe attempts the command again. No
            // warning is logged here: this class has no logger.
            // NOTE(review): a reply without a protocol status also lands here, and
            // MxaccessFailure is cached as if applied — confirm both are intended.
            return;
        }

        lock (_intervalLock)
        {
            _lastAppliedServerHandle = serverHandle;
            _lastAppliedIntervalMs = intervalMs;
        }
    }

    /// <summary>Unsubscribe <paramref name="itemHandles"/>; an empty list is a no-op.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="itemHandles"/> is null.</exception>
    /// <exception cref="InvalidOperationException">The session is not connected.</exception>
    public async Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(itemHandles);
        if (itemHandles.Count == 0) return;

        var session = _session.Session
            ?? throw new InvalidOperationException(
                "GalaxyMxSession is not connected. UnsubscribeBulk called after disconnect.");
        var serverHandle = _session.ServerHandle;

        await session.UnsubscribeBulkAsync(serverHandle, itemHandles, cancellationToken)
            .ConfigureAwait(false);
    }

    /// <summary>
    /// Return the gateway session's event stream, requested from worker sequence 0.
    /// The connected-session check happens when this method is called, not when the
    /// stream is first enumerated.
    /// </summary>
    /// <exception cref="InvalidOperationException">The session is not connected.</exception>
    public IAsyncEnumerable<MxEvent> StreamEventsAsync(CancellationToken cancellationToken)
    {
        var session = _session.Session
            ?? throw new InvalidOperationException(
                "GalaxyMxSession is not connected. StreamEventsAsync called before ConnectAsync.");
        return session.StreamEventsAsync(afterWorkerSequence: 0, cancellationToken);
    }
}

View File

@@ -0,0 +1,32 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Test seam for the gateway-side Acknowledge call. Production wraps the
/// <c>MxGatewayClient.AcknowledgeAlarmAsync</c> RPC; tests substitute a fake
/// so <see cref="GalaxyDriver.AcknowledgeAsync"/> can be exercised without a
/// running gateway.
/// </summary>
internal interface IGalaxyAlarmAcknowledger
{
    /// <summary>
    /// Forward a single alarm acknowledgement to the gateway. The gateway
    /// translates this to an MxAccess Acknowledge call against the worker's
    /// session and returns the native MxStatus on the reply.
    /// </summary>
    /// <param name="alarmFullReference">
    /// Fully-qualified alarm reference (e.g. <c>"Tank01.Level.HiHi"</c>).
    /// </param>
    /// <param name="comment">Operator-supplied comment forwarded to MxAccess.</param>
    /// <param name="operatorUser">
    /// Operator principal performing the acknowledgement. Resolved from the
    /// OPC UA session by the server-side ACL layer before reaching the driver.
    /// </param>
    /// <param name="cancellationToken">Cancels the gateway RPC.</param>
    /// <returns>
    /// A task that completes when the gateway call has returned. The task carries no
    /// result, so a native MxStatus failure is not observable through this interface.
    /// </returns>
    Task AcknowledgeAsync(
        string alarmFullReference,
        string comment,
        string operatorUser,
        CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,27 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Driver-side seam for one-shot reads. Production implementation (PR 4.4) wraps
/// <c>MxGatewaySession</c>'s SubscribeBulk + StreamEvents path to obtain values; tests
/// substitute a fake returning canned snapshots.
/// </summary>
/// <remarks>
/// The interface is deliberately minimal — no per-tag overload, no continuation
/// points. The driver-side <c>IReadable.ReadAsync</c> contract guarantees a value per
/// requested tag in input order, with status codes carrying the per-tag failure mode
/// (e.g. BadInternalError for transport failure on a single tag, BadOutOfService for
/// a tag the gateway didn't recognise).
/// </remarks>
public interface IGalaxyDataReader
{
    /// <summary>
    /// Read each <paramref name="fullReferences"/> entry once and return one
    /// <see cref="DataValueSnapshot"/> per request entry, in input order.
    /// Implementations MUST return the same length as the input — partial-tag
    /// failures are encoded as Bad-quality snapshots, not omitted.
    /// </summary>
    /// <param name="fullReferences">Full references of the tags to read.</param>
    /// <param name="cancellationToken">Cancels the read.</param>
    /// <returns>One snapshot per entry of <paramref name="fullReferences"/>, in the same order.</returns>
    Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
        IReadOnlyList<string> fullReferences, CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,33 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Driver-side seam for batched writes. Production implementation routes by
/// <see cref="SecurityClassification"/>: SecuredWrite / VerifiedWrite go through
/// <c>MxCommandKind.WriteSecured</c>, everything else through the gateway session's
/// plain write call (<c>MxGatewaySession.WriteRawAsync</c> in
/// <see cref="GatewayGalaxyDataWriter"/>). Tests substitute a fake to record routing
/// decisions without touching real gw infrastructure.
/// </summary>
public interface IGalaxyDataWriter
{
    /// <summary>
    /// Write each <paramref name="writes"/> entry; return one
    /// <see cref="WriteResult"/> per request entry, in input order. Implementations
    /// MUST return the same length as the input — partial-tag failures are encoded
    /// as Bad-status results, not omitted.
    /// </summary>
    /// <param name="writes">Pairs of full reference + value to write.</param>
    /// <param name="securityResolver">
    /// Maps a full reference to its discovered <see cref="SecurityClassification"/>
    /// so the writer can route SecuredWrite / VerifiedWrite tags through the
    /// <c>WriteSecured</c> command instead of <c>Write</c>. Returns
    /// <see cref="SecurityClassification.FreeAccess"/> when the tag isn't tracked
    /// (the safest default — non-secured Write).
    /// </param>
    /// <param name="cancellationToken">Aborts the in-flight batch.</param>
    /// <returns>One result per entry of <paramref name="writes"/>, in the same order.</returns>
    Task<IReadOnlyList<WriteResult>> WriteAsync(
        IReadOnlyList<WriteRequest> writes,
        Func<string, SecurityClassification> securityResolver,
        CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,32 @@
using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Driver-side seam for subscription lifecycle + the inbound event stream. Production
/// wraps <c>MxGatewaySession.SubscribeBulkAsync</c>, <c>UnsubscribeBulkAsync</c>, and
/// <c>StreamEventsAsync</c>; tests substitute a fake to drive synthetic events through
/// the <see cref="EventPump"/> without a real gw.
/// </summary>
public interface IGalaxySubscriber
{
    /// <summary>
    /// Subscribe a batch of tag full references. Returns one
    /// <see cref="SubscribeResult"/> per request entry, in input order. Failed tags
    /// (gateway rejection) carry a non-zero status and an item handle of zero or
    /// negative — the caller treats those as per-tag failures rather than a whole-call
    /// failure.
    /// </summary>
    /// <param name="fullReferences">Full references of the tags to subscribe.</param>
    /// <param name="bufferedUpdateIntervalMs">
    /// Requested buffered update interval in milliseconds. The production implementation
    /// (<see cref="GatewayGalaxySubscriber"/>) only forwards values greater than zero;
    /// zero or negative leaves the gateway's current cadence untouched.
    /// </param>
    /// <param name="cancellationToken">Cancels the subscribe call.</param>
    Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
        IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken);

    /// <summary>Unsubscribe a batch of item handles obtained from <see cref="SubscribeBulkAsync"/>.</summary>
    /// <param name="itemHandles">Item handles to release.</param>
    /// <param name="cancellationToken">Cancels the unsubscribe call.</param>
    Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken);

    /// <summary>
    /// Long-running consumer of the gateway's <c>StreamEvents</c> RPC. Each emitted
    /// <see cref="MxEvent"/> carries the gw item handle the caller correlates against
    /// its <see cref="SubscriptionRegistry"/>.
    /// </summary>
    /// <param name="cancellationToken">Stops the stream.</param>
    IAsyncEnumerable<MxEvent> StreamEventsAsync(CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,55 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Converts a raw MXAccess alarm severity (nominally 0-999) into an
/// <see cref="AlarmSeverity"/> bucket together with the OPC UA Part 9 numeric severity
/// (1-1000 scale) used for that bucket.
/// </summary>
/// <remarks>
/// <para>
///   Four buckets are used — Low / Medium / High / Critical, reported as
///   250 / 500 / 750 / 1000. According to the original author this is the same ladder
///   v1's <c>GalaxyAlarmTracker</c> exposed (<c>docs/v1/AlarmTracking.md</c>), kept so
///   alarms are not re-classified when the v2 path takes over.
/// </para>
/// <para>
///   No input is rejected: anything below 250 (including negative values) lands in
///   Low and anything from 750 upwards (including 1000 and above) lands in Critical,
///   so slightly out-of-range severities still flow through the alarm path.
/// </para>
/// </remarks>
internal static class MxAccessSeverityMapper
{
    /// <summary>OPC UA Part 9 severity reported for the Low bucket (raw severity below 250).</summary>
    public const int OpcUaSeverityLow = 250;

    /// <summary>OPC UA Part 9 severity reported for the Medium bucket (raw severity 250-499).</summary>
    public const int OpcUaSeverityMedium = 500;

    /// <summary>OPC UA Part 9 severity reported for the High bucket (raw severity 500-749).</summary>
    public const int OpcUaSeverityHigh = 750;

    /// <summary>OPC UA Part 9 severity reported for the Critical bucket (raw severity 750 and above).</summary>
    public const int OpcUaSeverityCritical = 1000;

    /// <summary>
    /// Pick the bucket for <paramref name="rawMxAccessSeverity"/> and return it with the
    /// bucket's OPC UA numeric severity.
    /// </summary>
    /// <param name="rawMxAccessSeverity">Severity as reported by MXAccess.</param>
    /// <returns>The bucket and its OPC UA Part 9 severity.</returns>
    public static (AlarmSeverity Bucket, int OpcUaSeverity) Map(int rawMxAccessSeverity) =>
        // Arms are tested top to bottom, so each upper bound implies the previous lower bound.
        rawMxAccessSeverity switch
        {
            < 250 => (AlarmSeverity.Low, OpcUaSeverityLow),
            < 500 => (AlarmSeverity.Medium, OpcUaSeverityMedium),
            < 750 => (AlarmSeverity.High, OpcUaSeverityHigh),
            _ => (AlarmSeverity.Critical, OpcUaSeverityCritical),
        };
}

View File

@@ -0,0 +1,54 @@
using Google.Protobuf.WellKnownTypes;
using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Converts gateway-side <see cref="MxValue"/> instances into boxed CLR objects for
/// <c>DataValueSnapshot.Value</c>. Scalars map to bool, int, long, float, double, string
/// and DateTime; <see cref="MxArray"/> payloads map to arrays of those element types;
/// a <c>raw_value</c> payload is returned as a byte array so the consumer can opt to
/// deserialise it. Any other case decodes to null.
/// </summary>
internal static class MxValueDecoder
{
    /// <summary>Decode <paramref name="value"/> into a boxed CLR value.</summary>
    /// <param name="value">Gateway value; may be null.</param>
    /// <returns>
    /// The decoded value, or null when <paramref name="value"/> is null, is flagged
    /// <c>IsNull</c>, or holds a case this decoder does not handle.
    /// </returns>
    public static object? Decode(MxValue? value)
    {
        if (value is null || value.IsNull)
        {
            return null;
        }

        switch (value.KindCase)
        {
            case MxValue.KindOneofCase.BoolValue:
                return value.BoolValue;
            case MxValue.KindOneofCase.Int32Value:
                return value.Int32Value;
            case MxValue.KindOneofCase.Int64Value:
                return value.Int64Value;
            case MxValue.KindOneofCase.FloatValue:
                return value.FloatValue;
            case MxValue.KindOneofCase.DoubleValue:
                return value.DoubleValue;
            case MxValue.KindOneofCase.StringValue:
                return value.StringValue;
            case MxValue.KindOneofCase.TimestampValue:
                return DecodeTimestamp(value.TimestampValue);
            case MxValue.KindOneofCase.ArrayValue:
                return DecodeArray(value.ArrayValue);
            case MxValue.KindOneofCase.RawValue:
                return value.RawValue.ToByteArray();
            default:
                return null;
        }
    }

    /// <summary>Convert a protobuf timestamp to a DateTime; null stays null.</summary>
    private static DateTime? DecodeTimestamp(Timestamp? ts)
    {
        if (ts is null)
        {
            return null;
        }

        return ts.ToDateTime();
    }

    /// <summary>
    /// Copy the populated element list of <paramref name="array"/> into a CLR array.
    /// Returns null for a null array or when no handled element list is set.
    /// </summary>
    private static object? DecodeArray(MxArray? array)
    {
        if (array is null)
        {
            return null;
        }

        switch (array.ValuesCase)
        {
            case MxArray.ValuesOneofCase.BoolValues:
                return array.BoolValues.Values.ToArray();
            case MxArray.ValuesOneofCase.Int32Values:
                return array.Int32Values.Values.ToArray();
            case MxArray.ValuesOneofCase.Int64Values:
                return array.Int64Values.Values.ToArray();
            case MxArray.ValuesOneofCase.FloatValues:
                return array.FloatValues.Values.ToArray();
            case MxArray.ValuesOneofCase.DoubleValues:
                return array.DoubleValues.Values.ToArray();
            case MxArray.ValuesOneofCase.StringValues:
                return array.StringValues.Values.ToArray();
            case MxArray.ValuesOneofCase.TimestampValues:
            {
                // Convert element by element; the result is a DateTime[] in source order.
                var converted = new List<DateTime>();
                foreach (var stamp in array.TimestampValues.Values)
                {
                    converted.Add(stamp.ToDateTime());
                }

                return converted.ToArray();
            }
            default:
                return null;
        }
    }
}

View File

@@ -0,0 +1,85 @@
using Google.Protobuf.WellKnownTypes;
using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Translates boxed CLR values from <c>WriteRequest.Value</c> into gateway-side
/// <see cref="MxValue"/> instances. Inverse of <see cref="MxValueDecoder"/>.
/// Handles Boolean, Int32, Int64, Float32, Float64, String and DateTime (plus
/// <see cref="DateTimeOffset"/>), narrower and unsigned integers that fit the Int32 /
/// Int64 wire fields, and 1-D arrays of bool, int, long, float, double, string and
/// DateTime. A null input is encoded as an <see cref="MxValue"/> with
/// <c>IsNull = true</c>. Unsupported types and unsigned values too large for their wire
/// field throw <see cref="ArgumentException"/> so the IWritable caller can fail the write
/// with a clear status code rather than silently mis-typing the wire payload.
/// </summary>
internal static class MxValueEncoder
{
    /// <summary>Encode <paramref name="value"/> as an <see cref="MxValue"/>.</summary>
    /// <param name="value">Boxed CLR value, or null for an explicit null.</param>
    /// <returns>The wire representation of <paramref name="value"/>.</returns>
    /// <exception cref="ArgumentException">
    /// The runtime type is not supported, or an unsigned value does not fit the signed
    /// wire field it would be carried in.
    /// </exception>
    public static MxValue Encode(object? value)
    {
        if (value is null) return new MxValue { IsNull = true };

        switch (value)
        {
            case bool b: return new MxValue { BoolValue = b };
            case sbyte i8: return new MxValue { Int32Value = i8 };
            case short i16: return new MxValue { Int32Value = i16 };
            case int i32: return new MxValue { Int32Value = i32 };
            case byte u8: return new MxValue { Int32Value = u8 };
            case ushort u16: return new MxValue { Int32Value = u16 };
            case uint u32 when u32 <= int.MaxValue: return new MxValue { Int32Value = (int)u32 };
            case uint tooLarge:
                // Reported separately: the generic "unsupported type" text would wrongly
                // suggest that uint itself is the problem.
                throw new ArgumentException(
                    $"Cannot encode UInt32 value {tooLarge} as MxValue: it exceeds Int32.MaxValue " +
                    $"({int.MaxValue}), the largest value the Int32 wire field can carry.",
                    nameof(value));
            case long i64: return new MxValue { Int64Value = i64 };
            case ulong u64 when u64 <= long.MaxValue: return new MxValue { Int64Value = (long)u64 };
            case ulong tooLarge:
                throw new ArgumentException(
                    $"Cannot encode UInt64 value {tooLarge} as MxValue: it exceeds Int64.MaxValue " +
                    $"({long.MaxValue}), the largest value the Int64 wire field can carry.",
                    nameof(value));
            case float f32: return new MxValue { FloatValue = f32 };
            case double f64: return new MxValue { DoubleValue = f64 };
            case string s: return new MxValue { StringValue = s };
            case DateTime dt: return new MxValue { TimestampValue = Timestamp.FromDateTime(EnsureUtc(dt)) };
            case DateTimeOffset dto: return new MxValue { TimestampValue = Timestamp.FromDateTimeOffset(dto) };

            case bool[] arr: return EncodeArray(arr, (mx, vs) => mx.BoolValues = ToBoolArray(vs));
            case int[] arr: return EncodeArray(arr, (mx, vs) => mx.Int32Values = ToInt32Array(vs));
            case long[] arr: return EncodeArray(arr, (mx, vs) => mx.Int64Values = ToInt64Array(vs));
            case float[] arr: return EncodeArray(arr, (mx, vs) => mx.FloatValues = ToFloatArray(vs));
            case double[] arr: return EncodeArray(arr, (mx, vs) => mx.DoubleValues = ToDoubleArray(vs));
            case string[] arr: return EncodeArray(arr, (mx, vs) => mx.StringValues = ToStringArray(vs));
            case DateTime[] arr: return EncodeArray(arr, (mx, vs) => mx.TimestampValues = ToTimestampArray(vs));

            default:
                throw new ArgumentException(
                    $"Cannot encode value of type {value.GetType()} as MxValue. Supported: " +
                    "bool, int / long (and their unsigned variants), float, double, string, DateTime, " +
                    "and their 1-D array variants.",
                    nameof(value));
        }
    }

    /// <summary>
    /// Wrap <paramref name="values"/> in an <see cref="MxArray"/>: <paramref name="populate"/>
    /// sets the typed element list, then the single dimension (the array length) is recorded.
    /// </summary>
    private static MxValue EncodeArray<T>(T[] values, Action<MxArray, T[]> populate)
    {
        var array = new MxArray();
        populate(array, values);
        array.Dimensions.Add((uint)values.Length);
        return new MxValue { ArrayValue = array };
    }

    // One helper per element type: copy the CLR array into the matching protobuf list message.
    private static BoolArray ToBoolArray(bool[] vs) { var a = new BoolArray(); a.Values.AddRange(vs); return a; }
    private static Int32Array ToInt32Array(int[] vs) { var a = new Int32Array(); a.Values.AddRange(vs); return a; }
    private static Int64Array ToInt64Array(long[] vs) { var a = new Int64Array(); a.Values.AddRange(vs); return a; }
    private static FloatArray ToFloatArray(float[] vs) { var a = new FloatArray(); a.Values.AddRange(vs); return a; }
    private static DoubleArray ToDoubleArray(double[] vs) { var a = new DoubleArray(); a.Values.AddRange(vs); return a; }
    private static StringArray ToStringArray(string[] vs) { var a = new StringArray(); a.Values.AddRange(vs); return a; }

    /// <summary>Convert each element to a UTC protobuf timestamp, preserving order.</summary>
    private static TimestampArray ToTimestampArray(DateTime[] vs)
    {
        var a = new TimestampArray();
        foreach (var dt in vs) a.Values.Add(Timestamp.FromDateTime(EnsureUtc(dt)));
        return a;
    }

    /// <summary>
    /// <see cref="Timestamp.FromDateTime"/> requires UTC. Convert non-UTC inputs
    /// explicitly so a caller passing local time gets predictable wire bytes. Local
    /// values are converted to UTC; values of unspecified kind are relabelled as UTC
    /// without shifting the clock value.
    /// </summary>
    private static DateTime EnsureUtc(DateTime dt) => dt.Kind switch
    {
        DateTimeKind.Utc => dt,
        DateTimeKind.Local => dt.ToUniversalTime(),
        _ => DateTime.SpecifyKind(dt, DateTimeKind.Utc),
    };
}

View File

@@ -0,0 +1,268 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Coordinates GalaxyDriver's recovery from gateway transport failure. Drives a
/// state machine — <c>Healthy → TransportLost → Reopening → Replaying → Healthy</c>
/// — and exposes the current state through a snapshot + change event so the
/// driver's <c>DriverHealth</c> reflects <c>Degraded</c> while we're not in
/// <c>Healthy</c>.
/// </summary>
/// <remarks>
/// <para>
/// The supervisor doesn't own the session, the subscription registry, or the
/// event pump. It receives transport-failure signals from the rest of the
/// driver (EventPump throws, a gw RPC raises, the heartbeat times out), runs
/// a one-attempt-at-a-time recovery loop, and lets the rest of the driver
/// continue serving cached state during recovery.
/// </para>
/// <para>
/// <b>Reopen</b>: caller-supplied callback that re-opens the gw session +
/// re-Registers the MXAccess client. Throws on failure.
/// </para>
/// <para>
/// <b>Replay</b>: caller-supplied callback that re-establishes every active
/// subscription. Production wraps gw's <c>ReplaySubscriptionsCommand</c>
/// (mxaccessgw issue #0.3); when that's not available, the callback falls
/// back to walking the SubscriptionRegistry and re-issuing SubscribeBulk for
/// every tracked tag.
/// </para>
/// <para>
/// Backoff is capped exponential — first retry after
/// <see cref="ReconnectOptions.InitialBackoff"/>, doubled per failed attempt,
/// capped at <see cref="ReconnectOptions.MaxBackoff"/>. A failed reopen or
/// replay sends the loop back to backoff and then a fresh reopen; the
/// supervisor never gives up on its own — operators / Phase 6.4 soak handle
/// that policy.
/// </para>
/// <para>
/// <b>Threading</b>: all state is guarded by a single lock.
/// <see cref="StateChanged"/> handlers are invoked synchronously while that
/// lock is held (on the reporting thread for <c>TransportLost</c>, on the
/// recovery task otherwise), so handlers should be short. Calling
/// <see cref="Dispose"/> from inside a handler that runs on the recovery task
/// would wait on that same task and must be avoided.
/// </para>
/// </remarks>
public sealed class ReconnectSupervisor : IDisposable
{
    /// <summary>Recovery state machine.</summary>
    public enum State
    {
        /// <summary>No recovery in progress.</summary>
        Healthy,

        /// <summary>A transport failure was reported; the recovery loop has been scheduled.</summary>
        TransportLost,

        /// <summary>The reopen callback is running (or the loop is backing off after a failed reopen).</summary>
        Reopening,

        /// <summary>The replay callback is running (or the loop is backing off after a failed replay).</summary>
        Replaying,
    }

    /// <summary>
    /// Upper bound on the doubling exponent. <c>1L &lt;&lt; 62</c> is the largest power of two
    /// that is still a positive <see cref="long"/>; clamping here also avoids C#'s masking of
    /// 64-bit shift counts to their low six bits, which would otherwise wrap the multiplier
    /// back to 1 once the exponent reached 64.
    /// </summary>
    private const int MaxBackoffShift = 62;

    private readonly Func<CancellationToken, Task> _reopen;
    private readonly Func<CancellationToken, Task> _replay;
    private readonly ReconnectOptions _options;
    private readonly ILogger _logger;
    private readonly Func<int, TimeSpan, TimeSpan, TimeSpan>? _backoffDelay;
    private readonly Lock _stateLock = new();

    // Everything below is guarded by _stateLock.
    private State _state = State.Healthy;
    private string? _lastError;
    private DateTime? _lastTransitionUtc;
    private Task? _recoveryLoop;
    private CancellationTokenSource? _loopCts;
    private bool _disposed;

    /// <summary>Fires after every state transition. Raised synchronously under the state lock.</summary>
    public event EventHandler<StateTransition>? StateChanged;

    /// <summary>Creates a supervisor in the <see cref="State.Healthy"/> state.</summary>
    /// <param name="reopen">Callback that re-opens the gateway session; throws on failure.</param>
    /// <param name="replay">Callback that re-establishes subscriptions; throws on failure.</param>
    /// <param name="options">Backoff knobs; defaults to <c>new ReconnectOptions()</c> when null.</param>
    /// <param name="logger">Diagnostic logger; defaults to <see cref="NullLogger.Instance"/> when null.</param>
    /// <param name="backoffDelay">
    /// Optional override for the backoff computation, called as
    /// <c>(attempt, initialBackoff, maxBackoff)</c> for every attempt after the first.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="reopen"/> or <paramref name="replay"/> is null.</exception>
    public ReconnectSupervisor(
        Func<CancellationToken, Task> reopen,
        Func<CancellationToken, Task> replay,
        ReconnectOptions? options = null,
        ILogger? logger = null,
        Func<int, TimeSpan, TimeSpan, TimeSpan>? backoffDelay = null)
    {
        _reopen = reopen ?? throw new ArgumentNullException(nameof(reopen));
        _replay = replay ?? throw new ArgumentNullException(nameof(replay));
        _options = options ?? new ReconnectOptions();
        _logger = logger ?? NullLogger.Instance;
        _backoffDelay = backoffDelay;
    }

    /// <summary>Current state. Healthy = fully recovered + subscriptions live.</summary>
    public State CurrentState
    {
        get { lock (_stateLock) return _state; }
    }

    /// <summary>True when CurrentState != Healthy. Drivers map this to DriverState.Degraded.</summary>
    public bool IsDegraded
    {
        get { lock (_stateLock) return _state != State.Healthy; }
    }

    /// <summary>
    /// Message of the most recent reported failure or failed reopen/replay attempt;
    /// reset to null when recovery reaches <see cref="State.Healthy"/>.
    /// </summary>
    public string? LastError
    {
        get { lock (_stateLock) return _lastError; }
    }

    /// <summary>UTC time of the most recent state transition; null until the first transition.</summary>
    public DateTime? LastTransitionUtc
    {
        get { lock (_stateLock) return _lastTransitionUtc; }
    }

    /// <summary>
    /// Notify the supervisor that a gw transport failure has been observed. Idempotent —
    /// repeated calls during an in-flight recovery do not start a parallel loop. The
    /// first call spawns a background task that drives reopen → replay until it
    /// succeeds or <see cref="Dispose"/> cancels it.
    /// </summary>
    /// <param name="cause">The observed failure; its message becomes <see cref="LastError"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="cause"/> is null.</exception>
    /// <exception cref="ObjectDisposedException">The supervisor has been disposed.</exception>
    public void ReportTransportFailure(Exception cause)
    {
        ArgumentNullException.ThrowIfNull(cause);
        lock (_stateLock)
        {
            // Checked under the lock so a concurrent Dispose cannot slip in between the
            // check and the loop being spawned (which would leave an unowned loop running).
            ObjectDisposedException.ThrowIf(_disposed, this);

            _lastError = cause.Message;
            if (_state != State.Healthy)
            {
                // Already recovering — nothing else to do.
                _logger.LogDebug("Transport failure reported during {State}: {Message}", _state, cause.Message);
                return;
            }

            TransitionLocked(State.TransportLost, cause.Message);

            // State was Healthy, so any previous loop has already made its final transition
            // and no longer touches its token — release that cycle's source before replacing it.
            _loopCts?.Dispose();

            var cts = new CancellationTokenSource();
            // Capture the token now: reading the _loopCts field lazily inside the Task.Run
            // lambda would race with Dispose nulling / disposing it.
            var token = cts.Token;
            _loopCts = cts;
            _recoveryLoop = Task.Run(() => RecoveryLoopAsync(token));
        }
    }

    /// <summary>
    /// Wait until the current recovery cycle reaches Healthy. Returns immediately when
    /// already Healthy. Useful for tests and for orchestration that wants to gate calls
    /// on recovery completing.
    /// </summary>
    /// <remarks>
    /// Polls <see cref="IsDegraded"/> every 50 ms. If <paramref name="cancellationToken"/>
    /// is cancelled during a poll delay the resulting <see cref="OperationCanceledException"/>
    /// propagates; if it is observed cancelled at a poll boundary the method returns
    /// without throwing, so callers that care should re-check <see cref="IsDegraded"/>.
    /// </remarks>
    public async Task WaitForHealthyAsync(CancellationToken cancellationToken)
    {
        while (!cancellationToken.IsCancellationRequested && IsDegraded)
        {
            await Task.Delay(50, cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Background loop: backoff (from the second attempt on) → reopen → replay, repeated
    /// until both callbacks succeed or <paramref name="ct"/> is cancelled.
    /// </summary>
    private async Task RecoveryLoopAsync(CancellationToken ct)
    {
        var attempt = 0;
        while (!ct.IsCancellationRequested)
        {
            attempt++;
            if (attempt > 1)
            {
                var delay = ComputeBackoff(attempt);
                _logger.LogInformation(
                    "Galaxy reconnect attempt {Attempt} — waiting {Delay} before retry", attempt, delay);
                try { await Task.Delay(delay, ct).ConfigureAwait(false); }
                catch (OperationCanceledException) { return; }
            }

            // === Reopening phase ===
            lock (_stateLock) TransitionLocked(State.Reopening, _lastError);
            try
            {
                await _reopen(ct).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested) { return; }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Galaxy reopen failed (attempt {Attempt})", attempt);
                lock (_stateLock) { _lastError = ex.Message; }
                continue; // back to backoff + retry
            }

            // === Replaying phase ===
            lock (_stateLock) TransitionLocked(State.Replaying, _lastError);
            try
            {
                await _replay(ct).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested) { return; }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Galaxy replay failed (attempt {Attempt})", attempt);
                lock (_stateLock) { _lastError = ex.Message; }
                continue; // back to backoff + retry
            }

            // === Done ===
            lock (_stateLock)
            {
                _lastError = null;
                TransitionLocked(State.Healthy, null);
            }

            _logger.LogInformation("Galaxy reconnect succeeded after {Attempt} attempt(s)", attempt);
            return;
        }
    }

    /// <summary>
    /// Delay to wait before <paramref name="attempt"/> (2 or higher). Uses the injected
    /// delegate when one was supplied; otherwise
    /// <c>InitialBackoff * 2^(attempt-2)</c> capped at <c>MaxBackoff</c>.
    /// </summary>
    private TimeSpan ComputeBackoff(int attempt)
    {
        if (_backoffDelay is not null)
            return _backoffDelay(attempt, _options.InitialBackoff, _options.MaxBackoff);

        // Attempt 2 → InitialBackoff, attempt 3 → 2x, attempt 4 → 4x, etc.
        var initialTicks = _options.InitialBackoff.Ticks;
        var maxTicks = _options.MaxBackoff.Ticks;
        var multiplier = 1L << Math.Clamp(attempt - 2, 0, MaxBackoffShift);

        // A non-positive initial backoff falls back to the cap. The division form of
        // "initialTicks * multiplier > maxTicks" cannot overflow, unlike the product.
        if (initialTicks <= 0 || initialTicks > maxTicks / multiplier)
            return TimeSpan.FromTicks(maxTicks);

        return TimeSpan.FromTicks(initialTicks * multiplier);
    }

    /// <summary>
    /// Applies a state change and raises <see cref="StateChanged"/>. No-op when
    /// <paramref name="next"/> equals the current state. Caller must hold <c>_stateLock</c>.
    /// Exceptions thrown by handlers are logged and swallowed so a faulty subscriber
    /// cannot break recovery.
    /// </summary>
    private void TransitionLocked(State next, string? cause)
    {
        if (next == _state) return;

        var previous = _state;
        var now = DateTime.UtcNow;
        _state = next;
        _lastTransitionUtc = now;

        var transition = new StateTransition(previous, next, cause, now);
        try { StateChanged?.Invoke(this, transition); }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "Galaxy reconnect StateChanged handler threw — continuing.");
        }
    }

    /// <summary>
    /// Cancels any in-flight recovery loop and blocks until it has exited. Safe to call
    /// more than once. After disposal <see cref="ReportTransportFailure"/> throws
    /// <see cref="ObjectDisposedException"/>.
    /// </summary>
    public void Dispose()
    {
        CancellationTokenSource? cts;
        Task? loop;
        lock (_stateLock)
        {
            if (_disposed) return;
            _disposed = true;
            cts = _loopCts;
            loop = _recoveryLoop;
            _loopCts = null;
            _recoveryLoop = null;
        }

        // Cancel and wait outside the lock: the loop needs the lock to finish its
        // current transition before it can observe the cancellation.
        cts?.Cancel();
        if (loop is not null)
        {
            try { loop.GetAwaiter().GetResult(); }
            catch (Exception ex)
            {
                // Dispose must not throw; the loop is gone either way.
                _logger.LogDebug(ex, "Galaxy reconnect loop faulted during shutdown.");
            }
        }

        cts?.Dispose();
    }
}
/// <summary>
/// One state transition observed by the supervisor.
/// </summary>
/// <param name="Previous">State the supervisor was in before the transition.</param>
/// <param name="Next">State the supervisor moved to.</param>
/// <param name="Cause">Error message associated with the transition, or <c>null</c> when there is none.</param>
/// <param name="AtUtc">UTC timestamp recorded for the transition.</param>
public sealed record StateTransition(
ReconnectSupervisor.State Previous,
ReconnectSupervisor.State Next,
string? Cause,
DateTime AtUtc);
/// <summary>
/// Backoff settings consumed by the reconnect supervisor. The driver options record
/// (PR 4.0) carries its own reconnect settings that map onto this type; keeping the two
/// separate lets the supervisor be tested without the full driver options surface.
/// </summary>
/// <param name="InitialBackoffOverride">Delay before the first retry; 500 ms when null.</param>
/// <param name="MaxBackoffOverride">Upper bound for the retry delay; 30 s when null.</param>
public sealed record ReconnectOptions(
    TimeSpan? InitialBackoffOverride = null,
    TimeSpan? MaxBackoffOverride = null)
{
    /// <summary>Effective delay before the first retry.</summary>
    public TimeSpan InitialBackoff
    {
        get { return InitialBackoffOverride.GetValueOrDefault(TimeSpan.FromMilliseconds(500)); }
    }

    /// <summary>Effective cap on the retry delay.</summary>
    public TimeSpan MaxBackoff
    {
        get { return MaxBackoffOverride.GetValueOrDefault(TimeSpan.FromSeconds(30)); }
    }
}

View File

@@ -0,0 +1,118 @@
using Microsoft.Extensions.Logging;
using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Maps the gateway's <see cref="MxStatusProxy"/> (raw MXAccess HRESULT + category bits)
/// to OPC UA <c>StatusCode</c> uints. Replaces the legacy
/// <c>MxAccessGalaxyBackend.ToWire</c> heuristic (Quality &gt;= 192 → Good, else Uncertain)
/// with an explicit table that preserves specific codes (BadNotConnected, OutOfService,
/// UncertainSubNormal, etc.) instead of collapsing to category buckets.
/// </summary>
/// <remarks>
/// An OPC DA quality word is 16 bits; its LOW byte is laid out as <c>QQSSSSLL</c>:
/// Q = quality category (Bad/Uncertain/Good = 0/1/3), S = substatus, L = limit bits.
/// The high byte is vendor-specific. This mapper consumes the low byte only, and only
/// recognises values whose limit bits are zero — anything else takes the category
/// fallback.
///
/// Unknown values fall back to the matching category bucket (<c>Good</c>,
/// <c>Uncertain</c>, <c>Bad</c>) and emit a warning through the supplied logger on
/// every such call so field captures can extend the table. No de-duplication is done
/// here; rate limiting is left to the logging configuration.
/// </remarks>
internal static class StatusCodeMap
{
    // OPC UA standard StatusCodes — the top two bits carry the severity
    // (0x0… Good, 0x4… Uncertain, 0x8… Bad); the upper 16 bits identify the code.
    // Values follow the OPC Foundation StatusCode table.
    public const uint Good = 0x00000000u;
    public const uint GoodLocalOverride = 0x00960000u;
    public const uint Uncertain = 0x40000000u;
    public const uint UncertainLastUsableValue = 0x40900000u;
    public const uint UncertainSensorNotAccurate = 0x40930000u;
    public const uint UncertainEngineeringUnitsExceeded = 0x40940000u;
    public const uint UncertainSubNormal = 0x40950000u;
    public const uint Bad = 0x80000000u;
    public const uint BadConfigurationError = 0x80890000u;
    public const uint BadNotConnected = 0x808A0000u;
    public const uint BadDeviceFailure = 0x808B0000u;
    public const uint BadSensorFailure = 0x808C0000u;
    public const uint BadCommunicationError = 0x80050000u;
    public const uint BadOutOfService = 0x808D0000u;
    public const uint BadWaitingForInitialData = 0x80320000u;
    public const uint BadInternalError = 0x80020000u;

    /// <summary>
    /// Map a raw OPC DA quality byte (the low byte of an OPC DA <c>OpcQuality</c> ushort,
    /// which is what Wonderware Historian + MXAccess surface as <c>OPCITEMSTATE.qLong</c>'s
    /// low byte) to the OPC UA StatusCode uint.
    /// </summary>
    /// <param name="q">Quality byte to translate.</param>
    /// <param name="logger">Optional logger that receives a warning when <paramref name="q"/> is not in the table.</param>
    /// <returns>The specific OPC UA code when known, otherwise the bare category code.</returns>
    public static uint FromQualityByte(byte q, ILogger? logger = null) => q switch
    {
        // Good family — top two bits 11b (192-255).
        192 => Good,
        216 => GoodLocalOverride,            // 0xD8 OPC_QUALITY_LOCAL_OVERRIDE

        // Uncertain family — top two bits 01b (64-127).
        64 => Uncertain,
        68 => UncertainLastUsableValue,      // 0x44 OPC_QUALITY_LAST_USABLE
        80 => UncertainSensorNotAccurate,    // 0x50 OPC_QUALITY_SENSOR_CAL
        84 => UncertainEngineeringUnitsExceeded, // 0x54 OPC_QUALITY_EGU_EXCEEDED
        88 => UncertainSubNormal,            // 0x58 OPC_QUALITY_SUB_NORMAL

        // Bad family — top two bits 00b (0-63).
        0 => Bad,
        4 => BadConfigurationError,          // 0x04 OPC_QUALITY_CONFIG_ERROR
        8 => BadNotConnected,                // 0x08 OPC_QUALITY_NOT_CONNECTED
        12 => BadDeviceFailure,              // 0x0C OPC_QUALITY_DEVICE_FAILURE
        16 => BadSensorFailure,              // 0x10 OPC_QUALITY_SENSOR_FAILURE
        // 0x14 OPC_QUALITY_LAST_KNOWN: communications failed but a last known value is
        // available — surfaced as a communication error.
        20 => BadCommunicationError,
        24 => BadCommunicationError,         // 0x18 OPC_QUALITY_COMM_FAILURE
        28 => BadOutOfService,               // 0x1C OPC_QUALITY_OUT_OF_SERVICE
        32 => BadWaitingForInitialData,      // 0x20 OPC_QUALITY_WAITING_FOR_INITIAL_DATA

        _ => Categorize(q, logger),
    };

    /// <summary>
    /// Map a gateway-reported <see cref="MxStatusProxy"/> to OPC UA StatusCode. Honors
    /// the success flag, then the detail byte (treated as a quality substatus), with a
    /// transport-error fallback for status rows whose detected_by indicates the failure
    /// happened before the MXAccess call ran.
    /// </summary>
    /// <param name="status">Gateway status; <c>null</c> is treated as success.</param>
    /// <param name="logger">Optional logger forwarded to <see cref="FromQualityByte"/>.</param>
    /// <returns>
    /// <see cref="Good"/> for a null or successful status; otherwise the mapping of the low
    /// byte of <c>Detail</c> when non-zero, <see cref="BadCommunicationError"/> when
    /// <c>RawDetectedBy</c> is non-zero, and <see cref="Bad"/> as the last resort.
    /// </returns>
    public static uint FromMxStatus(MxStatusProxy? status, ILogger? logger = null)
    {
        if (status is null) return Good;
        if (status.Success != 0) return Good;

        // Detail field carries the substatus when the worker translated MX-style codes;
        // when zero, infer from category + detected_by.
        var detail = (byte)(status.Detail & 0xFF);
        if (detail != 0) return FromQualityByte(detail, logger);

        // detected_by != Mxaccess (raw_detected_by != the MXAccess source enum) implies
        // the failure happened pre-call (gateway, worker, transport) — surface as a
        // communication error rather than a generic Bad.
        if (status.RawDetectedBy != 0) return BadCommunicationError;

        return Bad;
    }

    /// <summary>
    /// Category-only fallback for quality bytes missing from the table: 192+ → Good,
    /// 64-191 → Uncertain, below 64 → Bad. Logs the unrecognised byte.
    /// </summary>
    private static uint Categorize(byte q, ILogger? logger)
    {
        if (q >= 192) { Log(logger, q, "Good"); return Good; }
        if (q >= 64) { Log(logger, q, "Uncertain"); return Uncertain; }
        Log(logger, q, "Bad");
        return Bad;
    }

    /// <summary>Emits the "unrecognised quality byte" warning when a logger was supplied.</summary>
    private static void Log(ILogger? logger, byte q, string bucket)
    {
        // Best-effort diagnostic so field captures can extend the table. This method does
        // no de-duplication itself; the original note here says production relies on
        // Serilog filters to rate-limit the warning — NOTE(review): confirm that filter
        // is configured before depending on it.
        logger?.LogWarning(
            "Unrecognised MXAccess quality byte 0x{Q:X2} — falling back to {Bucket} category. " +
            "Field capture welcome — extend StatusCodeMap.FromQualityByte.", q, bucket);
    }
}

View File

@@ -0,0 +1,106 @@
using System.Collections.Concurrent;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// Bookkeeping for live subscriptions. Maps each driver-issued <c>SubscriptionId</c> to the
/// set of (full-reference, gw item-handle) pairs the gateway returned, and maintains the
/// reverse map (item-handle → set of driver subscriptions) so the
/// <see cref="EventPump"/> can fan out a single OnDataChange event to every driver
/// subscription that includes the changed tag.
/// </summary>
/// <remarks>
/// <para>
/// A tag may legitimately appear in multiple driver subscriptions (separate clients or
/// OPC UA monitored items observing the same Galaxy attribute). Using a single shared
/// gw subscription per session and fanning out on the driver side keeps the gateway's
/// work bounded; the reverse map is the fan-out index.
/// </para>
/// <para>
/// Concurrency: <see cref="Register"/> and <see cref="Remove"/> are serialised by a
/// private lock so an add and a remove touching the same item handle cannot overwrite
/// each other's update. The reverse map stores immutable arrays that are replaced
/// wholesale, so <see cref="ResolveSubscribers"/> reads without taking the lock.
/// </para>
/// </remarks>
internal sealed class SubscriptionRegistry
{
    private readonly ConcurrentDictionary<long, SubscriptionEntry> _bySubscriptionId = new();

    // item handle → subscription ids. Each array is treated as immutable: writers swap in
    // a new array under _mutationLock, readers enumerate whichever snapshot they fetched.
    private readonly ConcurrentDictionary<int, long[]> _subscribersByItemHandle = new();

    private readonly Lock _mutationLock = new();
    private long _nextSubscriptionId;

    /// <summary>Number of subscriptions currently registered.</summary>
    public int TrackedSubscriptionCount => _bySubscriptionId.Count;

    /// <summary>Number of distinct gw item handles that have at least one subscriber.</summary>
    public int TrackedItemHandleCount => _subscribersByItemHandle.Count;

    /// <summary>Allocate a fresh subscription id. Monotonic; unique per registry lifetime.</summary>
    public long NextSubscriptionId() => Interlocked.Increment(ref _nextSubscriptionId);

    /// <summary>
    /// Register a subscription and the per-tag item handles the gateway returned for it.
    /// Failed tags (item handle = 0 or negative) are stored anyway so unsubscribe can
    /// emit per-tag UnsubscribeBulk for the ones that did succeed. Registering an id that
    /// is already present replaces the earlier registration.
    /// </summary>
    /// <param name="subscriptionId">Driver-side subscription id.</param>
    /// <param name="bindings">Every (full reference, item handle) pair of the subscription.</param>
    /// <exception cref="ArgumentNullException"><paramref name="bindings"/> is null.</exception>
    public void Register(long subscriptionId, IReadOnlyList<TagBinding> bindings)
    {
        ArgumentNullException.ThrowIfNull(bindings);

        var entry = new SubscriptionEntry(subscriptionId, bindings);
        lock (_mutationLock)
        {
            // Drop the reverse-map links of an earlier registration under the same id so
            // they cannot outlive it.
            if (_bySubscriptionId.TryGetValue(subscriptionId, out var previous))
                Unlink(previous);

            _bySubscriptionId[subscriptionId] = entry;
            foreach (var handle in entry.ReferenceByHandle.Keys)
            {
                _subscribersByItemHandle[handle] =
                    _subscribersByItemHandle.TryGetValue(handle, out var existing)
                        ? [.. existing, subscriptionId]
                        : [subscriptionId];
            }
        }
    }

    /// <summary>
    /// Remove a subscription. Returns the bindings the caller should pass to
    /// <c>UnsubscribeBulkAsync</c>; null when the id was never registered.
    /// </summary>
    public IReadOnlyList<TagBinding>? Remove(long subscriptionId)
    {
        lock (_mutationLock)
        {
            if (!_bySubscriptionId.TryRemove(subscriptionId, out var entry)) return null;
            Unlink(entry);
            return entry.Bindings;
        }
    }

    /// <summary>
    /// Look up the (subscription id, full reference) pairs that should receive an
    /// OnDataChange for the given gw item handle. Returns empty when nobody subscribes.
    /// </summary>
    public IReadOnlyList<(long SubscriptionId, string FullReference)> ResolveSubscribers(int itemHandle)
    {
        if (!_subscribersByItemHandle.TryGetValue(itemHandle, out var subscriberIds)) return [];

        // Each subscription contributes at most one pair: the first binding it holds for
        // this handle. The per-subscription index makes that an O(1) lookup per subscriber.
        var result = new List<(long, string)>(subscriberIds.Length);
        foreach (var subId in subscriberIds)
        {
            // The subscription may have been removed after the snapshot was taken — skip it.
            if (!_bySubscriptionId.TryGetValue(subId, out var entry)) continue;
            if (entry.ReferenceByHandle.TryGetValue(itemHandle, out var fullRef) && fullRef is not null)
                result.Add((subId, fullRef));
        }

        return result;
    }

    /// <summary>Snapshot every active binding for diagnostic output.</summary>
    public IReadOnlyList<TagBinding> SnapshotAllBindings() =>
        [.. _bySubscriptionId.Values.SelectMany(entry => entry.Bindings)];

    /// <summary>
    /// Removes <paramref name="entry"/>'s id from the reverse map, deleting handles that
    /// end up with no subscribers. Caller must hold <c>_mutationLock</c>.
    /// </summary>
    private void Unlink(SubscriptionEntry entry)
    {
        foreach (var handle in entry.ReferenceByHandle.Keys)
        {
            if (!_subscribersByItemHandle.TryGetValue(handle, out var existing)) continue;

            long[] remaining = [.. existing.Where(id => id != entry.SubscriptionId)];
            if (remaining.Length == 0) _subscribersByItemHandle.TryRemove(handle, out _);
            else _subscribersByItemHandle[handle] = remaining;
        }
    }

    /// <summary>
    /// One registered subscription: the bindings exactly as supplied, plus an index from
    /// each successfully-subscribed item handle (&gt; 0) to the full reference of the first
    /// binding that carries it. Never mutated after construction.
    /// </summary>
    private sealed class SubscriptionEntry
    {
        public SubscriptionEntry(long subscriptionId, IReadOnlyList<TagBinding> bindings)
        {
            SubscriptionId = subscriptionId;
            Bindings = bindings;

            var index = new Dictionary<int, string>(bindings.Count);
            foreach (var binding in bindings)
            {
                if (binding.ItemHandle <= 0) continue; // failed gw subscribe — no events expected
                index.TryAdd(binding.ItemHandle, binding.FullReference); // first binding wins
            }

            ReferenceByHandle = index;
        }

        public long SubscriptionId { get; }

        public IReadOnlyList<TagBinding> Bindings { get; }

        public IReadOnlyDictionary<int, string> ReferenceByHandle { get; }
    }
}

/// <summary>
/// One (full reference, gw item handle) pair returned by SubscribeBulk. Item handle is
/// zero or negative when the gateway rejected this individual tag (bad name, duplicate);
/// the registry keeps the binding so the caller can surface a per-tag failure status.
/// </summary>
/// <param name="FullReference">Full tag reference that was subscribed.</param>
/// <param name="ItemHandle">Gateway item handle; values &lt;= 0 mark a failed subscribe.</param>
internal sealed record TagBinding(string FullReference, int ItemHandle);

View File

@@ -0,0 +1,54 @@
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// PR 6.1 — Tracing decorator around an <see cref="IGalaxyDataWriter"/>. Every write
/// batch is wrapped in one <see cref="System.Diagnostics.Activity"/> named
/// <c>galaxy.write</c>, tagged with the client name, the batch size, the number of
/// secured writes (Tune / Configure / VerifiedWrite) and the number of non-Bad results.
/// </summary>
internal sealed class TracedGalaxyDataWriter(IGalaxyDataWriter inner, string clientName) : IGalaxyDataWriter
{
    /// <summary>
    /// Forwards the batch to the wrapped writer inside a <c>galaxy.write</c> span.
    /// Exceptions from the wrapped writer are recorded on the span and rethrown.
    /// </summary>
    public async Task<IReadOnlyList<WriteResult>> WriteAsync(
        IReadOnlyList<WriteRequest> writes,
        Func<string, SecurityClassification> securityResolver,
        CancellationToken cancellationToken)
    {
        using var span = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.write");
        span?.SetTag("galaxy.client", clientName);
        span?.SetTag("galaxy.tag_count", writes.Count);

        // The secured-write tally costs one resolver call per request, so it is only
        // computed when a listener has asked for full data on this span.
        if (span is not null && span.IsAllDataRequested)
        {
            span.SetTag("galaxy.secured_write_count", CountSecuredWrites(writes, securityResolver));
        }

        try
        {
            var outcomes = await inner
                .WriteAsync(writes, securityResolver, cancellationToken)
                .ConfigureAwait(false);

            // Anything below the Bad severity bit (Good or Uncertain) counts as a success.
            span?.SetTag("galaxy.success_count", outcomes.Count(r => r.StatusCode < 0x80000000u));
            return outcomes;
        }
        catch (Exception ex)
        {
            span.RecordError(ex);
            throw;
        }
    }

    /// <summary>Counts the requests whose tag resolves to Tune, Configure or VerifiedWrite.</summary>
    private static int CountSecuredWrites(
        IReadOnlyList<WriteRequest> writes,
        Func<string, SecurityClassification> securityResolver)
    {
        var secured = 0;
        for (var i = 0; i < writes.Count; i++)
        {
            switch (securityResolver(writes[i].FullReference))
            {
                case SecurityClassification.Tune:
                case SecurityClassification.Configure:
                case SecurityClassification.VerifiedWrite:
                    secured++;
                    break;
            }
        }

        return secured;
    }
}

View File

@@ -0,0 +1,91 @@
using System.Runtime.CompilerServices;
using MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <summary>
/// PR 6.1 — Tracing decorator around an <see cref="IGalaxySubscriber"/>: one
/// <see cref="System.Diagnostics.Activity"/> span per gw subscription RPC. Production
/// wraps <see cref="GatewayGalaxySubscriber"/>; tests plug a fake into the same seam
/// and skip the tracing overhead.
/// </summary>
internal sealed class TracedGalaxySubscriber(IGalaxySubscriber inner, string clientName) : IGalaxySubscriber
{
    /// <summary>
    /// Forwards a bulk subscribe inside a <c>galaxy.subscribe_bulk</c> span tagged with the
    /// request size, buffered interval and number of successful results.
    /// </summary>
    public async Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
        IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
    {
        using var span = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.subscribe_bulk");
        span?.SetTag("galaxy.client", clientName);
        span?.SetTag("galaxy.tag_count", fullReferences.Count);
        span?.SetTag("galaxy.buffered_interval_ms", bufferedUpdateIntervalMs);

        try
        {
            var outcomes = await inner
                .SubscribeBulkAsync(fullReferences, bufferedUpdateIntervalMs, cancellationToken)
                .ConfigureAwait(false);
            span?.SetTag("galaxy.success_count", outcomes.Count(r => r.WasSuccessful));
            return outcomes;
        }
        catch (Exception ex)
        {
            span.RecordError(ex);
            throw;
        }
    }

    /// <summary>Forwards a bulk unsubscribe inside a <c>galaxy.unsubscribe_bulk</c> span.</summary>
    public async Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
    {
        using var span = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.unsubscribe_bulk");
        span?.SetTag("galaxy.client", clientName);
        span?.SetTag("galaxy.tag_count", itemHandles.Count);

        try
        {
            await inner.UnsubscribeBulkAsync(itemHandles, cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            span.RecordError(ex);
            throw;
        }
    }

    /// <summary>
    /// Streaming RPC — a single <c>galaxy.stream_events</c> span covers the whole stream.
    /// Per-event spans would dominate the trace volume at 50k tags / 1Hz; ops gets per-event
    /// visibility through <see cref="EventPump"/>'s metrics in PR 6.2 instead. The span is
    /// tagged with the number of events yielded, both on normal completion and when the
    /// underlying stream throws.
    /// </summary>
    public async IAsyncEnumerable<MxEvent> StreamEventsAsync(
        [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        using var span = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.stream_events");
        span?.SetTag("galaxy.client", clientName);

        var delivered = 0L;
        var source = inner.StreamEventsAsync(cancellationToken).GetAsyncEnumerator(cancellationToken);
        try
        {
            while (await AdvanceAsync().ConfigureAwait(false))
            {
                delivered++;
                yield return source.Current;
            }

            span?.SetTag("galaxy.event_count", delivered);
        }
        finally
        {
            await source.DisposeAsync().ConfigureAwait(false);
        }

        // C# forbids `yield return` inside a try block that has a catch clause, so the
        // error-recording catch lives in this helper rather than around the loop.
        async ValueTask<bool> AdvanceAsync()
        {
            try
            {
                return await source.MoveNextAsync().ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                span.RecordError(ex);
                span?.SetTag("galaxy.event_count", delivered);
                throw;
            }
        }
    }
}

View File

@@ -0,0 +1,33 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Galaxy driver library (root namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy). -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Platforms>AnyCPU;x64</Platforms>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<LangVersion>latest</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<!-- CS1591 = "missing XML comment for publicly visible type or member". Suppressed so that
generating the documentation file together with TreatWarningsAsErrors does not fail
the build on undocumented public members. -->
<NoWarn>$(NoWarn);CS1591</NoWarn>
<RootNamespace>ZB.MOM.WW.OtOpcUa.Driver.Galaxy</RootNamespace>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\..\Core\ZB.MOM.WW.OtOpcUa.Core.Abstractions\ZB.MOM.WW.OtOpcUa.Core.Abstractions.csproj"/>
<ProjectReference Include="..\..\Core\ZB.MOM.WW.OtOpcUa.Core\ZB.MOM.WW.OtOpcUa.Core.csproj"/>
<!-- mxaccessgw .NET client. Path-based ProjectReference because both repos sit
side-by-side on the dev box; long-term we'll consume MxGateway.Client as a
NuGet package. PR 4.W revisits the dependency shape before parity gating. -->
<ProjectReference Include="..\..\..\..\mxaccessgw\clients\dotnet\MxGateway.Client\MxGateway.Client.csproj"/>
</ItemGroup>
<ItemGroup>
<!-- Lets the unit-test assembly reach the driver's internal types. -->
<InternalsVisibleTo Include="ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests"/>
</ItemGroup>
<ItemGroup>
<!-- Silences NuGet audit warnings for these two advisories.
NOTE(review): the reason for each suppression is not recorded here — confirm they are
still needed and note the affected package next to each entry. -->
<NuGetAuditSuppress Include="https://github.com/advisories/GHSA-37gx-xxp4-5rgx"/>
<NuGetAuditSuppress Include="https://github.com/advisories/GHSA-w3x6-4m5h-cxqf"/>
</ItemGroup>
</Project>