diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs
index 49d0d31d..726b88a1 100644
--- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs
+++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs
@@ -2,18 +2,32 @@ using System.Diagnostics;
using System.Net.Sockets;
using System.Text.Json;
using System.Text.Json.Serialization;
+using Grpc.Core;
+using ZB.MOM.WW.MxGateway.Client;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
-/// Cheap TCP-connect probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config.
-/// Parses the <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5001</c> or
-/// <c>host:5001</c>), opens a socket and closes immediately. Surfaces a green tick +
-/// latency on success; red chip + SocketError on failure; "timed out" on the caller's
-/// cancellation. Does NOT exchange any gRPC frames — a richer gRPC ping probe is a
-/// documented follow-up.
+/// Probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config. Parses the
+/// <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5120</c> or <c>host:5120</c>),
+/// does a fast TCP-connect preflight to fail closed ports quickly, then issues a lightweight,
+/// read-only gRPC ping — the <c>GalaxyRepository.TestConnection</c> unary RPC — against the
+/// mxaccessgw to confirm the remote is actually a live gateway speaking gRPC. The channel is
+/// built the same way the driver builds it (cleartext for <c>http://</c>, TLS for
+/// <c>https://</c>), and the result is classified by the gRPC <see cref="StatusCode"/>:
+/// <list type="bullet">
+/// <item><c>OK</c> → reachable, gateway confirmed.</item>
+/// <item><c>Unauthenticated</c> / <c>PermissionDenied</c> → ALSO reachable: an auth
+/// rejection proves a live gateway gRPC server answered. The probe deliberately does
+/// not resolve secrets — it sends whatever API-key string is in the config
+/// (possibly an unresolved <c>env:</c>/<c>file:</c> ref or empty), so authentication is
+/// expected to be rejected and that rejection is the positive signal.</item>
+/// <item><c>Unavailable</c> / transport error / <c>DeadlineExceeded</c> → handshake
+/// failed (the port answered TCP but did not complete a gRPC handshake).</item>
+/// </list>
+/// The ping is strictly read-only and never mutates gateway state.
/// </summary>
public sealed class GalaxyDriverProbe : IDriverProbe
{
@@ -39,13 +53,12 @@ public sealed class GalaxyDriverProbe : IDriverProbe
if (string.IsNullOrWhiteSpace(host) || port <= 0)
return new(false, "Config has no host/port to probe.", null);
+ // --- TCP preflight: fast-fail for closed ports / unreachable hosts ---
var sw = Stopwatch.StartNew();
try
{
using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
await socket.ConnectAsync(host, port, ct);
- sw.Stop();
- return new(true, null, sw.Elapsed);
}
catch (SocketException ex)
{
@@ -59,6 +72,107 @@ public sealed class GalaxyDriverProbe : IDriverProbe
{
return new(false, ex.Message, null);
}
+
+ // --- gRPC ping: confirm the port is a live mxaccessgw speaking gRPC ---
+ // TestConnection is a read-only metadata RPC (it asks the gateway whether it can reach
+ // the Galaxy Repository SQL Server). We don't care about its boolean result — only that
+ // the gateway answered the gRPC call. A successful return ⇒ OK; an auth rejection still
+ // proves a live gateway; a transport failure ⇒ the port isn't a gateway. We never resolve
+ // the API-key secret ref here (the host owns secret resolution): whatever string is in the
+ // config is sent as-is, so an unresolved/empty key surfaces as an auth rejection = reachable.
+ GalaxyRepositoryClient? client = null;
+ try
+ {
+ client = GalaxyRepositoryClient.Create(BuildProbeClientOptions(opts.Gateway, timeout));
+
+ using var deadlineCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+ deadlineCts.CancelAfter(timeout);
+
+ await client.TestConnectionAsync(deadlineCts.Token).ConfigureAwait(false);
+ sw.Stop();
+
+ // No RpcException ⇒ the gateway answered the gRPC call successfully.
+ var (ok, message) = ClassifyRpc(StatusCode.OK, host, port);
+ return new(ok, message, sw.Elapsed);
+ }
+ catch (RpcException ex)
+ {
+ sw.Stop();
+ var (ok, message) = ClassifyRpc(ex.StatusCode, host, port);
+ return new(ok, message, ok ? sw.Elapsed : null);
+ }
+ catch (OperationCanceledException) when (ct.IsCancellationRequested)
+ {
+ // The caller cancelled (their own timeout / shutdown) — surface a timeout message.
+ return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
+ }
+ catch (OperationCanceledException)
+ {
+ // Our own per-call deadline fired before the gateway answered — treat as a failed
+ // handshake (the port answered TCP but didn't complete a gRPC handshake in time).
+ var (_, message) = ClassifyRpc(StatusCode.DeadlineExceeded, host, port);
+ return new(false, message, null);
+ }
+ catch (Exception ex)
+ {
+ // Transport / channel construction failures (e.g. an https endpoint mismatch, a TLS
+ // negotiation failure, or a non-gRPC server) — reachable on TCP but not a gateway.
+ return new(false,
+ $"Reachable at {host}:{port} but gateway gRPC handshake failed: {ex.Message}",
+ null);
+ }
+ finally
+ {
+ if (client is not null)
+ await client.DisposeAsync().ConfigureAwait(false);
+ }
+ }
+
+    /// <summary>
+    /// Translates the gRPC <see cref="StatusCode"/> produced by the ping RPC into a probe
+    /// verdict. Kept as a pure static helper so the mapping can be unit-tested without a
+    /// live gateway.
+    /// </summary>
+    /// <param name="code">Status code the ping call ended with.</param>
+    /// <param name="host">Gateway host; only used to build the failure message.</param>
+    /// <param name="port">Gateway port; only used to build the failure message.</param>
+    /// <returns>
+    /// <c>ok</c> is <see langword="true"/> for <see cref="StatusCode.OK"/> and for the auth
+    /// rejections <see cref="StatusCode.Unauthenticated"/> /
+    /// <see cref="StatusCode.PermissionDenied"/> (an auth rejection proves a live gateway gRPC
+    /// server answered). Every other code yields <see langword="false"/> together with a
+    /// message naming the host, the port and the failing code.
+    /// </returns>
+    internal static (bool ok, string message) ClassifyRpc(StatusCode code, string host, int port)
+    {
+        switch (code)
+        {
+            case StatusCode.OK:
+                return (true, "gateway gRPC OK");
+
+            // An auth rejection still means a gRPC server answered the call.
+            case StatusCode.Unauthenticated:
+            case StatusCode.PermissionDenied:
+                return (true, "gateway reachable & speaking gRPC (auth not checked)");
+
+            // Anything else is reported as a failed handshake, with the code appended.
+            default:
+                return (false, $"Reachable at {host}:{port} but gateway gRPC handshake failed: {code}");
+        }
+    }
+
+    /// <summary>
+    /// Builds the gRPC client options for the probe ping from the gateway config WITHOUT
+    /// resolving the API-key secret ref (the host owns secret resolution; the probe sends the
+    /// raw config string). <c>UseTls</c> and the CA path are copied from the config, both
+    /// timeouts are capped to the probe budget, and a single attempt is configured so an
+    /// unreachable host fails fast.
+    /// </summary>
+    /// <param name="gw">Gateway section of the driver config.</param>
+    /// <param name="timeout">
+    /// Probe budget; a zero or negative value falls back to one second for the client timeouts.
+    /// </param>
+    /// <returns>Client options for a one-shot, budget-bounded ping.</returns>
+    /// <exception cref="UriFormatException">
+    /// The endpoint cannot be parsed as an absolute URI even after scheme normalisation.
+    /// </exception>
+    private static MxGatewayClientOptions BuildProbeClientOptions(GalaxyGatewayOptions gw, TimeSpan timeout)
+    {
+        // The gw client's Validate() rejects an empty API key. The config's secret ref may be an
+        // unresolved env:/file: string (non-empty → passes) or, in degenerate configs, empty. Send
+        // a non-empty placeholder only when the config carries no key string at all, so the gateway
+        // can still answer (and reject) the call. We never resolve env:/file: refs to real secrets.
+        var apiKey = string.IsNullOrWhiteSpace(gw.ApiKeySecretRef) ? "probe" : gw.ApiKeySecretRef;
+
+        // Keep both timeouts inside the probe budget so the ping fails fast on an unreachable host.
+        var budget = timeout > TimeSpan.Zero ? timeout : TimeSpan.FromSeconds(1);
+
+        // The probe documents scheme-less endpoints ("host:5120") as valid, but Uri either rejects
+        // those ("10.0.0.1:5120") or mis-parses the host as the scheme ("host:5120"), which made
+        // the gRPC ping fail even though the TCP preflight succeeded. Prepend the scheme implied
+        // by UseTls when none is present; endpoints that already carry "://" pass through as-is.
+        // NOTE(review): confirm the driver applies the same normalisation to scheme-less endpoints.
+        var rawEndpoint = gw.Endpoint ?? string.Empty;
+        var endpointText = rawEndpoint.Contains("://", StringComparison.Ordinal)
+            ? rawEndpoint
+            : $"{(gw.UseTls ? Uri.UriSchemeHttps : Uri.UriSchemeHttp)}://{rawEndpoint.Trim()}";
+
+        return new MxGatewayClientOptions
+        {
+            Endpoint = new Uri(endpointText, UriKind.Absolute),
+            ApiKey = apiKey,
+            UseTls = gw.UseTls,
+            CaCertificatePath = gw.CaCertificatePath,
+            ConnectTimeout = budget,
+            DefaultCallTimeout = budget,
+            // One shot — the probe must not spin on transient (Unavailable/DeadlineExceeded)
+            // retries; the caller additionally bounds the whole call with its own deadline.
+            Retry = new MxGatewayClientRetryOptions { MaxAttempts = 1 },
+        };
}
private static (string host, int port) ExtractTarget(GalaxyDriverOptions opts)
@@ -66,7 +180,7 @@ public sealed class GalaxyDriverProbe : IDriverProbe
var endpoint = opts.Gateway.Endpoint;
if (string.IsNullOrWhiteSpace(endpoint)) return (string.Empty, 0);
- // Try absolute URI first (e.g. "http://hostname:5001" or "https://hostname:5001").
+ // Try absolute URI first (e.g. "http://hostname:5120" or "https://hostname:5120").
if (Uri.TryCreate(endpoint, UriKind.Absolute, out var uri))
{
var host = uri.Host;
diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/GalaxyDriverProbeTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/GalaxyDriverProbeTests.cs
new file mode 100644
index 00000000..48d7cd2f
--- /dev/null
+++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/GalaxyDriverProbeTests.cs
@@ -0,0 +1,163 @@
+using System.Net;
+using System.Net.Sockets;
+using Grpc.Core;
+using Shouldly;
+using Xunit;
+
+namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests;
+
+/// <summary>
+/// Unit tests for <see cref="GalaxyDriverProbe"/>. The probe does a TCP preflight followed by
+/// a lightweight gRPC ping (the read-only <c>GalaxyRepository.TestConnection</c> RPC) against
+/// the mxaccessgw, classifying the resulting <see cref="StatusCode"/>. An auth rejection
+/// (<see cref="StatusCode.Unauthenticated"/> / <see cref="StatusCode.PermissionDenied"/>)
+/// counts as reachable because it proves a live gRPC server answered. The
+/// <see cref="GalaxyDriverProbe.ClassifyRpc"/> mapping is unit-tested directly here without a
+/// live gateway; the happy OK + auth-rejected paths are verified live against the real
+/// gateway (10.100.0.48:5120).
+/// </summary>
+[Trait("Category", "Unit")]
+public sealed class GalaxyDriverProbeTests
+{
+    private readonly GalaxyDriverProbe _probe = new();
+
+    // Minimal GalaxyDriverOptions JSON document. The endpoint and the UseTls flag are
+    // interpolated into the Gateway section; the remaining sections are fixed stubs.
+    private static string ConfigJson(string endpoint, bool useTls = false) =>
+        $$"""
+        {
+          "Gateway": {
+            "Endpoint": "{{endpoint}}",
+            "ApiKeySecretRef": "env:GALAXY_MXGW_API_KEY",
+            "UseTls": {{(useTls ? "true" : "false")}}
+          },
+          "MxAccess": { "ClientName": "probe-test" },
+          "Repository": {},
+          "Reconnect": {}
+        }
+        """;
+
+    // ── 1. Invalid JSON ──────────────────────────────────────────────────────────
+
+    /// <summary>Invalid JSON returns <c>Ok=false</c> with a message containing "invalid".</summary>
+    [Fact]
+    public async Task InvalidJson_returns_false_with_invalid_message()
+    {
+        var result = await _probe.ProbeAsync(
+            "not-json",
+            TimeSpan.FromSeconds(2),
+            TestContext.Current.CancellationToken);
+
+        result.Ok.ShouldBeFalse();
+        result.Message.ShouldNotBeNull();
+        result.Message!.ShouldContain("invalid", Case.Insensitive);
+        result.Latency.ShouldBeNull();
+    }
+
+    // ── 2. Config with no endpoint ───────────────────────────────────────────────
+
+    /// <summary>
+    /// Config JSON whose <c>Gateway.Endpoint</c> is empty returns <c>Ok=false</c> with a message
+    /// indicating no host/port was found.
+    /// </summary>
+    [Fact]
+    public async Task NoEndpoint_returns_false_with_no_host_port_message()
+    {
+        var result = await _probe.ProbeAsync(
+            ConfigJson(endpoint: ""),
+            TimeSpan.FromSeconds(2),
+            TestContext.Current.CancellationToken);
+
+        result.Ok.ShouldBeFalse();
+        result.Message.ShouldNotBeNull();
+        result.Message!.ShouldContain("no host", Case.Insensitive);
+        result.Latency.ShouldBeNull();
+    }
+
+    // ── 3. Unreachable closed port ────────────────────────────────────────────────
+
+    /// <summary>
+    /// Pointing at a TCP port that is not open returns <c>Ok=false</c>. Depending on how far the
+    /// probe gets, the message is the TCP "Connect failed", a gRPC "handshake failed", or a
+    /// "timed out" message — all of which are acceptable failure signals for an unreachable
+    /// gateway.
+    /// </summary>
+    [Fact]
+    public async Task ClosedPort_returns_false()
+    {
+        // Bind a port, capture it, then release it so nothing is listening when we probe.
+        var listener = new TcpListener(IPAddress.Loopback, 0);
+        listener.Start();
+        var port = ((IPEndPoint)listener.LocalEndpoint).Port;
+        listener.Stop();
+
+        var endpoint = $"http://127.0.0.1:{port}";
+
+        using var cts = CancellationTokenSource.CreateLinkedTokenSource(TestContext.Current.CancellationToken);
+        cts.CancelAfter(TimeSpan.FromSeconds(5)); // safety net on top of the runner's own cancellation
+        var result = await _probe.ProbeAsync(ConfigJson(endpoint), TimeSpan.FromSeconds(3), cts.Token);
+
+        result.Ok.ShouldBeFalse();
+        result.Message.ShouldNotBeNull();
+        var message = result.Message!;
+        string[] acceptedSignals = { "Connect failed", "handshake failed", "timed out" };
+        Array.Exists(acceptedSignals,
+                signal => message.Contains(signal, StringComparison.OrdinalIgnoreCase))
+            .ShouldBeTrue($"unexpected message: '{message}'");
+    }
+
+    // ── 4. ClassifyRpc mapping (pure, no live gateway) ───────────────────────────
+
+    /// <summary>A gRPC <c>OK</c> status maps to a reachable, gateway-confirmed result.</summary>
+    [Fact]
+    public void ClassifyRpc_Ok_is_reachable()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.OK, "host", 5120);
+        ok.ShouldBeTrue();
+        message.ShouldBe("gateway gRPC OK");
+    }
+
+    /// <summary>
+    /// A gRPC <c>Unauthenticated</c> status counts as reachable — an auth rejection proves a
+    /// live gateway gRPC server answered.
+    /// </summary>
+    [Fact]
+    public void ClassifyRpc_Unauthenticated_is_reachable_auth_not_checked()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.Unauthenticated, "host", 5120);
+        ok.ShouldBeTrue();
+        message.ShouldContain("auth not checked", Case.Insensitive);
+    }
+
+    /// <summary>A gRPC <c>PermissionDenied</c> status also counts as reachable.</summary>
+    [Fact]
+    public void ClassifyRpc_PermissionDenied_is_reachable_auth_not_checked()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.PermissionDenied, "host", 5120);
+        ok.ShouldBeTrue();
+        message.ShouldContain("auth not checked", Case.Insensitive);
+    }
+
+    /// <summary>
+    /// A gRPC <c>Unavailable</c> status is a transport failure — <c>Ok=false</c> with a message that
+    /// names the host, port and the failing status code.
+    /// </summary>
+    [Fact]
+    public void ClassifyRpc_Unavailable_is_not_reachable()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.Unavailable, "host", 5120);
+        ok.ShouldBeFalse();
+        message.ShouldContain("Unavailable");
+        message.ShouldContain("host:5120");
+    }
+
+    /// <summary>A gRPC <c>DeadlineExceeded</c> status is likewise a handshake failure.</summary>
+    [Fact]
+    public void ClassifyRpc_DeadlineExceeded_is_not_reachable()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.DeadlineExceeded, "host", 5120);
+        ok.ShouldBeFalse();
+        message.ShouldContain("DeadlineExceeded");
+        message.ShouldContain("host:5120");
+    }
+}