From 2d688c2a6d135cde6032a5d87920514fc12dccfd Mon Sep 17 00:00:00 2001
From: Joseph Doherty
Date: Tue, 16 Jun 2026 06:48:40 -0400
Subject: [PATCH] feat(probe): Galaxy Test-Connect does a gRPC ping (auth-rejection counts as reachable)

---
 .../GalaxyDriverProbe.cs      | 155 ++++++++++++++-
 .../GalaxyDriverProbeTests.cs | 180 ++++++++++++++++++
 2 files changed, 326 insertions(+), 9 deletions(-)
 create mode 100644 tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/GalaxyDriverProbeTests.cs

diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs
index 49d0d31d..726b88a1 100644
--- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs
+++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriverProbe.cs
@@ -2,18 +2,32 @@ using System.Diagnostics;
 using System.Net.Sockets;
 using System.Text.Json;
 using System.Text.Json.Serialization;
+using Grpc.Core;
+using ZB.MOM.WW.MxGateway.Client;
 using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
 using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
 
 namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
 
 /// <summary>
-/// Cheap TCP-connect probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config.
-/// Parses the Gateway.Endpoint gRPC endpoint (e.g. http://host:5001 or
-/// host:5001), opens a socket and closes immediately. Surfaces a green tick +
-/// latency on success; red chip + SocketError on failure; "timed out" on the caller's
-/// cancellation. Does NOT exchange any gRPC frames — a richer gRPC ping probe is a
-/// documented follow-up.
+/// Probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config. Parses the
+/// Gateway.Endpoint gRPC endpoint (e.g. http://host:5120 or host:5120),
+/// does a fast TCP-connect preflight to fail closed ports quickly, then issues a lightweight,
+/// read-only gRPC ping — the GalaxyRepository.TestConnection unary RPC — against the
+/// mxaccessgw to confirm the remote is actually a live gateway speaking gRPC. The channel is
+/// built the same way the driver builds it (cleartext for http://, TLS for
+/// https://), and the result is classified by the gRPC <see cref="StatusCode"/>:
+/// <list type="bullet">
+/// <item>OK → reachable, gateway confirmed.</item>
+/// <item>Unauthenticated / PermissionDenied → ALSO reachable: an auth
+/// rejection proves a live gateway gRPC server answered. The probe deliberately does
+/// not resolve secrets — it sends whatever API-key string is in the config
+/// (possibly an unresolved env:/file: ref or empty), so authentication is
+/// expected to be rejected and that rejection is the positive signal.</item>
+/// <item>Unavailable / transport error / DeadlineExceeded → handshake
+/// failed (the port answered TCP but did not complete a gRPC handshake).</item>
+/// </list>
+/// The ping is strictly read-only and never mutates gateway state.
 /// </summary>
 public sealed class GalaxyDriverProbe : IDriverProbe
 {
@@ -39,13 +53,12 @@ public sealed class GalaxyDriverProbe : IDriverProbe
         if (string.IsNullOrWhiteSpace(host) || port <= 0)
             return new(false, "Config has no host/port to probe.", null);
 
+        // --- TCP preflight: fast-fail for closed ports / unreachable hosts ---
         var sw = Stopwatch.StartNew();
         try
         {
             using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
             await socket.ConnectAsync(host, port, ct);
-            sw.Stop();
-            return new(true, null, sw.Elapsed);
         }
         catch (SocketException ex)
         {
@@ -59,6 +72,130 @@ public sealed class GalaxyDriverProbe : IDriverProbe
         {
             return new(false, ex.Message, null);
         }
+
+        // --- gRPC ping: confirm the port is a live mxaccessgw speaking gRPC ---
+        // TestConnection is a read-only metadata RPC (it asks the gateway whether it can reach
+        // the Galaxy Repository SQL Server). We don't care about its boolean result — only that
+        // the gateway answered the gRPC call. A successful return ⇒ OK; an auth rejection still
+        // proves a live gateway; a transport failure ⇒ the port isn't a gateway. We never resolve
+        // the API-key secret ref here (the host owns secret resolution): whatever string is in the
+        // config is sent as-is, so an unresolved/empty key surfaces as an auth rejection = reachable.
+        GalaxyRepositoryClient? client = null;
+        try
+        {
+            client = GalaxyRepositoryClient.Create(BuildProbeClientOptions(opts.Gateway, timeout));
+
+            using var deadlineCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+            deadlineCts.CancelAfter(timeout);
+
+            await client.TestConnectionAsync(deadlineCts.Token).ConfigureAwait(false);
+            sw.Stop();
+
+            // No RpcException ⇒ the gateway answered the gRPC call successfully.
+            var (ok, message) = ClassifyRpc(StatusCode.OK, host, port);
+            return new(ok, message, sw.Elapsed);
+        }
+        catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled && ct.IsCancellationRequested)
+        {
+            // Grpc.Net.Client reports a cancelled token as RpcException(Cancelled) unless configured
+            // otherwise; catch the caller's own cancellation here so it reads as a timeout.
+            return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
+        }
+        catch (RpcException ex)
+        {
+            sw.Stop();
+            var (ok, message) = ClassifyRpc(ex.StatusCode, host, port);
+            return new(ok, message, ok ? sw.Elapsed : null);
+        }
+        catch (OperationCanceledException) when (ct.IsCancellationRequested)
+        {
+            // The caller cancelled (their own timeout / shutdown) — surface a timeout message.
+            return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
+        }
+        catch (OperationCanceledException)
+        {
+            // Our own per-call deadline fired before the gateway answered — treat as a failed
+            // handshake (the port answered TCP but didn't complete a gRPC handshake in time).
+            var (_, message) = ClassifyRpc(StatusCode.DeadlineExceeded, host, port);
+            return new(false, message, null);
+        }
+        catch (Exception ex)
+        {
+            // Transport / channel construction failures (e.g. an https endpoint mismatch, a TLS
+            // negotiation failure, or a non-gRPC server) — reachable on TCP but not a gateway.
+            return new(false,
+                $"Reachable at {host}:{port} but gateway gRPC handshake failed: {ex.Message}",
+                null);
+        }
+        finally
+        {
+            if (client is not null)
+                await client.DisposeAsync().ConfigureAwait(false);
+        }
+    }
+
+    /// <summary>
+    /// Maps a gRPC <see cref="StatusCode"/> from the ping RPC to a probe outcome. Factored out
+    /// as a pure helper so the classification is unit-testable without a live gateway. An
+    /// auth rejection (<see cref="StatusCode.Unauthenticated"/> /
+    /// <see cref="StatusCode.PermissionDenied"/>) counts as reachable because it proves a live
+    /// gateway gRPC server answered.
+    /// </summary>
+    internal static (bool ok, string message) ClassifyRpc(StatusCode code, string host, int port) => code switch
+    {
+        StatusCode.OK => (true, "gateway gRPC OK"),
+        StatusCode.Unauthenticated or StatusCode.PermissionDenied =>
+            (true, "gateway reachable & speaking gRPC (auth not checked)"),
+        _ => (false, $"Reachable at {host}:{port} but gateway gRPC handshake failed: {code}"),
+    };
+
+    /// <summary>
+    /// Turns the configured gateway endpoint into an absolute URI for the probe channel. A value
+    /// that already carries a scheme (http:// or https://) is used as-is; a bare host:port — the
+    /// second form the class summary advertises — gets the scheme implied by
+    /// <paramref name="useTls"/>. Passing a bare host:5120 straight to <see cref="Uri"/> would
+    /// parse "host" as the scheme (or throw for an IP literal), so a healthy gateway would be
+    /// reported as a failed handshake. Factored out as a pure helper so it is unit-testable.
+    /// </summary>
+    internal static Uri NormalizeEndpoint(string endpoint, bool useTls)
+    {
+        var trimmed = endpoint.Trim();
+        if (!trimmed.Contains("://", StringComparison.Ordinal))
+            trimmed = $"{(useTls ? Uri.UriSchemeHttps : Uri.UriSchemeHttp)}://{trimmed}";
+
+        return new Uri(trimmed, UriKind.Absolute);
+    }
+
+    /// <summary>
+    /// Builds the gRPC client options for the probe ping from the gateway config WITHOUT
+    /// resolving the API-key secret ref (the host owns secret resolution; the probe sends the
+    /// raw config string). Mirrors the driver's channel build — UseTls selects TLS vs
+    /// cleartext, and the CA path is honoured for TLS — but caps the call/connect timeouts to
+    /// the probe budget and uses a single attempt so an unreachable host fails fast.
+    /// </summary>
+    private static MxGatewayClientOptions BuildProbeClientOptions(GalaxyGatewayOptions gw, TimeSpan timeout)
+    {
+        // The gw client's Validate() rejects an empty API key. The config's secret ref may be an
+        // unresolved env:/file: string (non-empty → passes) or, in degenerate configs, empty. Send
+        // a non-empty placeholder only when the config carries no key string at all, so the gateway
+        // can still answer (and reject) the call. We never resolve env:/file: refs to real secrets.
+        var apiKey = string.IsNullOrWhiteSpace(gw.ApiKeySecretRef) ? "probe" : gw.ApiKeySecretRef;
+
+        // Keep both timeouts inside the probe budget so the ping fails fast on an unreachable host.
+        var budget = timeout > TimeSpan.Zero ? timeout : TimeSpan.FromSeconds(1);
+
+        return new MxGatewayClientOptions
+        {
+            Endpoint = NormalizeEndpoint(gw.Endpoint, gw.UseTls),
+            ApiKey = apiKey,
+            UseTls = gw.UseTls,
+            CaCertificatePath = gw.CaCertificatePath,
+            ConnectTimeout = budget,
+            DefaultCallTimeout = budget,
+            // One shot — the probe must not spin on transient (Unavailable/DeadlineExceeded)
+            // retries; the linked deadline above bounds the whole call regardless.
+            Retry = new MxGatewayClientRetryOptions { MaxAttempts = 1 },
+        };
     }
 
     private static (string host, int port) ExtractTarget(GalaxyDriverOptions opts)
@@ -66,7 +203,7 @@ public sealed class GalaxyDriverProbe : IDriverProbe
         var endpoint = opts.Gateway.Endpoint;
         if (string.IsNullOrWhiteSpace(endpoint)) return (string.Empty, 0);
 
-        // Try absolute URI first (e.g. "http://hostname:5001" or "https://hostname:5001").
+        // Try absolute URI first (e.g. "http://hostname:5120" or "https://hostname:5120").
         if (Uri.TryCreate(endpoint, UriKind.Absolute, out var uri))
         {
             var host = uri.Host;
diff --git a/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/GalaxyDriverProbeTests.cs b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/GalaxyDriverProbeTests.cs
new file mode 100644
index 00000000..48d7cd2f
--- /dev/null
+++ b/tests/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests/GalaxyDriverProbeTests.cs
@@ -0,0 +1,180 @@
+using System.Net;
+using System.Net.Sockets;
+using Grpc.Core;
+using Shouldly;
+using Xunit;
+
+namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests;
+
+/// <summary>
+/// Unit tests for <see cref="GalaxyDriverProbe"/>. The probe does a TCP preflight followed by
+/// a lightweight gRPC ping (the read-only GalaxyRepository.TestConnection RPC) against
+/// the mxaccessgw, classifying the resulting <see cref="StatusCode"/>. An auth rejection
+/// (<see cref="StatusCode.Unauthenticated"/> / <see cref="StatusCode.PermissionDenied"/>)
+/// counts as reachable because it proves a live gRPC server answered. The
+/// <see cref="GalaxyDriverProbe.ClassifyRpc"/> mapping is unit-tested directly here without a
+/// live gateway; the happy OK + auth-rejected paths are verified live against the real
+/// gateway (10.100.0.48:5120).
+/// </summary>
+[Trait("Category", "Unit")]
+public sealed class GalaxyDriverProbeTests
+{
+    private readonly GalaxyDriverProbe _probe = new();
+
+    // A complete-enough GalaxyDriverOptions JSON skeleton; the {{endpoint}} hole is the
+    // gateway endpoint. All other nested records carry their required positional members.
+    private static string ConfigJson(string endpoint, bool useTls = false) =>
+        $$"""
+        {
+          "Gateway": {
+            "Endpoint": "{{endpoint}}",
+            "ApiKeySecretRef": "env:GALAXY_MXGW_API_KEY",
+            "UseTls": {{(useTls ? "true" : "false")}}
+          },
+          "MxAccess": { "ClientName": "probe-test" },
+          "Repository": {},
+          "Reconnect": {}
+        }
+        """;
+
+    // ── 1. Invalid JSON ──────────────────────────────────────────────────────────
+
+    /// <summary>Invalid JSON returns Ok=false with a message containing "invalid".</summary>
+    [Fact]
+    public async Task InvalidJson_returns_false_with_invalid_message()
+    {
+        var result = await _probe.ProbeAsync(
+            "not-json",
+            TimeSpan.FromSeconds(2),
+            TestContext.Current.CancellationToken);
+
+        result.Ok.ShouldBeFalse();
+        result.Message.ShouldNotBeNull();
+        result.Message!.ShouldContain("invalid", Case.Insensitive);
+        result.Latency.ShouldBeNull();
+    }
+
+    // ── 2. Config with no endpoint ───────────────────────────────────────────────
+
+    /// <summary>
+    /// Config JSON whose Gateway.Endpoint is empty returns Ok=false with a message
+    /// indicating no host/port was found.
+    /// </summary>
+    [Fact]
+    public async Task NoEndpoint_returns_false_with_no_host_port_message()
+    {
+        var result = await _probe.ProbeAsync(
+            ConfigJson(endpoint: ""),
+            TimeSpan.FromSeconds(2),
+            TestContext.Current.CancellationToken);
+
+        result.Ok.ShouldBeFalse();
+        result.Message.ShouldNotBeNull();
+        result.Message!.ShouldContain("no host", Case.Insensitive);
+        result.Latency.ShouldBeNull();
+    }
+
+    // ── 3. Unreachable closed port ────────────────────────────────────────────────
+
+    /// <summary>
+    /// Pointing at a TCP port that is not open returns Ok=false. Depending on how far the
+    /// probe gets, the message is the TCP "Connect failed", a gRPC "handshake failed", or a
+    /// "timed out" message — all of which are acceptable failure signals for an unreachable
+    /// gateway.
+    /// </summary>
+    [Fact]
+    public async Task ClosedPort_returns_false()
+    {
+        // Bind a port, capture it, then release it so nothing is listening when we probe.
+        var listener = new TcpListener(IPAddress.Loopback, 0);
+        listener.Start();
+        var port = ((IPEndPoint)listener.LocalEndpoint).Port;
+        listener.Stop();
+
+        var endpoint = $"http://127.0.0.1:{port}";
+
+        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
+        var result = await _probe.ProbeAsync(
+            ConfigJson(endpoint), TimeSpan.FromSeconds(3), cts.Token);
+
+        result.Ok.ShouldBeFalse();
+        result.Message.ShouldNotBeNull();
+        result.Message!.ShouldSatisfyAllConditions(
+            () => (result.Message.Contains("Connect failed", StringComparison.OrdinalIgnoreCase)
+                   || result.Message.Contains("handshake failed", StringComparison.OrdinalIgnoreCase)
+                   || result.Message.Contains("timed out", StringComparison.OrdinalIgnoreCase))
+                .ShouldBeTrue($"unexpected message: '{result.Message}'"));
+    }
+
+    // ── 4. ClassifyRpc mapping (pure, no live gateway) ───────────────────────────
+
+    /// <summary>A gRPC OK status maps to a reachable, gateway-confirmed result.</summary>
+    [Fact]
+    public void ClassifyRpc_Ok_is_reachable()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.OK, "host", 5120);
+        ok.ShouldBeTrue();
+        message.ShouldBe("gateway gRPC OK");
+    }
+
+    /// <summary>
+    /// A gRPC Unauthenticated status counts as reachable — an auth rejection proves a
+    /// live gateway gRPC server answered.
+    /// </summary>
+    [Fact]
+    public void ClassifyRpc_Unauthenticated_is_reachable_auth_not_checked()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.Unauthenticated, "host", 5120);
+        ok.ShouldBeTrue();
+        message.ShouldContain("auth not checked", Case.Insensitive);
+    }
+
+    /// <summary>A gRPC PermissionDenied status also counts as reachable.</summary>
+    [Fact]
+    public void ClassifyRpc_PermissionDenied_is_reachable_auth_not_checked()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.PermissionDenied, "host", 5120);
+        ok.ShouldBeTrue();
+        message.ShouldContain("auth not checked", Case.Insensitive);
+    }
+
+    /// <summary>
+    /// A gRPC Unavailable status is a transport failure — Ok=false with a message that
+    /// names the host, port and the failing status code.
+    /// </summary>
+    [Fact]
+    public void ClassifyRpc_Unavailable_is_not_reachable()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.Unavailable, "host", 5120);
+        ok.ShouldBeFalse();
+        message.ShouldContain("Unavailable");
+        message.ShouldContain("host:5120");
+    }
+
+    /// <summary>A gRPC DeadlineExceeded status is likewise a handshake failure.</summary>
+    [Fact]
+    public void ClassifyRpc_DeadlineExceeded_is_not_reachable()
+    {
+        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.DeadlineExceeded, "host", 5120);
+        ok.ShouldBeFalse();
+        message.ShouldContain("DeadlineExceeded");
+        message.ShouldContain("host:5120");
+    }
+
+    // ── 5. NormalizeEndpoint (pure, no live gateway) ─────────────────────────────
+
+    /// <summary>
+    /// A bare host:port endpoint gets the scheme implied by UseTls, while an endpoint that
+    /// already carries a scheme is passed through untouched.
+    /// </summary>
+    [Theory]
+    [InlineData("http://host:5120", false, "http://host:5120/")]
+    [InlineData("https://host:5120", true, "https://host:5120/")]
+    [InlineData("host:5120", false, "http://host:5120/")]
+    [InlineData("host:5120", true, "https://host:5120/")]
+    [InlineData("10.0.0.5:5120", false, "http://10.0.0.5:5120/")]
+    public void NormalizeEndpoint_returns_absolute_uri(string endpoint, bool useTls, string expected)
+    {
+        GalaxyDriverProbe.NormalizeEndpoint(endpoint, useTls).AbsoluteUri.ShouldBe(expected);
+    }
+}