feat(probe): Galaxy Test-Connect does a gRPC ping (auth-rejection counts as reachable)

This commit is contained in:
Joseph Doherty
2026-06-16 06:48:40 -04:00
parent b663ae6eff
commit 2d688c2a6d
2 changed files with 286 additions and 9 deletions
@@ -2,18 +2,32 @@ using System.Diagnostics;
using System.Net.Sockets;
using System.Text.Json;
using System.Text.Json.Serialization;
using Grpc.Core;
using ZB.MOM.WW.MxGateway.Client;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
/// Cheap TCP-connect probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config.
/// Parses the <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5001</c> or
/// <c>host:5001</c>), opens a socket and closes immediately. Surfaces a green tick +
/// latency on success; red chip + SocketError on failure; "timed out" on the caller's
/// cancellation. Does NOT exchange any gRPC frames — a richer gRPC ping probe is a
/// documented follow-up.
/// Probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config. Parses the
/// <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5120</c> or <c>host:5120</c>),
/// does a fast TCP-connect preflight to fail closed ports quickly, then issues a lightweight,
/// read-only gRPC ping — the <c>GalaxyRepository.TestConnection</c> unary RPC — against the
/// mxaccessgw to confirm the remote is actually a live gateway speaking gRPC. The channel is
/// built the same way the driver builds it (cleartext for <c>http://</c>, TLS for
/// <c>https://</c>), and the result is classified by the gRPC <see cref="StatusCode"/>:
/// <list type="bullet">
/// <item><c>OK</c> → reachable, gateway confirmed.</item>
/// <item><c>Unauthenticated</c> / <c>PermissionDenied</c> → ALSO reachable: an auth
/// rejection proves a live gateway gRPC server answered. The probe deliberately does
/// <b>not</b> resolve secrets — it sends whatever API-key string is in the config
/// (possibly an unresolved <c>env:</c>/<c>file:</c> ref or empty), so authentication is
/// expected to be rejected and that rejection is the positive signal.</item>
/// <item><c>Unavailable</c> / transport error / <c>DeadlineExceeded</c> → handshake
/// failed (the port answered TCP but did not complete a gRPC handshake).</item>
/// </list>
/// The ping is strictly read-only and never mutates gateway state.
/// </summary>
public sealed class GalaxyDriverProbe : IDriverProbe
{
@@ -39,13 +53,12 @@ public sealed class GalaxyDriverProbe : IDriverProbe
if (string.IsNullOrWhiteSpace(host) || port <= 0)
return new(false, "Config has no host/port to probe.", null);
// --- TCP preflight: fast-fail for closed ports / unreachable hosts ---
var sw = Stopwatch.StartNew();
try
{
using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
await socket.ConnectAsync(host, port, ct);
sw.Stop();
return new(true, null, sw.Elapsed);
}
catch (SocketException ex)
{
@@ -59,6 +72,107 @@ public sealed class GalaxyDriverProbe : IDriverProbe
{
return new(false, ex.Message, null);
}
// --- gRPC ping: confirm the port is a live mxaccessgw speaking gRPC ---
// TestConnection is a read-only metadata RPC (it asks the gateway whether it can reach
// the Galaxy Repository SQL Server). We don't care about its boolean result — only that
// the gateway answered the gRPC call. A successful return ⇒ OK; an auth rejection still
// proves a live gateway; a transport failure ⇒ the port isn't a gateway. We never resolve
// the API-key secret ref here (the host owns secret resolution): whatever string is in the
// config is sent as-is, so an unresolved/empty key surfaces as an auth rejection = reachable.
GalaxyRepositoryClient? client = null;
try
{
client = GalaxyRepositoryClient.Create(BuildProbeClientOptions(opts.Gateway, timeout));
using var deadlineCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
deadlineCts.CancelAfter(timeout);
await client.TestConnectionAsync(deadlineCts.Token).ConfigureAwait(false);
sw.Stop();
// No RpcException ⇒ the gateway answered the gRPC call successfully.
var (ok, message) = ClassifyRpc(StatusCode.OK, host, port);
return new(ok, message, sw.Elapsed);
}
catch (RpcException ex)
{
sw.Stop();
var (ok, message) = ClassifyRpc(ex.StatusCode, host, port);
return new(ok, message, ok ? sw.Elapsed : null);
}
catch (OperationCanceledException) when (ct.IsCancellationRequested)
{
// The caller cancelled (their own timeout / shutdown) — surface a timeout message.
return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
}
catch (OperationCanceledException)
{
// Our own per-call deadline fired before the gateway answered — treat as a failed
// handshake (the port answered TCP but didn't complete a gRPC handshake in time).
var (_, message) = ClassifyRpc(StatusCode.DeadlineExceeded, host, port);
return new(false, message, null);
}
catch (Exception ex)
{
// Transport / channel construction failures (e.g. an https endpoint mismatch, a TLS
// negotiation failure, or a non-gRPC server) — reachable on TCP but not a gateway.
return new(false,
$"Reachable at {host}:{port} but gateway gRPC handshake failed: {ex.Message}",
null);
}
finally
{
if (client is not null)
await client.DisposeAsync().ConfigureAwait(false);
}
}
/// <summary>
/// Pure translation of the ping RPC's gRPC <see cref="StatusCode"/> into a probe verdict.
/// It touches no network or shared state, so it can be unit-tested without a live gateway.
/// </summary>
/// <param name="code">Status code the ping RPC finished with.</param>
/// <param name="host">Target host; only used to build the failure message.</param>
/// <param name="port">Target port; only used to build the failure message.</param>
/// <returns>
/// <c>ok = true</c> for <see cref="StatusCode.OK"/> and for the two auth rejections
/// (<see cref="StatusCode.Unauthenticated"/>, <see cref="StatusCode.PermissionDenied"/>) —
/// a rejection still shows that a gRPC server answered the call. Every other code yields
/// <c>ok = false</c> with a message naming the host, the port and the status code.
/// </returns>
internal static (bool ok, string message) ClassifyRpc(StatusCode code, string host, int port)
{
    // Clean success: the call completed.
    if (code == StatusCode.OK)
    {
        return (true, "gateway gRPC OK");
    }

    // Credentials were rejected, but something on the other end spoke gRPC to do so.
    if (code is StatusCode.Unauthenticated or StatusCode.PermissionDenied)
    {
        return (true, "gateway reachable & speaking gRPC (auth not checked)");
    }

    // Anything else is reported as a failed handshake.
    return (false, $"Reachable at {host}:{port} but gateway gRPC handshake failed: {code}");
}
/// <summary>
/// Assembles the <see cref="MxGatewayClientOptions"/> for the probe's one-shot gRPC ping
/// straight from the gateway config. The API-key secret ref is passed through as the raw
/// config string and is never resolved here. TLS selection (<c>UseTls</c>) and the CA
/// certificate path are copied from the config unchanged; the connect and call timeouts are
/// both set to the probe budget and retries are limited to a single attempt.
/// </summary>
/// <param name="gw">Gateway section of the driver config.</param>
/// <param name="timeout">
/// Probe budget. A zero or negative value is replaced by one second.
/// </param>
/// <returns>Client options for a single, time-boxed ping attempt.</returns>
private static MxGatewayClientOptions BuildProbeClientOptions(GalaxyGatewayOptions gw, TimeSpan timeout)
{
    // Per the original note, the gateway client's Validate() refuses an empty API key. A
    // config that carries an (unresolved) env:/file: ref is non-empty and is sent verbatim;
    // only a blank value is swapped for a fixed placeholder so the call can still be issued
    // and rejected by the gateway. Real secrets are never looked up.
    var apiKey = gw.ApiKeySecretRef;
    if (string.IsNullOrWhiteSpace(apiKey))
    {
        apiKey = "probe";
    }

    // Both timeouts share one budget so the ping cannot outlast the probe.
    var budget = timeout;
    if (budget <= TimeSpan.Zero)
    {
        budget = TimeSpan.FromSeconds(1);
    }

    // Single attempt: the probe must not loop on transient failures.
    var singleShot = new MxGatewayClientRetryOptions { MaxAttempts = 1 };

    return new MxGatewayClientOptions
    {
        Endpoint = new Uri(gw.Endpoint, UriKind.Absolute),
        ApiKey = apiKey,
        UseTls = gw.UseTls,
        CaCertificatePath = gw.CaCertificatePath,
        ConnectTimeout = budget,
        DefaultCallTimeout = budget,
        Retry = singleShot,
    };
}
private static (string host, int port) ExtractTarget(GalaxyDriverOptions opts)
@@ -66,7 +180,7 @@ public sealed class GalaxyDriverProbe : IDriverProbe
var endpoint = opts.Gateway.Endpoint;
if (string.IsNullOrWhiteSpace(endpoint)) return (string.Empty, 0);
// Try absolute URI first (e.g. "http://hostname:5001" or "https://hostname:5001").
// Try absolute URI first (e.g. "http://hostname:5120" or "https://hostname:5120").
if (Uri.TryCreate(endpoint, UriKind.Absolute, out var uri))
{
var host = uri.Host;
@@ -0,0 +1,163 @@
using System.Net;
using System.Net.Sockets;
using Grpc.Core;
using Shouldly;
using Xunit;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests;
/// <summary>
/// Unit tests for <see cref="GalaxyDriverProbe"/>. The probe does a TCP preflight followed by
/// a lightweight gRPC ping (the read-only <c>GalaxyRepository.TestConnection</c> RPC) against
/// the mxaccessgw, classifying the resulting <see cref="StatusCode"/>. An auth rejection
/// (<see cref="StatusCode.Unauthenticated"/> / <see cref="StatusCode.PermissionDenied"/>)
/// counts as reachable because it proves a live gRPC server answered. The
/// <see cref="GalaxyDriverProbe.ClassifyRpc"/> mapping is unit-tested directly here without a
/// live gateway; the happy <c>OK</c> + auth-rejected paths are verified live against the real
/// gateway (10.100.0.48:5120).
/// </summary>
[Trait("Category", "Unit")]
public sealed class GalaxyDriverProbeTests
{
    private readonly GalaxyDriverProbe _probe = new();

    // A complete-enough GalaxyDriverOptions JSON skeleton. The gateway endpoint and the
    // UseTls flag are interpolated into the raw string; every other section is fixed.
    private static string ConfigJson(string endpoint, bool useTls = false) =>
        $$"""
        {
          "Gateway": {
            "Endpoint": "{{endpoint}}",
            "ApiKeySecretRef": "env:GALAXY_MXGW_API_KEY",
            "UseTls": {{(useTls ? "true" : "false")}}
          },
          "MxAccess": { "ClientName": "probe-test" },
          "Repository": {},
          "Reconnect": {}
        }
        """;

    // ── 1. Invalid JSON ──────────────────────────────────────────────────────────

    /// <summary>Invalid JSON returns Ok=false with a message containing "invalid".</summary>
    [Fact]
    public async Task InvalidJson_returns_false_with_invalid_message()
    {
        var result = await _probe.ProbeAsync(
            "not-json",
            TimeSpan.FromSeconds(2),
            TestContext.Current.CancellationToken);

        result.Ok.ShouldBeFalse();
        result.Message.ShouldNotBeNull();
        result.Message!.ShouldContain("invalid", Case.Insensitive);
        result.Latency.ShouldBeNull();
    }

    // ── 2. Config with no endpoint ───────────────────────────────────────────────

    /// <summary>
    /// Config JSON whose Gateway.Endpoint is empty returns Ok=false with a message
    /// indicating no host/port was found.
    /// </summary>
    [Fact]
    public async Task NoEndpoint_returns_false_with_no_host_port_message()
    {
        var result = await _probe.ProbeAsync(
            ConfigJson(endpoint: ""),
            TimeSpan.FromSeconds(2),
            TestContext.Current.CancellationToken);

        result.Ok.ShouldBeFalse();
        result.Message.ShouldNotBeNull();
        result.Message!.ShouldContain("no host", Case.Insensitive);
        result.Latency.ShouldBeNull();
    }

    // ── 3. Unreachable closed port ────────────────────────────────────────────────

    /// <summary>
    /// Pointing at a TCP port that is not open returns Ok=false. Depending on how far the
    /// probe gets, the message is the TCP "Connect failed", a gRPC "handshake failed", or a
    /// "timed out" message — all of which are acceptable failure signals for an unreachable
    /// gateway.
    /// </summary>
    [Fact]
    public async Task ClosedPort_returns_false()
    {
        // Bind an ephemeral loopback port, capture its number, then release it so nothing
        // is listening when we probe. Stop() runs in a finally so the socket is released
        // even if reading the endpoint throws.
        int port;
        var listener = new TcpListener(IPAddress.Loopback, 0);
        try
        {
            listener.Start();
            port = ((IPEndPoint)listener.LocalEndpoint).Port;
        }
        finally
        {
            listener.Stop();
        }

        var endpoint = $"http://127.0.0.1:{port}";

        // Link the 5 s safety cap to the test-run token so an aborted run cancels the probe
        // immediately, as it does for the other async tests in this class.
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(
            TestContext.Current.CancellationToken);
        cts.CancelAfter(TimeSpan.FromSeconds(5));

        var result = await _probe.ProbeAsync(
            ConfigJson(endpoint), TimeSpan.FromSeconds(3), cts.Token);

        result.Ok.ShouldBeFalse();
        result.Message.ShouldNotBeNull();

        var message = result.Message!;
        var isKnownFailure =
            message.Contains("Connect failed", StringComparison.OrdinalIgnoreCase)
            || message.Contains("handshake failed", StringComparison.OrdinalIgnoreCase)
            || message.Contains("timed out", StringComparison.OrdinalIgnoreCase);
        isKnownFailure.ShouldBeTrue($"unexpected message: '{message}'");
    }

    // ── 4. ClassifyRpc mapping (pure, no live gateway) ───────────────────────────

    /// <summary>A gRPC <c>OK</c> status maps to a reachable, gateway-confirmed result.</summary>
    [Fact]
    public void ClassifyRpc_Ok_is_reachable()
    {
        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.OK, "host", 5120);

        ok.ShouldBeTrue();
        message.ShouldBe("gateway gRPC OK");
    }

    /// <summary>
    /// A gRPC <c>Unauthenticated</c> status counts as reachable — an auth rejection proves a
    /// live gateway gRPC server answered.
    /// </summary>
    [Fact]
    public void ClassifyRpc_Unauthenticated_is_reachable_auth_not_checked()
    {
        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.Unauthenticated, "host", 5120);

        ok.ShouldBeTrue();
        message.ShouldContain("auth not checked", Case.Insensitive);
    }

    /// <summary>A gRPC <c>PermissionDenied</c> status also counts as reachable.</summary>
    [Fact]
    public void ClassifyRpc_PermissionDenied_is_reachable_auth_not_checked()
    {
        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.PermissionDenied, "host", 5120);

        ok.ShouldBeTrue();
        message.ShouldContain("auth not checked", Case.Insensitive);
    }

    /// <summary>
    /// A gRPC <c>Unavailable</c> status is a transport failure — Ok=false with a message that
    /// names the host, port and the failing status code.
    /// </summary>
    [Fact]
    public void ClassifyRpc_Unavailable_is_not_reachable()
    {
        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.Unavailable, "host", 5120);

        ok.ShouldBeFalse();
        message.ShouldContain("Unavailable");
        message.ShouldContain("host:5120");
    }

    /// <summary>A gRPC <c>DeadlineExceeded</c> status is likewise a handshake failure.</summary>
    [Fact]
    public void ClassifyRpc_DeadlineExceeded_is_not_reachable()
    {
        var (ok, message) = GalaxyDriverProbe.ClassifyRpc(StatusCode.DeadlineExceeded, "host", 5120);

        ok.ShouldBeFalse();
        message.ShouldContain("DeadlineExceeded");
        message.ShouldContain("host:5120");
    }
}