feat(probe): Galaxy Test-Connect does a gRPC ping (auth-rejection counts as reachable)

This commit is contained in:
Joseph Doherty
2026-06-16 06:48:40 -04:00
parent b663ae6eff
commit 2d688c2a6d
2 changed files with 286 additions and 9 deletions
@@ -2,18 +2,32 @@ using System.Diagnostics;
using System.Net.Sockets;
using System.Text.Json;
using System.Text.Json.Serialization;
using Grpc.Core;
using ZB.MOM.WW.MxGateway.Client;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
/// Cheap TCP-connect probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config.
/// Parses the <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5001</c> or
/// <c>host:5001</c>), opens a socket and closes immediately. Surfaces a green tick +
/// latency on success; red chip + SocketError on failure; "timed out" on the caller's
/// cancellation. Does NOT exchange any gRPC frames — a richer gRPC ping probe is a
/// documented follow-up.
/// Probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config. Parses the
/// <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5120</c> or <c>host:5120</c>),
/// does a fast TCP-connect preflight to fail closed ports quickly, then issues a lightweight,
/// read-only gRPC ping — the <c>GalaxyRepository.TestConnection</c> unary RPC — against the
/// mxaccessgw to confirm the remote is actually a live gateway speaking gRPC. The channel is
/// built the same way the driver builds it (cleartext for <c>http://</c>, TLS for
/// <c>https://</c>), and the result is classified by the gRPC <see cref="StatusCode"/>:
/// <list type="bullet">
/// <item><c>OK</c> → reachable, gateway confirmed.</item>
/// <item><c>Unauthenticated</c> / <c>PermissionDenied</c> → ALSO reachable: an auth
/// rejection proves a live gateway gRPC server answered. The probe deliberately does
/// <b>not</b> resolve secrets — it sends whatever API-key string is in the config
/// (possibly an unresolved <c>env:</c>/<c>file:</c> ref or empty), so authentication is
/// expected to be rejected and that rejection is the positive signal.</item>
/// <item><c>Unavailable</c> / transport error / <c>DeadlineExceeded</c> → handshake
/// failed (the port answered TCP but did not complete a gRPC handshake).</item>
/// </list>
/// The ping is strictly read-only and never mutates gateway state.
/// </summary>
public sealed class GalaxyDriverProbe : IDriverProbe
{
@@ -39,13 +53,12 @@ public sealed class GalaxyDriverProbe : IDriverProbe
if (string.IsNullOrWhiteSpace(host) || port <= 0)
return new(false, "Config has no host/port to probe.", null);
// --- TCP preflight: fast-fail for closed ports / unreachable hosts ---
var sw = Stopwatch.StartNew();
try
{
using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
await socket.ConnectAsync(host, port, ct);
sw.Stop();
return new(true, null, sw.Elapsed);
}
catch (SocketException ex)
{
@@ -59,6 +72,107 @@ public sealed class GalaxyDriverProbe : IDriverProbe
{
return new(false, ex.Message, null);
}
// --- gRPC ping: confirm the port is a live mxaccessgw speaking gRPC ---
// TestConnection is a read-only metadata RPC (it asks the gateway whether it can reach
// the Galaxy Repository SQL Server). We don't care about its boolean result — only that
// the gateway answered the gRPC call. A successful return ⇒ OK; an auth rejection still
// proves a live gateway; a transport failure ⇒ the port isn't a gateway. We never resolve
// the API-key secret ref here (the host owns secret resolution): whatever string is in the
// config is sent as-is, so an unresolved/empty key surfaces as an auth rejection = reachable.
GalaxyRepositoryClient? client = null;
try
{
client = GalaxyRepositoryClient.Create(BuildProbeClientOptions(opts.Gateway, timeout));
using var deadlineCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
deadlineCts.CancelAfter(timeout);
await client.TestConnectionAsync(deadlineCts.Token).ConfigureAwait(false);
sw.Stop();
// No RpcException ⇒ the gateway answered the gRPC call successfully.
var (ok, message) = ClassifyRpc(StatusCode.OK, host, port);
return new(ok, message, sw.Elapsed);
}
catch (RpcException ex)
{
sw.Stop();
var (ok, message) = ClassifyRpc(ex.StatusCode, host, port);
return new(ok, message, ok ? sw.Elapsed : null);
}
catch (OperationCanceledException) when (ct.IsCancellationRequested)
{
// The caller cancelled (their own timeout / shutdown) — surface a timeout message.
return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
}
catch (OperationCanceledException)
{
// Our own per-call deadline fired before the gateway answered — treat as a failed
// handshake (the port answered TCP but didn't complete a gRPC handshake in time).
var (_, message) = ClassifyRpc(StatusCode.DeadlineExceeded, host, port);
return new(false, message, null);
}
catch (Exception ex)
{
// Transport / channel construction failures (e.g. an https endpoint mismatch, a TLS
// negotiation failure, or a non-gRPC server) — reachable on TCP but not a gateway.
return new(false,
$"Reachable at {host}:{port} but gateway gRPC handshake failed: {ex.Message}",
null);
}
finally
{
if (client is not null)
await client.DisposeAsync().ConfigureAwait(false);
}
}
/// <summary>
/// Translates the gRPC <see cref="StatusCode"/> observed for the ping RPC into a probe
/// verdict. The method touches no state and performs no I/O, so the mapping can be exercised
/// in unit tests without a running gateway.
/// </summary>
/// <param name="code">Status code of the ping call (<see cref="StatusCode.OK"/> when no
/// <see cref="RpcException"/> was raised).</param>
/// <param name="host">Probed host; only used to build the failure message.</param>
/// <param name="port">Probed port; only used to build the failure message.</param>
/// <returns>
/// <c>ok = true</c> for <see cref="StatusCode.OK"/> and for the auth rejections
/// <see cref="StatusCode.Unauthenticated"/> / <see cref="StatusCode.PermissionDenied"/> (a
/// rejection still proves a gRPC server answered); <c>ok = false</c> with a message naming
/// the status code for every other value.
/// </returns>
internal static (bool ok, string message) ClassifyRpc(StatusCode code, string host, int port)
{
    // The call completed without error.
    if (code == StatusCode.OK)
    {
        return (true, "gateway gRPC OK");
    }

    // An auth rejection is still a positive reachability signal.
    if (code is StatusCode.Unauthenticated or StatusCode.PermissionDenied)
    {
        return (true, "gateway reachable & speaking gRPC (auth not checked)");
    }

    // Anything else: TCP answered, but the gRPC exchange did not succeed.
    return (false, $"Reachable at {host}:{port} but gateway gRPC handshake failed: {code}");
}
/// <summary>
/// Builds the gRPC client options for the probe ping from the gateway config WITHOUT
/// resolving the API-key secret ref (the host owns secret resolution; the probe sends the
/// raw config string). Mirrors the driver's channel build — <c>UseTls</c> selects TLS vs
/// cleartext, and the CA path is honoured for TLS — but caps the call/connect timeouts to
/// the probe budget and uses a single attempt so an unreachable host fails fast.
/// </summary>
/// <param name="gw">Gateway section of the driver config. <c>Endpoint</c> may be a full
/// <c>http://</c>/<c>https://</c> URI or a bare <c>host:port</c>.</param>
/// <param name="timeout">Probe budget; a non-positive value falls back to one second.</param>
/// <returns>Client options for a single-attempt ping bounded by the probe budget.</returns>
private static MxGatewayClientOptions BuildProbeClientOptions(GalaxyGatewayOptions gw, TimeSpan timeout)
{
    // The gw client's Validate() rejects an empty API key. The config's secret ref may be an
    // unresolved env:/file: string (non-empty → passes) or, in degenerate configs, empty. Send
    // a non-empty placeholder only when the config carries no key string at all, so the gateway
    // can still answer (and reject) the call. We never resolve env:/file: refs to real secrets.
    var apiKey = string.IsNullOrWhiteSpace(gw.ApiKeySecretRef) ? "probe" : gw.ApiKeySecretRef;

    // Keep both timeouts inside the probe budget so the ping fails fast on an unreachable host.
    var budget = timeout > TimeSpan.Zero ? timeout : TimeSpan.FromSeconds(1);

    // A bare "host:port" endpoint is accepted by the TCP preflight, but System.Uri would
    // either read the host as the URI scheme ("localhost:5120" → scheme "localhost") or throw
    // ("10.0.0.5:5120"). Give scheme-less endpoints an explicit scheme that agrees with
    // UseTls, so the gRPC ping targets the same host/port the preflight just reached.
    var endpoint = gw.Endpoint.Trim();
    if (endpoint.IndexOf("://", StringComparison.Ordinal) < 0)
    {
        endpoint = (gw.UseTls ? "https://" : "http://") + endpoint;
    }

    return new MxGatewayClientOptions
    {
        Endpoint = new Uri(endpoint, UriKind.Absolute),
        ApiKey = apiKey,
        UseTls = gw.UseTls,
        CaCertificatePath = gw.CaCertificatePath,
        ConnectTimeout = budget,
        DefaultCallTimeout = budget,
        // One shot — the probe must not spin on transient (Unavailable/DeadlineExceeded)
        // retries; the caller additionally bounds the whole call with its own deadline.
        Retry = new MxGatewayClientRetryOptions { MaxAttempts = 1 },
    };
}
private static (string host, int port) ExtractTarget(GalaxyDriverOptions opts)
@@ -66,7 +180,7 @@ public sealed class GalaxyDriverProbe : IDriverProbe
var endpoint = opts.Gateway.Endpoint;
if (string.IsNullOrWhiteSpace(endpoint)) return (string.Empty, 0);
// Try absolute URI first (e.g. "http://hostname:5001" or "https://hostname:5001").
// Try absolute URI first (e.g. "http://hostname:5120" or "https://hostname:5120").
if (Uri.TryCreate(endpoint, UriKind.Absolute, out var uri))
{
var host = uri.Host;