feat(probe): Galaxy Test-Connect does a gRPC ping (auth-rejection counts as reachable)
This commit is contained in:
@@ -2,18 +2,32 @@ using System.Diagnostics;
|
||||
using System.Net.Sockets;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using Grpc.Core;
|
||||
using ZB.MOM.WW.MxGateway.Client;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
|
||||
|
||||
/// <summary>
|
||||
/// Cheap TCP-connect probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config.
|
||||
/// Parses the <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5001</c> or
|
||||
/// <c>host:5001</c>), opens a socket and closes immediately. Surfaces a green tick +
|
||||
/// latency on success; red chip + SocketError on failure; "timed out" on the caller's
|
||||
/// cancellation. Does NOT exchange any gRPC frames — a richer gRPC ping probe is a
|
||||
/// documented follow-up.
|
||||
/// Probe for the <see cref="GalaxyDriverOptions"/>-shaped driver config. Parses the
|
||||
/// <c>Gateway.Endpoint</c> gRPC endpoint (e.g. <c>http://host:5120</c> or <c>host:5120</c>),
|
||||
/// does a fast TCP-connect preflight to fail closed ports quickly, then issues a lightweight,
|
||||
/// read-only gRPC ping — the <c>GalaxyRepository.TestConnection</c> unary RPC — against the
|
||||
/// mxaccessgw to confirm the remote is actually a live gateway speaking gRPC. The channel is
|
||||
/// built the same way the driver builds it (cleartext for <c>http://</c>, TLS for
|
||||
/// <c>https://</c>), and the result is classified by the gRPC <see cref="StatusCode"/>:
|
||||
/// <list type="bullet">
|
||||
/// <item><c>OK</c> → reachable, gateway confirmed.</item>
|
||||
/// <item><c>Unauthenticated</c> / <c>PermissionDenied</c> → ALSO reachable: an auth
|
||||
/// rejection proves a live gateway gRPC server answered. The probe deliberately does
|
||||
/// <b>not</b> resolve secrets — it sends whatever API-key string is in the config
|
||||
/// (possibly an unresolved <c>env:</c>/<c>file:</c> ref or empty), so authentication is
|
||||
/// expected to be rejected and that rejection is the positive signal.</item>
|
||||
/// <item><c>Unavailable</c> / transport error / <c>DeadlineExceeded</c> → handshake
|
||||
/// failed (the port answered TCP but did not complete a gRPC handshake).</item>
|
||||
/// </list>
|
||||
/// The ping is strictly read-only and never mutates gateway state.
|
||||
/// </summary>
|
||||
public sealed class GalaxyDriverProbe : IDriverProbe
|
||||
{
|
||||
@@ -39,13 +53,12 @@ public sealed class GalaxyDriverProbe : IDriverProbe
|
||||
if (string.IsNullOrWhiteSpace(host) || port <= 0)
|
||||
return new(false, "Config has no host/port to probe.", null);
|
||||
|
||||
// --- TCP preflight: fast-fail for closed ports / unreachable hosts ---
|
||||
var sw = Stopwatch.StartNew();
|
||||
try
|
||||
{
|
||||
using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
|
||||
await socket.ConnectAsync(host, port, ct);
|
||||
sw.Stop();
|
||||
return new(true, null, sw.Elapsed);
|
||||
}
|
||||
catch (SocketException ex)
|
||||
{
|
||||
@@ -59,6 +72,107 @@ public sealed class GalaxyDriverProbe : IDriverProbe
|
||||
{
|
||||
return new(false, ex.Message, null);
|
||||
}
|
||||
|
||||
// --- gRPC ping: confirm the port is a live mxaccessgw speaking gRPC ---
|
||||
// TestConnection is a read-only metadata RPC (it asks the gateway whether it can reach
|
||||
// the Galaxy Repository SQL Server). We don't care about its boolean result — only that
|
||||
// the gateway answered the gRPC call. A successful return ⇒ OK; an auth rejection still
|
||||
// proves a live gateway; a transport failure ⇒ the port isn't a gateway. We never resolve
|
||||
// the API-key secret ref here (the host owns secret resolution): whatever string is in the
|
||||
// config is sent as-is, so an unresolved/empty key surfaces as an auth rejection = reachable.
|
||||
GalaxyRepositoryClient? client = null;
|
||||
try
|
||||
{
|
||||
client = GalaxyRepositoryClient.Create(BuildProbeClientOptions(opts.Gateway, timeout));
|
||||
|
||||
using var deadlineCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
deadlineCts.CancelAfter(timeout);
|
||||
|
||||
await client.TestConnectionAsync(deadlineCts.Token).ConfigureAwait(false);
|
||||
sw.Stop();
|
||||
|
||||
// No RpcException ⇒ the gateway answered the gRPC call successfully.
|
||||
var (ok, message) = ClassifyRpc(StatusCode.OK, host, port);
|
||||
return new(ok, message, sw.Elapsed);
|
||||
}
|
||||
catch (RpcException ex)
|
||||
{
|
||||
sw.Stop();
|
||||
var (ok, message) = ClassifyRpc(ex.StatusCode, host, port);
|
||||
return new(ok, message, ok ? sw.Elapsed : null);
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||
{
|
||||
// The caller cancelled (their own timeout / shutdown) — surface a timeout message.
|
||||
return new(false, $"Probe timed out after {timeout.TotalSeconds:F0}s.", null);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Our own per-call deadline fired before the gateway answered — treat as a failed
|
||||
// handshake (the port answered TCP but didn't complete a gRPC handshake in time).
|
||||
var (_, message) = ClassifyRpc(StatusCode.DeadlineExceeded, host, port);
|
||||
return new(false, message, null);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Transport / channel construction failures (e.g. an https endpoint mismatch, a TLS
|
||||
// negotiation failure, or a non-gRPC server) — reachable on TCP but not a gateway.
|
||||
return new(false,
|
||||
$"Reachable at {host}:{port} but gateway gRPC handshake failed: {ex.Message}",
|
||||
null);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (client is not null)
|
||||
await client.DisposeAsync().ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Translates the gRPC <see cref="StatusCode"/> observed for the ping RPC into the probe's
/// (success flag, message) pair. The method performs no I/O, so the mapping can be tested
/// without a live gateway.
/// </summary>
/// <param name="code">Status code of the ping call (<see cref="StatusCode.OK"/> when no
/// <c>RpcException</c> was raised).</param>
/// <param name="host">Probed host name, used only in the failure message.</param>
/// <param name="port">Probed port, used only in the failure message.</param>
/// <returns>
/// <c>ok = true</c> for <see cref="StatusCode.OK"/> and for the auth rejections
/// <see cref="StatusCode.Unauthenticated"/> / <see cref="StatusCode.PermissionDenied"/>
/// (a rejection still shows that a gRPC server answered); <c>ok = false</c> with a message
/// naming the host, port and status code for every other code.
/// </returns>
internal static (bool ok, string message) ClassifyRpc(StatusCode code, string host, int port)
{
    switch (code)
    {
        case StatusCode.OK:
            return (true, "gateway gRPC OK");

        // An auth rejection is still an answer from a gRPC server, so it counts as reachable.
        case StatusCode.Unauthenticated:
        case StatusCode.PermissionDenied:
            return (true, "gateway reachable & speaking gRPC (auth not checked)");

        default:
            return (false, $"Reachable at {host}:{port} but gateway gRPC handshake failed: {code}");
    }
}
|
||||
|
||||
/// <summary>
/// Builds the gRPC client options for the probe ping from the gateway config WITHOUT
/// resolving the API-key secret ref (the host owns secret resolution; the probe sends the
/// raw config string). <c>UseTls</c> and the CA path are passed through unchanged, both
/// timeouts are capped to the probe budget, and a single attempt is used so an unreachable
/// host fails fast.
/// </summary>
/// <param name="gw">Gateway section of the driver config. <c>Endpoint</c> may be a full
/// <c>http://</c>/<c>https://</c> URI or a bare <c>host:port</c>; the caller has already
/// verified that it is non-blank.</param>
/// <param name="timeout">Probe budget. A non-positive value falls back to one second.</param>
/// <returns>Options for a one-shot, budget-bounded gateway client.</returns>
/// <exception cref="UriFormatException">The (normalised) endpoint is not a valid absolute URI.</exception>
private static MxGatewayClientOptions BuildProbeClientOptions(GalaxyGatewayOptions gw, TimeSpan timeout)
{
    // The gw client's Validate() rejects an empty API key. The config's secret ref may be an
    // unresolved env:/file: string (non-empty → passes) or, in degenerate configs, empty. Send
    // a non-empty placeholder only when the config carries no key string at all, so the gateway
    // can still answer (and reject) the call. We never resolve env:/file: refs to real secrets.
    var apiKey = string.IsNullOrWhiteSpace(gw.ApiKeySecretRef) ? "probe" : gw.ApiKeySecretRef;

    // Keep both timeouts inside the probe budget so the ping fails fast on an unreachable host.
    var budget = timeout > TimeSpan.Zero ? timeout : TimeSpan.FromSeconds(1);

    // A bare "host:port" endpoint is not a usable absolute URI: System.Uri reads "host:5120"
    // as scheme "host" with no authority, and "10.0.0.5:5120" throws UriFormatException.
    // Either way the TCP preflight would succeed and the gRPC ping would then fail with a
    // misleading "handshake failed". Give scheme-less endpoints an explicit scheme, chosen by
    // UseTls, and leave endpoints that already carry a scheme untouched.
    var endpoint = gw.Endpoint.Trim();
    var endpointWithScheme = endpoint.Contains("://", StringComparison.Ordinal)
        ? endpoint
        : (gw.UseTls ? "https://" : "http://") + endpoint;

    return new MxGatewayClientOptions
    {
        Endpoint = new Uri(endpointWithScheme, UriKind.Absolute),
        ApiKey = apiKey,
        UseTls = gw.UseTls,
        CaCertificatePath = gw.CaCertificatePath,
        ConnectTimeout = budget,
        DefaultCallTimeout = budget,
        // One shot — the probe must not spin on transient (Unavailable/DeadlineExceeded)
        // retries; the caller's linked deadline bounds the whole call regardless.
        Retry = new MxGatewayClientRetryOptions { MaxAttempts = 1 },
    };
}
|
||||
|
||||
private static (string host, int port) ExtractTarget(GalaxyDriverOptions opts)
|
||||
@@ -66,7 +180,7 @@ public sealed class GalaxyDriverProbe : IDriverProbe
|
||||
var endpoint = opts.Gateway.Endpoint;
|
||||
if (string.IsNullOrWhiteSpace(endpoint)) return (string.Empty, 0);
|
||||
|
||||
// Try absolute URI first (e.g. "http://hostname:5001" or "https://hostname:5001").
|
||||
// Try absolute URI first (e.g. "http://hostname:5120" or "https://hostname:5120").
|
||||
if (Uri.TryCreate(endpoint, UriKind.Absolute, out var uri))
|
||||
{
|
||||
var host = uri.Host;
|
||||
|
||||
Reference in New Issue
Block a user