fix(client/python): reachable cert-validation flag; bounded off-loop TOFU probe; license/marker fixes (Client.Python-027..031)

This commit is contained in:
Joseph Doherty
2026-06-15 02:39:11 -04:00
parent d0d1dcef15
commit 47062c1a6e
11 changed files with 550 additions and 13 deletions
+12 -1
View File
@@ -238,7 +238,11 @@ left `False`, the client fetches the gateway's presented certificate once
to `localhost` (the generated certificate always carries a `localhost` SAN) when
none was supplied. To verify instead, pass `ca_file` to verify against a specific
CA, or set `require_certificate_validation=True` to verify against the system
trust roots. See
trust roots. The strict posture is reachable through every documented entry
point: the `require_certificate_validation=True` keyword on
`GatewayClient.connect(...)` / `GalaxyRepositoryClient.connect(...)`, the
`ClientOptions(require_certificate_validation=True)` struct, and the
`--require-certificate-validation` CLI flag. See
[Gateway Configuration](../../docs/GatewayConfiguration.md#automatic-self-signed-certificate).
## CLI
@@ -267,6 +271,13 @@ Use TLS options for a secured gateway:
mxgw-py smoke --endpoint mxgateway.example.local:5001 --tls --ca-file C:\certs\mxgateway-ca.pem --server-name-override mxgateway.example.local --api-key-env MXGATEWAY_API_KEY --item Object.Attribute --json
```
To force certificate validation against the system trust store instead of the
lenient trust-on-first-use default, add `--require-certificate-validation`:
```powershell
mxgw-py smoke --endpoint mxgateway.example.local:5001 --tls --require-certificate-validation --api-key-env MXGATEWAY_API_KEY --item Object.Attribute --json
```
## Integration Checks
Run live checks only when a gateway and MXAccess-backed worker are available:
+4 -2
View File
@@ -16,11 +16,10 @@ dependencies = [
authors = [
{ name = "Joseph Doherty" },
]
license = { text = "Proprietary" }
license = "LicenseRef-Proprietary"
keywords = ["mxaccess", "mxgateway", "grpc", "client", "archestra"]
classifiers = [
"Development Status :: 3 - Alpha",
"License :: Other/Proprietary License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
@@ -54,3 +53,6 @@ where = ["src"]
addopts = "-ra"
pythonpath = ["src"]
testpaths = ["tests"]
markers = [
"tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1",
]
@@ -40,6 +40,7 @@ class GatewayClient:
api_key: str | None = None,
plaintext: bool = False,
ca_file: str | None = None,
require_certificate_validation: bool = False,
server_name_override: str | None = None,
stub: Any | None = None,
) -> "GatewayClient":
@@ -50,13 +51,16 @@ class GatewayClient:
api_key=api_key,
plaintext=plaintext,
ca_file=ca_file,
require_certificate_validation=require_certificate_validation,
server_name_override=server_name_override,
)
if stub is not None:
return cls(options=resolved, stub=stub)
channel = create_channel(resolved)
# create_channel may perform a blocking TLS certificate probe (TOFU
# default); run it off the event loop so connect never freezes the loop.
channel = await asyncio.to_thread(create_channel, resolved)
return cls(
options=resolved,
stub=pb_grpc.MxAccessGatewayStub(channel),
@@ -52,6 +52,7 @@ class GalaxyRepositoryClient:
api_key: str | None = None,
plaintext: bool = False,
ca_file: str | None = None,
require_certificate_validation: bool = False,
server_name_override: str | None = None,
stub: Any | None = None,
) -> "GalaxyRepositoryClient":
@@ -62,13 +63,16 @@ class GalaxyRepositoryClient:
api_key=api_key,
plaintext=plaintext,
ca_file=ca_file,
require_certificate_validation=require_certificate_validation,
server_name_override=server_name_override,
)
if stub is not None:
return cls(options=resolved, stub=stub)
channel = create_channel(resolved)
# create_channel may perform a blocking TLS certificate probe (TOFU
# default); run it off the event loop so connect never freezes the loop.
channel = await asyncio.to_thread(create_channel, resolved)
return cls(
options=resolved,
stub=galaxy_pb_grpc.GalaxyRepositoryStub(channel),
@@ -12,6 +12,10 @@ import grpc
from .auth import REDACTED, ApiKey
from .errors import MxGatewayTransportError
# Fallback bound for the TOFU certificate probe when no call_timeout is set, so a
# black-holed host fails fast instead of hanging on the OS default connect timeout.
_TOFU_PROBE_TIMEOUT_SECONDS = 10.0
@dataclass(frozen=True)
class ClientOptions:
@@ -88,8 +92,17 @@ def _split_authority(endpoint: str) -> tuple[str, int]:
remainder = target[bracket_end + 1 :] # ":5120" or ""
port_str = remainder.lstrip(":")
return (host, int(port_str) if port_str else 443)
host, _, port = target.rpartition(":")
return (host or "localhost", int(port) if port else 443)
host, sep, port = target.rpartition(":")
if not sep:
# No colon at all (e.g. a bare hostname "mygateway"): the whole target
# is the host; default the port rather than raising on int("mygateway").
return (target or "localhost", 443)
if not port.isdigit():
# A colon with a non-numeric / empty tail (e.g. a trailing ":") is not
# an explicit port — keep the left side as the host and default the
# port so a typo cannot raise an uncaught ValueError on the TOFU path.
return (host or "localhost", 443)
return (host or "localhost", int(port))
def create_channel(options: ClientOptions) -> grpc.aio.Channel:
@@ -120,9 +133,15 @@ def create_channel(options: ClientOptions) -> grpc.aio.Channel:
else:
# Lenient default: grpc-python has no per-channel skip-verify, so fetch the
# server's certificate (unverified) and pin it for this channel (TOFU).
# The probe opens a real blocking TCP+TLS socket, so it MUST be bounded —
# a black-holed / firewall-drop host would otherwise hang on the OS default
# connect timeout (minutes). Bound it by call_timeout (or a short fixed
# fallback) so the dial fails fast as a transport error. The async
# `connect` classmethods run this off the event loop (asyncio.to_thread).
host, port = _split_authority(options.endpoint)
probe_timeout = options.call_timeout if options.call_timeout else _TOFU_PROBE_TIMEOUT_SECONDS
try:
presented = ssl.get_server_certificate((host, port))
presented = ssl.get_server_certificate((host, port), timeout=probe_timeout)
except OSError as error:
raise MxGatewayTransportError(
f"failed to fetch TLS certificate from {options.endpoint}: {error}"
@@ -170,6 +170,13 @@ def gateway_options(command: Callable[..., Any]) -> Callable[..., Any]:
command = click.option("--plaintext", is_flag=True, help="Use plaintext gRPC.")(command)
command = click.option("--tls", "use_tls", is_flag=True, help="Use TLS gRPC.")(command)
command = click.option("--ca-file", default=None, help="Custom root certificate file.")(command)
command = click.option(
"--require-certificate-validation",
"require_certificate_validation",
is_flag=True,
help="Verify the TLS certificate against the system trust store "
"instead of the lenient trust-on-first-use default.",
)(command)
command = click.option(
"--server-name-override",
default=None,
@@ -923,6 +930,7 @@ async def _connect(kwargs: dict[str, Any]) -> GatewayClient:
api_key=api_key,
plaintext=_use_plaintext(kwargs),
ca_file=kwargs.get("ca_file"),
require_certificate_validation=bool(kwargs.get("require_certificate_validation")),
server_name_override=kwargs.get("server_name_override"),
call_timeout=kwargs.get("call_timeout"),
stream_timeout=kwargs.get("stream_timeout"),
+50 -2
View File
@@ -1,9 +1,12 @@
"""Tests for auth metadata and connection options."""
import socket
import pytest
from zb_mom_ww_mxgateway.auth import REDACTED, ApiKey, auth_metadata, redact_secret
from zb_mom_ww_mxgateway import options as options_module
from zb_mom_ww_mxgateway.errors import MxGatewayTransportError
from zb_mom_ww_mxgateway.options import ClientOptions, create_channel
@@ -80,7 +83,9 @@ def test_create_channel_uses_tls_channel_tofu_default(monkeypatch: pytest.Monkey
_DUMMY_PEM = "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n"
get_cert_calls: list[tuple[str, int]] = []
def fake_get_server_certificate(addr: tuple[str, int]) -> str:
def fake_get_server_certificate(
addr: tuple[str, int], *, timeout: float | None = None
) -> str:
get_cert_calls.append(addr)
return _DUMMY_PEM
@@ -133,7 +138,7 @@ def test_create_channel_uses_tls_channel_tofu_respects_server_name_override(
monkeypatch.setattr(
options_module.ssl,
"get_server_certificate",
lambda addr: _DUMMY_PEM,
lambda addr, *, timeout=None: _DUMMY_PEM,
)
cred_calls: list[object] = []
@@ -276,3 +281,46 @@ def test_create_channel_uses_tls_channel_ca_file(
],
),
]
def test_tofu_probe_passes_a_bounded_timeout(monkeypatch: pytest.MonkeyPatch) -> None:
"""The TOFU cert pre-fetch must be bounded so a black-holed host fails fast."""
captured: dict[str, object] = {}
def fake_get_server_certificate(addr: object, *, timeout: float | None = None) -> str:
captured["timeout"] = timeout
return "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n"
monkeypatch.setattr(options_module.ssl, "get_server_certificate", fake_get_server_certificate)
monkeypatch.setattr(options_module.grpc, "ssl_channel_credentials", lambda **_: "creds")
monkeypatch.setattr(
options_module.grpc.aio,
"secure_channel",
lambda endpoint, credentials, *, options: "tls-channel",
)
create_channel(ClientOptions(endpoint="gateway.example:5001", call_timeout=7.5))
# A finite, positive timeout must be supplied (bounded by call_timeout here).
assert isinstance(captured["timeout"], (int, float))
assert 0 < captured["timeout"] <= 7.5
@pytest.mark.parametrize(
"raised",
[socket.timeout("timed out"), TimeoutError("timed out"), OSError("connection refused")],
)
def test_tofu_probe_timeout_raises_transport_error(
monkeypatch: pytest.MonkeyPatch, raised: Exception
) -> None:
"""A timed-out / failed probe surfaces as MxGatewayTransportError, not a raw error."""
def fake_get_server_certificate(addr: object, *, timeout: float | None = None) -> str:
raise raised
monkeypatch.setattr(options_module.ssl, "get_server_certificate", fake_get_server_certificate)
options = ClientOptions(endpoint="gateway.example:5001")
with pytest.raises(MxGatewayTransportError) as excinfo:
create_channel(options)
assert options.endpoint in str(excinfo.value)
+65
View File
@@ -2,14 +2,79 @@
import json
import pytest
from click.testing import CliRunner
from zb_mom_ww_mxgateway import __version__
from zb_mom_ww_mxgateway_cli import commands as commands_module
from zb_mom_ww_mxgateway_cli.commands import main
_BATCH_EOR = "__MXGW_BATCH_EOR__"
def test_require_certificate_validation_flag_flows_through_connect(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""The --require-certificate-validation CLI flag must reach ClientOptions (Client.Python-027)."""
captured: dict[str, object] = {}
async def fake_connect(options, **_kwargs):
captured["options"] = options
# Return a minimal object that supports the async context-manager protocol
# used by every CLI command body (async with await _connect(...) as client).
return _FakeAsyncClient()
monkeypatch.setattr(commands_module.GatewayClient, "connect", fake_connect)
result = CliRunner().invoke(
main,
[
"open-session",
"--endpoint",
"gateway.example:5001",
"--require-certificate-validation",
"--json",
],
)
assert result.exit_code == 0, result.output
assert captured["options"].require_certificate_validation is True
def test_require_certificate_validation_defaults_off(monkeypatch: pytest.MonkeyPatch) -> None:
"""Without the flag the strict-validation posture stays off (TOFU default)."""
captured: dict[str, object] = {}
async def fake_connect(options, **_kwargs):
captured["options"] = options
return _FakeAsyncClient()
monkeypatch.setattr(commands_module.GatewayClient, "connect", fake_connect)
result = CliRunner().invoke(
main,
["open-session", "--endpoint", "gateway.example:5001", "--plaintext", "--json"],
)
assert result.exit_code == 0, result.output
assert captured["options"].require_certificate_validation is False
class _FakeAsyncClient:
"""Minimal async-context-manager fake satisfying the open-session command body."""
async def __aenter__(self) -> "_FakeAsyncClient":
return self
async def __aexit__(self, *_exc: object) -> None:
return None
async def open_session_raw(self, *_args, **_kwargs):
from zb_mom_ww_mxgateway.generated import mxaccess_gateway_pb2 as pb
return pb.OpenSessionReply(session_id="cli-test-session")
def test_version_json_is_deterministic() -> None:
runner = CliRunner()
@@ -8,9 +8,107 @@ from typing import Any
import pytest
from zb_mom_ww_mxgateway import ClientOptions, GatewayClient, MxAccessError
from zb_mom_ww_mxgateway import client as client_module
from zb_mom_ww_mxgateway import galaxy as galaxy_module
from zb_mom_ww_mxgateway.galaxy import GalaxyRepositoryClient
from zb_mom_ww_mxgateway.generated import mxaccess_gateway_pb2 as pb
@pytest.mark.asyncio
async def test_gateway_connect_forwards_require_certificate_validation(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""The connect convenience kwarg must reach ClientOptions (Client.Python-027)."""
captured: dict[str, Any] = {}
def fake_create_channel(options: ClientOptions) -> object:
captured["options"] = options
return object()
monkeypatch.setattr(client_module, "create_channel", fake_create_channel)
monkeypatch.setattr(client_module.pb_grpc, "MxAccessGatewayStub", lambda channel: object())
await GatewayClient.connect(
endpoint="gateway.example:5001",
require_certificate_validation=True,
)
assert captured["options"].require_certificate_validation is True
@pytest.mark.asyncio
async def test_galaxy_connect_forwards_require_certificate_validation(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""GalaxyRepositoryClient.connect must thread the flag too (Client.Python-027)."""
captured: dict[str, Any] = {}
def fake_create_channel(options: ClientOptions) -> object:
captured["options"] = options
return object()
monkeypatch.setattr(galaxy_module, "create_channel", fake_create_channel)
monkeypatch.setattr(
galaxy_module.galaxy_pb_grpc, "GalaxyRepositoryStub", lambda channel: object()
)
await GalaxyRepositoryClient.connect(
endpoint="gateway.example:5001",
require_certificate_validation=True,
)
assert captured["options"].require_certificate_validation is True
@pytest.mark.asyncio
async def test_gateway_connect_runs_create_channel_off_the_event_loop(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""connect must run the blocking channel factory off the loop (Client.Python-028)."""
ran_in_thread: dict[str, bool] = {}
def fake_create_channel(options: ClientOptions) -> object:
# If this runs on the event loop thread, get_running_loop() succeeds.
try:
asyncio.get_running_loop()
ran_in_thread["off_loop"] = False
except RuntimeError:
ran_in_thread["off_loop"] = True
return object()
monkeypatch.setattr(client_module, "create_channel", fake_create_channel)
monkeypatch.setattr(client_module.pb_grpc, "MxAccessGatewayStub", lambda channel: object())
await GatewayClient.connect(endpoint="gateway.example:5001")
assert ran_in_thread["off_loop"] is True
@pytest.mark.asyncio
async def test_galaxy_connect_runs_create_channel_off_the_event_loop(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""GalaxyRepositoryClient.connect must also run the probe off the loop (Client.Python-028)."""
ran_in_thread: dict[str, bool] = {}
def fake_create_channel(options: ClientOptions) -> object:
try:
asyncio.get_running_loop()
ran_in_thread["off_loop"] = False
except RuntimeError:
ran_in_thread["off_loop"] = True
return object()
monkeypatch.setattr(galaxy_module, "create_channel", fake_create_channel)
monkeypatch.setattr(
galaxy_module.galaxy_pb_grpc, "GalaxyRepositoryStub", lambda channel: object()
)
await GalaxyRepositoryClient.connect(endpoint="gateway.example:5001")
assert ran_in_thread["off_loop"] is True
@pytest.mark.asyncio
async def test_session_helpers_send_auth_metadata_and_preserve_raw_replies() -> None:
stub = FakeGatewayStub()
+11
View File
@@ -134,6 +134,17 @@ def test_split_authority_parses_host_and_port() -> None:
assert _split_authority(":5120") == ("localhost", 5120)
def test_split_authority_defaults_port_for_portless_endpoint() -> None:
from zb_mom_ww_mxgateway.options import _split_authority
# A bare hostname (no ":port") must default to 443, not crash on int("mygateway").
assert _split_authority("mygateway") == ("mygateway", 443)
# Scheme-prefixed bare hostname behaves the same.
assert _split_authority("https://mygateway") == ("mygateway", 443)
# A non-numeric tail after a colon is treated as no explicit port.
assert _split_authority("mygateway:") == ("mygateway", 443)
def test_split_authority_strips_ipv6_brackets() -> None:
from zb_mom_ww_mxgateway.options import _split_authority
+270 -3
View File
@@ -4,16 +4,48 @@
|---|---|
| Module | `clients/python` |
| Reviewer | Claude Code |
| Review date | 2026-05-24 |
| Commit reviewed | `42b0037` |
| Review date | 2026-06-15 |
| Commit reviewed | `410acc9` |
| Status | Re-reviewed |
| Open findings | 0 |
## Checklist coverage
### 2026-06-15 re-review (commit 410acc9)
Re-review pass at `410acc9`. The diff against the previous review base
`42b0037` covers: PyPI metadata + Gitea PyPI-feed install instructions in
`pyproject.toml` / `README.md`; a new lazy Galaxy browse surface
(`GalaxyRepositoryClient.browse_children_raw` / `browse` / `_iter_browse_children`,
the `LazyBrowseNode` walker, and `BrowseChildrenOptions`); a TLS
trust-on-first-use (TOFU) default in `options.py` gated by a new
`ClientOptions.require_certificate_validation` flag; the `_use_plaintext`
TLS-default contract carried forward; and the `batch` `CliRunner`-removal
follow-through. The new browse / TOFU surface is well tested
(`tests/test_galaxy.py`, `tests/test_auth_options.py`, `tests/test_tls.py`).
`python -m pytest` passes (80 passed, 1 skipped — the loopback-TLS test is
opt-in via `MXGATEWAY_RUN_TLS_TESTS=1`). `python -m pip wheel .` builds the
wheel cleanly against the installed setuptools 82.0.1.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | Issue found: `_split_authority` raises an uncaught `ValueError` for a port-less endpoint instead of a transport error (Client.Python-029). |
| 2 | mxaccessgw conventions | No new issues found — secrets still redacted, generated code untouched, no committed tokens in the new Gitea feed URLs (placeholders only). |
| 3 | Concurrency & thread safety | No new issues found — `LazyBrowseNode.expand` uses a per-node `asyncio.Lock` with a double-checked guard and is verified concurrent-safe by `test_browse_expand_concurrent_callers_only_fire_one_rpc`. |
| 4 | Error handling & resilience | Issue found: the TOFU branch calls the blocking `ssl.get_server_certificate` with no timeout from inside the `async def connect` path, blocking the event loop and hanging indefinitely on a black-holed host (Client.Python-028). |
| 5 | Security | Issue found: the new `require_certificate_validation` security flag is not reachable through the documented `connect(...)` convenience kwargs or any CLI flag, so callers using those paths are locked into TOFU and cannot force certificate validation (Client.Python-027). TOFU itself is design-sanctioned (`docs/GatewayConfiguration.md` line 470). |
| 6 | Performance & resource management | No new issues found beyond the blocking TLS probe captured in Client.Python-028. |
| 7 | Design-document adherence | No new issues found — TOFU default, `require_certificate_validation` naming, and the BrowseChildren surface match `docs/GatewayConfiguration.md` / `docs/GalaxyRepository.md`; both README doc anchors resolve. |
| 8 | Code organization & conventions | Issue found: `pyproject.toml` uses the PEP 639-deprecated `license = { text = ... }` table form (Client.Python-030). pyproject metadata is otherwise correct and the wheel builds. |
| 9 | Testing coverage | Issue found: the `tls` pytest mark used by `tests/test_tls.py` is not registered in `[tool.pytest.ini_options]`, emitting a `PytestUnknownMarkWarning` (Client.Python-031). New browse / TOFU paths are otherwise well covered. |
| 10 | Documentation & comments | No new issues found — README TLS/browse/Gitea-feed prose matches the code; the alarm-CLI README examples corrected under Client.Python-022 remain correct. |
### Prior coverage (commit a020350)
A re-review at commit `a020350` over the same module. Prior findings
(Client.Python-001 — Client.Python-017) remain closed and are kept as
history. This section reflects categories evaluated in this pass.
history. This section reflects categories evaluated in that pass.
| # | Category | Result |
|---|---|---|
@@ -1171,3 +1203,238 @@ scope; `test_commands_module_bench_read_bulk_does_not_use_bare_except_pass`
greps the function source for the `except Exception:\n pass` pattern
and rejects it. Both tests failed against the pre-fix source and pass
against the fix.
### Client.Python-027
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Security |
| Location | `clients/python/src/zb_mom_ww_mxgateway/client.py:36-54`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:47-66`, `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:165-172,918-930` |
| Status | Resolved |
**Description:** This commit adds `ClientOptions.require_certificate_validation`
(default `False`) so a caller can force system-trust certificate verification
instead of the new lenient trust-on-first-use (TOFU) default. The flag is
honoured inside `create_channel`, but it is not surfaced through either of the
two documented ways a normal caller dials the gateway:
1. `GatewayClient.connect(...)` and `GalaxyRepositoryClient.connect(...)` accept
the convenience kwargs `endpoint` / `api_key` / `plaintext` / `ca_file` /
`server_name_override` and build the `ClientOptions` internally, but do **not**
accept or forward `require_certificate_validation`. The README's high-level
examples (e.g. the lazy-browse walker) use exactly this kwarg form
(`GalaxyRepositoryClient.connect(endpoint=..., api_key=..., plaintext=True)`),
so the kwarg path is the primary documented entry point.
2. The CLI exposes `--plaintext`, `--tls`, and `--ca-file` but no
`--require-certificate-validation` flag, and `_connect` constructs
`ClientOptions(...)` without setting the field. A CLI user connecting to a
TLS gateway is therefore locked into TOFU.
The net effect is that the *only* way to opt into real certificate validation is
to construct a `ClientOptions` instance directly and pass it as the positional
`options=` argument — a path neither the README nor the CLI documents. A
security-sensitive deployment that wants the strict (verify-against-system-trust)
posture cannot select it through the documented surface, so it silently stays on
TOFU. TOFU itself is design-sanctioned (`docs/GatewayConfiguration.md` line 470
explicitly says "Python uses trust-on-first-use"), so this is an opt-in-to-strict
reachability gap rather than an insecure default — hence Medium with a workaround.
**Recommendation:** Add a `require_certificate_validation: bool = False` kwarg to
both `GatewayClient.connect` and `GalaxyRepositoryClient.connect` and forward it
into the constructed `ClientOptions`. Add a `--require-certificate-validation`
(or `--verify-tls`) flag to the shared CLI option set and wire it through
`_connect`. Add a test asserting the flag flows through to
`ClientOptions.require_certificate_validation` and a README note documenting how
to select the strict posture.
**Resolution:** 2026-06-15 — Confirmed: `connect` built `ClientOptions` from a
fixed kwarg set that omitted `require_certificate_validation`, and the CLI had no
flag, so the strict posture was only reachable via a hand-built `options=`. Added
a `require_certificate_validation: bool = False` kwarg to both
`GatewayClient.connect` and `GalaxyRepositoryClient.connect` (forwarded into the
constructed `ClientOptions`), a `--require-certificate-validation` flag to the
shared `gateway_options` CLI option set, and wired it through `_connect`. README
TLS section now documents the strict posture is reachable via the connect kwarg,
the options struct, and the CLI flag. Tests:
`tests/test_client_session.py::test_gateway_connect_forwards_require_certificate_validation`,
`::test_galaxy_connect_forwards_require_certificate_validation`,
`tests/test_cli.py::test_require_certificate_validation_flag_flows_through_connect`,
`::test_require_certificate_validation_defaults_off` — all failed before the fix
and pass after.
### Client.Python-028
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Error handling & resilience |
| Location | `clients/python/src/zb_mom_ww_mxgateway/options.py:120-130`, `clients/python/src/zb_mom_ww_mxgateway/client.py:59`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:71` |
| Status | Resolved |
**Description:** The TOFU branch of `create_channel` calls
`ssl.get_server_certificate((host, port))` to pre-fetch the server certificate.
`create_channel` is a synchronous function, but it is invoked exclusively from
inside the `async def connect` classmethods of `GatewayClient` and
`GalaxyRepositoryClient` (`client.py:59`, `galaxy.py:71`). `ssl.get_server_certificate`
opens a real blocking TCP+TLS socket on the calling thread, so:
1. It **blocks the asyncio event loop** for the full duration of the connect/handshake.
This is at odds with the rest of the client, which is fully `async`.
2. It passes **no `timeout`** to `ssl.get_server_certificate`. The `test_tofu_connect_failure_raises_transport_error`
test only proves the *connection-refused* case (a closed port returns fast).
A black-holed / firewall-drop host (packets silently dropped) makes the
underlying `socket.create_connection` hang on the OS default connect timeout,
which can be minutes, with the event loop frozen the whole time. A caller that
wrapped `connect` in `asyncio.wait_for(...)` cannot cancel it because the block
is in synchronous C, not at an `await` point.
The other TLS branches (`ca_file`, `require_certificate_validation`) build the
channel lazily and return immediately, so only the lenient default — the most
common path — has this hazard.
**Recommendation:** Pass an explicit `timeout=` to `ssl.get_server_certificate`
(it accepts one), bounded by `options.call_timeout` or a short fixed value, so a
black-holed host fails fast as a `MxGatewayTransportError` instead of hanging.
Better, run the synchronous probe off the event loop — make the TOFU pre-fetch
path awaitable (e.g. wrap it in `asyncio.get_running_loop().run_in_executor(...)`
from an `async` channel factory, or document that `connect` must not be called
from a running loop). Add a regression test that asserts the probe honours a
timeout.
**Resolution:** 2026-06-15 — Confirmed: the TOFU branch called
`ssl.get_server_certificate((host, port))` with no timeout from the synchronous
`create_channel`, which both `connect` classmethods invoked directly on the event
loop. Fix is two-part: (1) `create_channel` now passes
`timeout=options.call_timeout` (falling back to a fixed
`_TOFU_PROBE_TIMEOUT_SECONDS = 10.0` when no call_timeout is set) to
`ssl.get_server_certificate`, and the existing `except OSError` wraps a
timeout/connect failure into `MxGatewayTransportError` (TimeoutError/socket.timeout
are OSError subclasses); (2) both `GatewayClient.connect` and
`GalaxyRepositoryClient.connect` now run the blocking factory off the loop via
`await asyncio.to_thread(create_channel, resolved)`, so the event loop is never
frozen and a caller's `asyncio.wait_for` can cancel the connect. Tests:
`tests/test_auth_options.py::test_tofu_probe_passes_a_bounded_timeout`,
`::test_tofu_probe_timeout_raises_transport_error` (parametrized over
socket.timeout / TimeoutError / OSError), and
`tests/test_client_session.py::test_gateway_connect_runs_create_channel_off_the_event_loop`,
`::test_galaxy_connect_runs_create_channel_off_the_event_loop`. The timeout and
off-loop tests failed before the fix and pass after.
### Client.Python-029
| Field | Value |
|---|---|
| Severity | Low |
| Category | Correctness & logic bugs |
| Location | `clients/python/src/zb_mom_ww_mxgateway/options.py:78-90` |
| Status | Resolved |
**Description:** `_split_authority` parses a non-bracketed target with
`host, _, port = target.rpartition(":")` and returns
`(host or "localhost", int(port) if port else 443)`. For a port-less endpoint
such as `"mygateway"`, `rpartition(":")` returns `("", "", "mygateway")`, so
`host` becomes `""` (→ `"localhost"`) and `port` becomes `"mygateway"`, and
`int("mygateway")` raises an uncaught `ValueError: invalid literal for int()`.
Because `_split_authority` is called *before* the `try/except OSError` guard in
`create_channel`, the failure escapes as a raw `ValueError` rather than the
intended `MxGatewayTransportError`, and the message does not name the endpoint.
Verified at runtime:
`_split_authority("mygateway")``ValueError: invalid literal for int() with base 10: 'mygateway'`.
gRPC targets normally carry an explicit port (`host:port`), so impact is narrow,
but a typo or a bare-hostname endpoint produces a confusing crash on the TOFU
default path. The bracketed-IPv6 and `host:port` cases are covered by tests; the
port-less case is not.
**Recommendation:** Treat a non-numeric / missing port as the default (443) and
keep the whole string as the host, e.g. detect a trailing `:<digits>` explicitly
rather than assuming the `rpartition` tail is numeric, or wrap the `int(port)`
conversion so a non-numeric tail falls back to host-only with the default port.
Add a `_split_authority("mygateway")` case to `tests/test_tls.py`.
**Resolution:** 2026-06-15 — Confirmed: `_split_authority("mygateway")` raised
`ValueError: invalid literal for int() with base 10: 'mygateway'` because
`rpartition(":")` put the whole string in the port slot. Rewrote the
non-bracketed branch to inspect the `rpartition` separator and the tail: no colon
→ whole target is the host with default port 443; a colon with a non-digit/empty
tail → left side is the host with default port 443; a digit tail → parse the
port. The bare-hostname case now returns `("mygateway", 443)` instead of raising,
and the existing `":5120"` / `"localhost:5120"` / IPv6 cases are unchanged. Test:
`tests/test_tls.py::test_split_authority_defaults_port_for_portless_endpoint`
(covers `"mygateway"`, `"https://mygateway"`, and `"mygateway:"`) — failed before
the fix and passes after.
### Client.Python-030
| Field | Value |
|---|---|
| Severity | Low |
| Category | Code organization & conventions |
| Location | `clients/python/pyproject.toml:17` |
| Status | Resolved |
**Description:** This commit re-adds a `license` key to `pyproject.toml` as the
table form `license = { text = "Proprietary" }`. Under PEP 639 (active in the
installed setuptools 82.0.1), the `[project.license]` **table** forms (`text` and
`file`) are deprecated in favour of the SPDX string expression, and a future
setuptools major may reject them — the same class of regression that
Client.Python-018 (the earlier `license = "Proprietary"` string, rejected because
`Proprietary` is not a valid SPDX identifier) recorded for this exact field. The
build currently succeeds (verified: `python -m pip wheel .` produces
`zb_mom_ww_mxaccess_gateway_client-0.1.0-py3-none-any.whl` and the metadata
carries `License: Proprietary` plus the `License :: Other/Proprietary License`
classifier), so this is a forward-looking maintainability flag, not a present
breakage. Note that pairing a `license` table with a `License ::` trove
classifier is also flagged by PyPI/twine as redundant under the new metadata
rules.
**Recommendation:** Prefer the PEP 639 SPDX-string form with a `LicenseRef-*`
custom identifier for an unlisted licence (`license = "LicenseRef-Proprietary"`)
— this is the future-proof equivalent of the intent and avoids the deprecated
table form — or drop the `license` key entirely and rely on the existing
`License :: Other/Proprietary License` classifier (the Client.Python-018
resolution chose this). The `tests/test_packaging.py::test_pip_wheel_build_succeeds`
guard (added under Client.Python-020) will catch the day a setuptools upgrade
turns the deprecation into a hard error.
**Resolution:** 2026-06-15 — Switched the deprecated `license = { text =
"Proprietary" }` table form to the PEP 639 SPDX-string form
`license = "LicenseRef-Proprietary"` (the future-proof custom identifier for an
unlisted/proprietary licence). Also removed the now-redundant
`License :: Other/Proprietary License` trove classifier, which setuptools >= 77
flags as conflicting when a `License-Expression` is present. The built wheel
metadata now carries `License-Expression: LicenseRef-Proprietary` and no
`Classifier: License ::` line. Verified by `python -m pip wheel . --no-deps`,
which builds cleanly; the existing
`tests/test_packaging.py::test_pip_wheel_build_succeeds` guard exercises the same
build and passes.
### Client.Python-031
| Field | Value |
|---|---|
| Severity | Low |
| Category | Testing coverage |
| Location | `clients/python/tests/test_tls.py:34`, `clients/python/pyproject.toml:53-56` |
| Status | Resolved |
**Description:** `tests/test_tls.py` applies a module-level
`pytestmark = pytest.mark.tls`, but the `tls` marker is not registered in
`[tool.pytest.ini_options]` (which declares only `addopts`, `pythonpath`, and
`testpaths`). Every run emits a `PytestUnknownMarkWarning: Unknown
pytest.mark.tls - is this a typo?`. The warning is benign today, but (a) it is
exactly the kind of typo the warning exists to catch, so a future genuine
mistyped marker would be lost in the noise, and (b) if the suite ever adopts
`filterwarnings = ["error"]` (a common hardening step), the unregistered marker
would turn into a hard collection failure.
**Recommendation:** Register the marker, e.g.
`markers = ["tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1"]`
under `[tool.pytest.ini_options]` in `clients/python/pyproject.toml`.
**Resolution:** 2026-06-15 — Registered the `tls` marker by adding
`markers = ["tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1"]`
under `[tool.pytest.ini_options]` in `clients/python/pyproject.toml`.
`python -m pytest` now reports no `PytestUnknownMarkWarning` (full run: 91
passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked
`tests/test_tls.py` module is the guard — its run is now warning-free.