From 47062c1a6eacdf87e6f8ed577a551f86251c7d11 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 15 Jun 2026 02:39:11 -0400 Subject: [PATCH] fix(client/python): reachable cert-validation flag; bounded off-loop TOFU probe; license/marker fixes (Client.Python-027..031) --- clients/python/README.md | 13 +- clients/python/pyproject.toml | 6 +- .../python/src/zb_mom_ww_mxgateway/client.py | 6 +- .../python/src/zb_mom_ww_mxgateway/galaxy.py | 6 +- .../python/src/zb_mom_ww_mxgateway/options.py | 25 +- .../src/zb_mom_ww_mxgateway_cli/commands.py | 8 + clients/python/tests/test_auth_options.py | 52 +++- clients/python/tests/test_cli.py | 65 +++++ clients/python/tests/test_client_session.py | 98 +++++++ clients/python/tests/test_tls.py | 11 + code-reviews/Client.Python/findings.md | 273 +++++++++++++++++- 11 files changed, 550 insertions(+), 13 deletions(-) diff --git a/clients/python/README.md b/clients/python/README.md index 70b59d9..ed16b36 100644 --- a/clients/python/README.md +++ b/clients/python/README.md @@ -238,7 +238,11 @@ left `False`, the client fetches the gateway's presented certificate once to `localhost` (the generated certificate always carries a `localhost` SAN) when none was supplied. To verify instead, pass `ca_file` to verify against a specific CA, or set `require_certificate_validation=True` to verify against the system -trust roots. See +trust roots. The strict posture is reachable through every documented entry +point: the `require_certificate_validation=True` keyword on +`GatewayClient.connect(...)` / `GalaxyRepositoryClient.connect(...)`, the +`ClientOptions(require_certificate_validation=True)` struct, and the +`--require-certificate-validation` CLI flag. See [Gateway Configuration](../../docs/GatewayConfiguration.md#automatic-self-signed-certificate). ## CLI @@ -267,6 +271,13 @@ Use TLS options for a secured gateway: mxgw-py smoke --endpoint mxgateway.example.local:5001 --tls --ca-file C:\certs\mxgateway-ca.pem --server-name-override mxgateway.example.local --api-key-env MXGATEWAY_API_KEY --item Object.Attribute --json ``` +To force certificate validation against the system trust store instead of the +lenient trust-on-first-use default, add `--require-certificate-validation`: + +```powershell +mxgw-py smoke --endpoint mxgateway.example.local:5001 --tls --require-certificate-validation --api-key-env MXGATEWAY_API_KEY --item Object.Attribute --json +``` + ## Integration Checks Run live checks only when a gateway and MXAccess-backed worker are available: diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index eafb170..913b6c9 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -16,11 +16,10 @@ dependencies = [ authors = [ { name = "Joseph Doherty" }, ] -license = { text = "Proprietary" } +license = "LicenseRef-Proprietary" keywords = ["mxaccess", "mxgateway", "grpc", "client", "archestra"] classifiers = [ "Development Status :: 3 - Alpha", - "License :: Other/Proprietary License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", @@ -54,3 +53,6 @@ where = ["src"] addopts = "-ra" pythonpath = ["src"] testpaths = ["tests"] +markers = [ + "tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1", +] diff --git a/clients/python/src/zb_mom_ww_mxgateway/client.py b/clients/python/src/zb_mom_ww_mxgateway/client.py index e271788..1662a53 100644 --- a/clients/python/src/zb_mom_ww_mxgateway/client.py +++ b/clients/python/src/zb_mom_ww_mxgateway/client.py @@ -40,6 +40,7 @@ class GatewayClient: api_key: str | None = None, plaintext: bool = False, ca_file: str | None = None, + require_certificate_validation: bool = False, server_name_override: str | None = None, stub: Any | None = None, ) -> "GatewayClient": @@ -50,13 +51,16 @@ class GatewayClient: api_key=api_key, plaintext=plaintext, ca_file=ca_file, + require_certificate_validation=require_certificate_validation, server_name_override=server_name_override, ) if stub is not None: return cls(options=resolved, stub=stub) - channel = create_channel(resolved) + # create_channel may perform a blocking TLS certificate probe (TOFU + # default); run it off the event loop so connect never freezes the loop. + channel = await asyncio.to_thread(create_channel, resolved) return cls( options=resolved, stub=pb_grpc.MxAccessGatewayStub(channel), diff --git a/clients/python/src/zb_mom_ww_mxgateway/galaxy.py b/clients/python/src/zb_mom_ww_mxgateway/galaxy.py index 2195f8b..33fdb03 100644 --- a/clients/python/src/zb_mom_ww_mxgateway/galaxy.py +++ b/clients/python/src/zb_mom_ww_mxgateway/galaxy.py @@ -52,6 +52,7 @@ class GalaxyRepositoryClient: api_key: str | None = None, plaintext: bool = False, ca_file: str | None = None, + require_certificate_validation: bool = False, server_name_override: str | None = None, stub: Any | None = None, ) -> "GalaxyRepositoryClient": @@ -62,13 +63,16 @@ class GalaxyRepositoryClient: api_key=api_key, plaintext=plaintext, ca_file=ca_file, + require_certificate_validation=require_certificate_validation, server_name_override=server_name_override, ) if stub is not None: return cls(options=resolved, stub=stub) - channel = create_channel(resolved) + # create_channel may perform a blocking TLS certificate probe (TOFU + # default); run it off the event loop so connect never freezes the loop. + channel = await asyncio.to_thread(create_channel, resolved) return cls( options=resolved, stub=galaxy_pb_grpc.GalaxyRepositoryStub(channel), diff --git a/clients/python/src/zb_mom_ww_mxgateway/options.py b/clients/python/src/zb_mom_ww_mxgateway/options.py index 29caf29..f514c99 100644 --- a/clients/python/src/zb_mom_ww_mxgateway/options.py +++ b/clients/python/src/zb_mom_ww_mxgateway/options.py @@ -12,6 +12,10 @@ import grpc from .auth import REDACTED, ApiKey from .errors import MxGatewayTransportError +# Fallback bound for the TOFU certificate probe when no call_timeout is set, so a +# black-holed host fails fast instead of hanging on the OS default connect timeout. +_TOFU_PROBE_TIMEOUT_SECONDS = 10.0 + @dataclass(frozen=True) class ClientOptions: @@ -88,8 +92,17 @@ def _split_authority(endpoint: str) -> tuple[str, int]: remainder = target[bracket_end + 1 :] # ":5120" or "" port_str = remainder.lstrip(":") return (host, int(port_str) if port_str else 443) - host, _, port = target.rpartition(":") - return (host or "localhost", int(port) if port else 443) + host, sep, port = target.rpartition(":") + if not sep: + # No colon at all (e.g. a bare hostname "mygateway"): the whole target + # is the host; default the port rather than raising on int("mygateway"). + return (target or "localhost", 443) + if not port.isdigit(): + # A colon with a non-numeric / empty tail (e.g. a trailing ":") is not + # an explicit port — keep the left side as the host and default the + # port so a typo cannot raise an uncaught ValueError on the TOFU path. + return (host or "localhost", 443) + return (host or "localhost", int(port)) def create_channel(options: ClientOptions) -> grpc.aio.Channel: @@ -120,9 +133,15 @@ def create_channel(options: ClientOptions) -> grpc.aio.Channel: else: # Lenient default: grpc-python has no per-channel skip-verify, so fetch the # server's certificate (unverified) and pin it for this channel (TOFU). + # The probe opens a real blocking TCP+TLS socket, so it MUST be bounded — + # a black-holed / firewall-drop host would otherwise hang on the OS default + # connect timeout (minutes). Bound it by call_timeout (or a short fixed + # fallback) so the dial fails fast as a transport error. The async + # `connect` classmethods run this off the event loop (asyncio.to_thread). host, port = _split_authority(options.endpoint) + probe_timeout = options.call_timeout if options.call_timeout else _TOFU_PROBE_TIMEOUT_SECONDS try: - presented = ssl.get_server_certificate((host, port)) + presented = ssl.get_server_certificate((host, port), timeout=probe_timeout) except OSError as error: raise MxGatewayTransportError( f"failed to fetch TLS certificate from {options.endpoint}: {error}" diff --git a/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py b/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py index 21929d7..0c3b0b6 100644 --- a/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py +++ b/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py @@ -170,6 +170,13 @@ def gateway_options(command: Callable[..., Any]) -> Callable[..., Any]: command = click.option("--plaintext", is_flag=True, help="Use plaintext gRPC.")(command) command = click.option("--tls", "use_tls", is_flag=True, help="Use TLS gRPC.")(command) command = click.option("--ca-file", default=None, help="Custom root certificate file.")(command) + command = click.option( + "--require-certificate-validation", + "require_certificate_validation", + is_flag=True, + help="Verify the TLS certificate against the system trust store " + "instead of the lenient trust-on-first-use default.", + )(command) command = click.option( "--server-name-override", default=None, @@ -923,6 +930,7 @@ async def _connect(kwargs: dict[str, Any]) -> GatewayClient: api_key=api_key, plaintext=_use_plaintext(kwargs), ca_file=kwargs.get("ca_file"), + require_certificate_validation=bool(kwargs.get("require_certificate_validation")), server_name_override=kwargs.get("server_name_override"), call_timeout=kwargs.get("call_timeout"), stream_timeout=kwargs.get("stream_timeout"), diff --git a/clients/python/tests/test_auth_options.py b/clients/python/tests/test_auth_options.py index fa6c36c..5330535 100644 --- a/clients/python/tests/test_auth_options.py +++ b/clients/python/tests/test_auth_options.py @@ -1,9 +1,12 @@ """Tests for auth metadata and connection options.""" +import socket + import pytest from zb_mom_ww_mxgateway.auth import REDACTED, ApiKey, auth_metadata, redact_secret from zb_mom_ww_mxgateway import options as options_module +from zb_mom_ww_mxgateway.errors import MxGatewayTransportError from zb_mom_ww_mxgateway.options import ClientOptions, create_channel @@ -80,7 +83,9 @@ def test_create_channel_uses_tls_channel_tofu_default(monkeypatch: pytest.Monkey _DUMMY_PEM = "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n" get_cert_calls: list[tuple[str, int]] = [] - def fake_get_server_certificate(addr: tuple[str, int]) -> str: + def fake_get_server_certificate( + addr: tuple[str, int], *, timeout: float | None = None + ) -> str: get_cert_calls.append(addr) return _DUMMY_PEM @@ -133,7 +138,7 @@ def test_create_channel_uses_tls_channel_tofu_respects_server_name_override( monkeypatch.setattr( options_module.ssl, "get_server_certificate", - lambda addr: _DUMMY_PEM, + lambda addr, *, timeout=None: _DUMMY_PEM, ) cred_calls: list[object] = [] @@ -276,3 +281,46 @@ def test_create_channel_uses_tls_channel_ca_file( ], ), ] + + +def test_tofu_probe_passes_a_bounded_timeout(monkeypatch: pytest.MonkeyPatch) -> None: + """The TOFU cert pre-fetch must be bounded so a black-holed host fails fast.""" + captured: dict[str, object] = {} + + def fake_get_server_certificate(addr: object, *, timeout: float | None = None) -> str: + captured["timeout"] = timeout + return "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n" + + monkeypatch.setattr(options_module.ssl, "get_server_certificate", fake_get_server_certificate) + monkeypatch.setattr(options_module.grpc, "ssl_channel_credentials", lambda **_: "creds") + monkeypatch.setattr( + options_module.grpc.aio, + "secure_channel", + lambda endpoint, credentials, *, options: "tls-channel", + ) + + create_channel(ClientOptions(endpoint="gateway.example:5001", call_timeout=7.5)) + + # A finite, positive timeout must be supplied (bounded by call_timeout here). + assert isinstance(captured["timeout"], (int, float)) + assert 0 < captured["timeout"] <= 7.5 + + +@pytest.mark.parametrize( + "raised", + [socket.timeout("timed out"), TimeoutError("timed out"), OSError("connection refused")], +) +def test_tofu_probe_timeout_raises_transport_error( + monkeypatch: pytest.MonkeyPatch, raised: Exception +) -> None: + """A timed-out / failed probe surfaces as MxGatewayTransportError, not a raw error.""" + + def fake_get_server_certificate(addr: object, *, timeout: float | None = None) -> str: + raise raised + + monkeypatch.setattr(options_module.ssl, "get_server_certificate", fake_get_server_certificate) + + options = ClientOptions(endpoint="gateway.example:5001") + with pytest.raises(MxGatewayTransportError) as excinfo: + create_channel(options) + assert options.endpoint in str(excinfo.value) diff --git a/clients/python/tests/test_cli.py b/clients/python/tests/test_cli.py index 1e0e72e..ce4025f 100644 --- a/clients/python/tests/test_cli.py +++ b/clients/python/tests/test_cli.py @@ -2,14 +2,79 @@ import json +import pytest from click.testing import CliRunner from zb_mom_ww_mxgateway import __version__ +from zb_mom_ww_mxgateway_cli import commands as commands_module from zb_mom_ww_mxgateway_cli.commands import main _BATCH_EOR = "__MXGW_BATCH_EOR__" +def test_require_certificate_validation_flag_flows_through_connect( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """The --require-certificate-validation CLI flag must reach ClientOptions (Client.Python-027).""" + captured: dict[str, object] = {} + + async def fake_connect(options, **_kwargs): + captured["options"] = options + # Return a minimal object that supports the async context-manager protocol + # used by every CLI command body (async with await _connect(...) as client). + return _FakeAsyncClient() + + monkeypatch.setattr(commands_module.GatewayClient, "connect", fake_connect) + + result = CliRunner().invoke( + main, + [ + "open-session", + "--endpoint", + "gateway.example:5001", + "--require-certificate-validation", + "--json", + ], + ) + + assert result.exit_code == 0, result.output + assert captured["options"].require_certificate_validation is True + + +def test_require_certificate_validation_defaults_off(monkeypatch: pytest.MonkeyPatch) -> None: + """Without the flag the strict-validation posture stays off (TOFU default).""" + captured: dict[str, object] = {} + + async def fake_connect(options, **_kwargs): + captured["options"] = options + return _FakeAsyncClient() + + monkeypatch.setattr(commands_module.GatewayClient, "connect", fake_connect) + + result = CliRunner().invoke( + main, + ["open-session", "--endpoint", "gateway.example:5001", "--plaintext", "--json"], + ) + + assert result.exit_code == 0, result.output + assert captured["options"].require_certificate_validation is False + + +class _FakeAsyncClient: + """Minimal async-context-manager fake satisfying the open-session command body.""" + + async def __aenter__(self) -> "_FakeAsyncClient": + return self + + async def __aexit__(self, *_exc: object) -> None: + return None + + async def open_session_raw(self, *_args, **_kwargs): + from zb_mom_ww_mxgateway.generated import mxaccess_gateway_pb2 as pb + + return pb.OpenSessionReply(session_id="cli-test-session") + + def test_version_json_is_deterministic() -> None: runner = CliRunner() diff --git a/clients/python/tests/test_client_session.py b/clients/python/tests/test_client_session.py index 6a2af0e..ec00524 100644 --- a/clients/python/tests/test_client_session.py +++ b/clients/python/tests/test_client_session.py @@ -8,9 +8,107 @@ from typing import Any import pytest from zb_mom_ww_mxgateway import ClientOptions, GatewayClient, MxAccessError +from zb_mom_ww_mxgateway import client as client_module +from zb_mom_ww_mxgateway import galaxy as galaxy_module +from zb_mom_ww_mxgateway.galaxy import GalaxyRepositoryClient from zb_mom_ww_mxgateway.generated import mxaccess_gateway_pb2 as pb +@pytest.mark.asyncio +async def test_gateway_connect_forwards_require_certificate_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """The connect convenience kwarg must reach ClientOptions (Client.Python-027).""" + captured: dict[str, Any] = {} + + def fake_create_channel(options: ClientOptions) -> object: + captured["options"] = options + return object() + + monkeypatch.setattr(client_module, "create_channel", fake_create_channel) + monkeypatch.setattr(client_module.pb_grpc, "MxAccessGatewayStub", lambda channel: object()) + + await GatewayClient.connect( + endpoint="gateway.example:5001", + require_certificate_validation=True, + ) + + assert captured["options"].require_certificate_validation is True + + +@pytest.mark.asyncio +async def test_galaxy_connect_forwards_require_certificate_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """GalaxyRepositoryClient.connect must thread the flag too (Client.Python-027).""" + captured: dict[str, Any] = {} + + def fake_create_channel(options: ClientOptions) -> object: + captured["options"] = options + return object() + + monkeypatch.setattr(galaxy_module, "create_channel", fake_create_channel) + monkeypatch.setattr( + galaxy_module.galaxy_pb_grpc, "GalaxyRepositoryStub", lambda channel: object() + ) + + await GalaxyRepositoryClient.connect( + endpoint="gateway.example:5001", + require_certificate_validation=True, + ) + + assert captured["options"].require_certificate_validation is True + + +@pytest.mark.asyncio +async def test_gateway_connect_runs_create_channel_off_the_event_loop( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """connect must run the blocking channel factory off the loop (Client.Python-028).""" + ran_in_thread: dict[str, bool] = {} + + def fake_create_channel(options: ClientOptions) -> object: + # If this runs on the event loop thread, get_running_loop() succeeds. + try: + asyncio.get_running_loop() + ran_in_thread["off_loop"] = False + except RuntimeError: + ran_in_thread["off_loop"] = True + return object() + + monkeypatch.setattr(client_module, "create_channel", fake_create_channel) + monkeypatch.setattr(client_module.pb_grpc, "MxAccessGatewayStub", lambda channel: object()) + + await GatewayClient.connect(endpoint="gateway.example:5001") + + assert ran_in_thread["off_loop"] is True + + +@pytest.mark.asyncio +async def test_galaxy_connect_runs_create_channel_off_the_event_loop( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """GalaxyRepositoryClient.connect must also run the probe off the loop (Client.Python-028).""" + ran_in_thread: dict[str, bool] = {} + + def fake_create_channel(options: ClientOptions) -> object: + try: + asyncio.get_running_loop() + ran_in_thread["off_loop"] = False + except RuntimeError: + ran_in_thread["off_loop"] = True + return object() + + monkeypatch.setattr(galaxy_module, "create_channel", fake_create_channel) + monkeypatch.setattr( + galaxy_module.galaxy_pb_grpc, "GalaxyRepositoryStub", lambda channel: object() + ) + + await GalaxyRepositoryClient.connect(endpoint="gateway.example:5001") + + assert ran_in_thread["off_loop"] is True + + @pytest.mark.asyncio async def test_session_helpers_send_auth_metadata_and_preserve_raw_replies() -> None: stub = FakeGatewayStub() diff --git a/clients/python/tests/test_tls.py b/clients/python/tests/test_tls.py index d7c1c4b..e24ef51 100644 --- a/clients/python/tests/test_tls.py +++ b/clients/python/tests/test_tls.py @@ -134,6 +134,17 @@ def test_split_authority_parses_host_and_port() -> None: assert _split_authority(":5120") == ("localhost", 5120) +def test_split_authority_defaults_port_for_portless_endpoint() -> None: + from zb_mom_ww_mxgateway.options import _split_authority + + # A bare hostname (no ":port") must default to 443, not crash on int("mygateway"). + assert _split_authority("mygateway") == ("mygateway", 443) + # Scheme-prefixed bare hostname behaves the same. + assert _split_authority("https://mygateway") == ("mygateway", 443) + # A non-numeric tail after a colon is treated as no explicit port. + assert _split_authority("mygateway:") == ("mygateway", 443) + + def test_split_authority_strips_ipv6_brackets() -> None: from zb_mom_ww_mxgateway.options import _split_authority diff --git a/code-reviews/Client.Python/findings.md b/code-reviews/Client.Python/findings.md index b4a83f5..1fa1159 100644 --- a/code-reviews/Client.Python/findings.md +++ b/code-reviews/Client.Python/findings.md @@ -4,16 +4,48 @@ |---|---| | Module | `clients/python` | | Reviewer | Claude Code | -| Review date | 2026-05-24 | -| Commit reviewed | `42b0037` | +| Review date | 2026-06-15 | +| Commit reviewed | `410acc9` | | Status | Re-reviewed | | Open findings | 0 | ## Checklist coverage +### 2026-06-15 re-review (commit 410acc9) + +Re-review pass at `410acc9`. The diff against the previous review base +`42b0037` covers: PyPI metadata + Gitea PyPI-feed install instructions in +`pyproject.toml` / `README.md`; a new lazy Galaxy browse surface +(`GalaxyRepositoryClient.browse_children_raw` / `browse` / `_iter_browse_children`, +the `LazyBrowseNode` walker, and `BrowseChildrenOptions`); a TLS +trust-on-first-use (TOFU) default in `options.py` gated by a new +`ClientOptions.require_certificate_validation` flag; the `_use_plaintext` +TLS-default contract carried forward; and the `batch` `CliRunner`-removal +follow-through. The new browse / TOFU surface is well tested +(`tests/test_galaxy.py`, `tests/test_auth_options.py`, `tests/test_tls.py`). + +`python -m pytest` passes (80 passed, 1 skipped — the loopback-TLS test is +opt-in via `MXGATEWAY_RUN_TLS_TESTS=1`). `python -m pip wheel .` builds the +wheel cleanly against the installed setuptools 82.0.1. + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | Issue found: `_split_authority` raises an uncaught `ValueError` for a port-less endpoint instead of a transport error (Client.Python-029). | +| 2 | mxaccessgw conventions | No new issues found — secrets still redacted, generated code untouched, no committed tokens in the new Gitea feed URLs (placeholders only). | +| 3 | Concurrency & thread safety | No new issues found — `LazyBrowseNode.expand` uses a per-node `asyncio.Lock` with a double-checked guard and is verified concurrent-safe by `test_browse_expand_concurrent_callers_only_fire_one_rpc`. | +| 4 | Error handling & resilience | Issue found: the TOFU branch calls the blocking `ssl.get_server_certificate` with no timeout from inside the `async def connect` path, blocking the event loop and hanging indefinitely on a black-holed host (Client.Python-028). | +| 5 | Security | Issue found: the new `require_certificate_validation` security flag is not reachable through the documented `connect(...)` convenience kwargs or any CLI flag, so callers using those paths are locked into TOFU and cannot force certificate validation (Client.Python-027). TOFU itself is design-sanctioned (`docs/GatewayConfiguration.md` line 470). | +| 6 | Performance & resource management | No new issues found beyond the blocking TLS probe captured in Client.Python-028. | +| 7 | Design-document adherence | No new issues found — TOFU default, `require_certificate_validation` naming, and the BrowseChildren surface match `docs/GatewayConfiguration.md` / `docs/GalaxyRepository.md`; both README doc anchors resolve. | +| 8 | Code organization & conventions | Issue found: `pyproject.toml` uses the PEP 639-deprecated `license = { text = ... }` table form (Client.Python-030). pyproject metadata is otherwise correct and the wheel builds. | +| 9 | Testing coverage | Issue found: the `tls` pytest mark used by `tests/test_tls.py` is not registered in `[tool.pytest.ini_options]`, emitting a `PytestUnknownMarkWarning` (Client.Python-031). New browse / TOFU paths are otherwise well covered. | +| 10 | Documentation & comments | No new issues found — README TLS/browse/Gitea-feed prose matches the code; the alarm-CLI README examples corrected under Client.Python-022 remain correct. | + +### Prior coverage (commit a020350) + A re-review at commit `a020350` over the same module. Prior findings (Client.Python-001 — Client.Python-017) remain closed and are kept as -history. This section reflects categories evaluated in this pass. +history. This section reflects categories evaluated in that pass. | # | Category | Result | |---|---|---| @@ -1171,3 +1203,238 @@ scope; `test_commands_module_bench_read_bulk_does_not_use_bare_except_pass` greps the function source for the `except Exception:\n pass` pattern and rejects it. Both tests failed against the pre-fix source and pass against the fix. + +### Client.Python-027 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Security | +| Location | `clients/python/src/zb_mom_ww_mxgateway/client.py:36-54`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:47-66`, `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:165-172,918-930` | +| Status | Resolved | + +**Description:** This commit adds `ClientOptions.require_certificate_validation` +(default `False`) so a caller can force system-trust certificate verification +instead of the new lenient trust-on-first-use (TOFU) default. The flag is +honoured inside `create_channel`, but it is not surfaced through either of the +two documented ways a normal caller dials the gateway: + +1. `GatewayClient.connect(...)` and `GalaxyRepositoryClient.connect(...)` accept + the convenience kwargs `endpoint` / `api_key` / `plaintext` / `ca_file` / + `server_name_override` and build the `ClientOptions` internally, but do **not** + accept or forward `require_certificate_validation`. The README's high-level + examples (e.g. the lazy-browse walker) use exactly this kwarg form + (`GalaxyRepositoryClient.connect(endpoint=..., api_key=..., plaintext=True)`), + so the kwarg path is the primary documented entry point. +2. The CLI exposes `--plaintext`, `--tls`, and `--ca-file` but no + `--require-certificate-validation` flag, and `_connect` constructs + `ClientOptions(...)` without setting the field. A CLI user connecting to a + TLS gateway is therefore locked into TOFU. + +The net effect is that the *only* way to opt into real certificate validation is +to construct a `ClientOptions` instance directly and pass it as the positional +`options=` argument — a path neither the README nor the CLI documents. A +security-sensitive deployment that wants the strict (verify-against-system-trust) +posture cannot select it through the documented surface, so it silently stays on +TOFU. TOFU itself is design-sanctioned (`docs/GatewayConfiguration.md` line 470 +explicitly says "Python uses trust-on-first-use"), so this is an opt-in-to-strict +reachability gap rather than an insecure default — hence Medium with a workaround. + +**Recommendation:** Add a `require_certificate_validation: bool = False` kwarg to +both `GatewayClient.connect` and `GalaxyRepositoryClient.connect` and forward it +into the constructed `ClientOptions`. Add a `--require-certificate-validation` +(or `--verify-tls`) flag to the shared CLI option set and wire it through +`_connect`. Add a test asserting the flag flows through to +`ClientOptions.require_certificate_validation` and a README note documenting how +to select the strict posture. + +**Resolution:** 2026-06-15 — Confirmed: `connect` built `ClientOptions` from a +fixed kwarg set that omitted `require_certificate_validation`, and the CLI had no +flag, so the strict posture was only reachable via a hand-built `options=`. Added +a `require_certificate_validation: bool = False` kwarg to both +`GatewayClient.connect` and `GalaxyRepositoryClient.connect` (forwarded into the +constructed `ClientOptions`), a `--require-certificate-validation` flag to the +shared `gateway_options` CLI option set, and wired it through `_connect`. README +TLS section now documents the strict posture is reachable via the connect kwarg, +the options struct, and the CLI flag. Tests: +`tests/test_client_session.py::test_gateway_connect_forwards_require_certificate_validation`, +`::test_galaxy_connect_forwards_require_certificate_validation`, +`tests/test_cli.py::test_require_certificate_validation_flag_flows_through_connect`, +`::test_require_certificate_validation_defaults_off` — all failed before the fix +and pass after. + +### Client.Python-028 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Error handling & resilience | +| Location | `clients/python/src/zb_mom_ww_mxgateway/options.py:120-130`, `clients/python/src/zb_mom_ww_mxgateway/client.py:59`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:71` | +| Status | Resolved | + +**Description:** The TOFU branch of `create_channel` calls +`ssl.get_server_certificate((host, port))` to pre-fetch the server certificate. +`create_channel` is a synchronous function, but it is invoked exclusively from +inside the `async def connect` classmethods of `GatewayClient` and +`GalaxyRepositoryClient` (`client.py:59`, `galaxy.py:71`). `ssl.get_server_certificate` +opens a real blocking TCP+TLS socket on the calling thread, so: + +1. It **blocks the asyncio event loop** for the full duration of the connect/handshake. + This is at odds with the rest of the client, which is fully `async`. +2. It passes **no `timeout`** to `ssl.get_server_certificate`. The `test_tofu_connect_failure_raises_transport_error` + test only proves the *connection-refused* case (a closed port returns fast). + A black-holed / firewall-drop host (packets silently dropped) makes the + underlying `socket.create_connection` hang on the OS default connect timeout, + which can be minutes, with the event loop frozen the whole time. A caller that + wrapped `connect` in `asyncio.wait_for(...)` cannot cancel it because the block + is in synchronous C, not at an `await` point. + +The other TLS branches (`ca_file`, `require_certificate_validation`) build the +channel lazily and return immediately, so only the lenient default — the most +common path — has this hazard. + +**Recommendation:** Pass an explicit `timeout=` to `ssl.get_server_certificate` +(it accepts one), bounded by `options.call_timeout` or a short fixed value, so a +black-holed host fails fast as a `MxGatewayTransportError` instead of hanging. +Better, run the synchronous probe off the event loop — make the TOFU pre-fetch +path awaitable (e.g. wrap it in `asyncio.get_running_loop().run_in_executor(...)` +from an `async` channel factory, or document that `connect` must not be called +from a running loop). Add a regression test that asserts the probe honours a +timeout. + +**Resolution:** 2026-06-15 — Confirmed: the TOFU branch called +`ssl.get_server_certificate((host, port))` with no timeout from the synchronous +`create_channel`, which both `connect` classmethods invoked directly on the event +loop. Fix is two-part: (1) `create_channel` now passes +`timeout=options.call_timeout` (falling back to a fixed +`_TOFU_PROBE_TIMEOUT_SECONDS = 10.0` when no call_timeout is set) to +`ssl.get_server_certificate`, and the existing `except OSError` wraps a +timeout/connect failure into `MxGatewayTransportError` (TimeoutError/socket.timeout +are OSError subclasses); (2) both `GatewayClient.connect` and +`GalaxyRepositoryClient.connect` now run the blocking factory off the loop via +`await asyncio.to_thread(create_channel, resolved)`, so the event loop is never +frozen and a caller's `asyncio.wait_for` can cancel the connect. Tests: +`tests/test_auth_options.py::test_tofu_probe_passes_a_bounded_timeout`, +`::test_tofu_probe_timeout_raises_transport_error` (parametrized over +socket.timeout / TimeoutError / OSError), and +`tests/test_client_session.py::test_gateway_connect_runs_create_channel_off_the_event_loop`, +`::test_galaxy_connect_runs_create_channel_off_the_event_loop`. The timeout and +off-loop tests failed before the fix and pass after. + +### Client.Python-029 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `clients/python/src/zb_mom_ww_mxgateway/options.py:78-90` | +| Status | Resolved | + +**Description:** `_split_authority` parses a non-bracketed target with +`host, _, port = target.rpartition(":")` and returns +`(host or "localhost", int(port) if port else 443)`. For a port-less endpoint +such as `"mygateway"`, `rpartition(":")` returns `("", "", "mygateway")`, so +`host` becomes `""` (→ `"localhost"`) and `port` becomes `"mygateway"`, and +`int("mygateway")` raises an uncaught `ValueError: invalid literal for int()`. +Because `_split_authority` is called *before* the `try/except OSError` guard in +`create_channel`, the failure escapes as a raw `ValueError` rather than the +intended `MxGatewayTransportError`, and the message does not name the endpoint. +Verified at runtime: +`_split_authority("mygateway")` → `ValueError: invalid literal for int() with base 10: 'mygateway'`. +gRPC targets normally carry an explicit port (`host:port`), so impact is narrow, +but a typo or a bare-hostname endpoint produces a confusing crash on the TOFU +default path. The bracketed-IPv6 and `host:port` cases are covered by tests; the +port-less case is not. + +**Recommendation:** Treat a non-numeric / missing port as the default (443) and +keep the whole string as the host, e.g. detect a trailing `:` explicitly +rather than assuming the `rpartition` tail is numeric, or wrap the `int(port)` +conversion so a non-numeric tail falls back to host-only with the default port. +Add a `_split_authority("mygateway")` case to `tests/test_tls.py`. + +**Resolution:** 2026-06-15 — Confirmed: `_split_authority("mygateway")` raised +`ValueError: invalid literal for int() with base 10: 'mygateway'` because +`rpartition(":")` put the whole string in the port slot. Rewrote the +non-bracketed branch to inspect the `rpartition` separator and the tail: no colon +→ whole target is the host with default port 443; a colon with a non-digit/empty +tail → left side is the host with default port 443; a digit tail → parse the +port. The bare-hostname case now returns `("mygateway", 443)` instead of raising, +and the existing `":5120"` / `"localhost:5120"` / IPv6 cases are unchanged. Test: +`tests/test_tls.py::test_split_authority_defaults_port_for_portless_endpoint` +(covers `"mygateway"`, `"https://mygateway"`, and `"mygateway:"`) — failed before +the fix and passes after. + +### Client.Python-030 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/python/pyproject.toml:17` | +| Status | Resolved | + +**Description:** This commit re-adds a `license` key to `pyproject.toml` as the +table form `license = { text = "Proprietary" }`. Under PEP 639 (active in the +installed setuptools 82.0.1), the `[project.license]` **table** forms (`text` and +`file`) are deprecated in favour of the SPDX string expression, and a future +setuptools major may reject them — the same class of regression that +Client.Python-018 (the earlier `license = "Proprietary"` string, rejected because +`Proprietary` is not a valid SPDX identifier) recorded for this exact field. The +build currently succeeds (verified: `python -m pip wheel .` produces +`zb_mom_ww_mxaccess_gateway_client-0.1.0-py3-none-any.whl` and the metadata +carries `License: Proprietary` plus the `License :: Other/Proprietary License` +classifier), so this is a forward-looking maintainability flag, not a present +breakage. Note that pairing a `license` table with a `License ::` trove +classifier is also flagged by PyPI/twine as redundant under the new metadata +rules. + +**Recommendation:** Prefer the PEP 639 SPDX-string form with a `LicenseRef-*` +custom identifier for an unlisted licence (`license = "LicenseRef-Proprietary"`) +— this is the future-proof equivalent of the intent and avoids the deprecated +table form — or drop the `license` key entirely and rely on the existing +`License :: Other/Proprietary License` classifier (the Client.Python-018 +resolution chose this). The `tests/test_packaging.py::test_pip_wheel_build_succeeds` +guard (added under Client.Python-020) will catch the day a setuptools upgrade +turns the deprecation into a hard error. + +**Resolution:** 2026-06-15 — Switched the deprecated `license = { text = +"Proprietary" }` table form to the PEP 639 SPDX-string form +`license = "LicenseRef-Proprietary"` (the future-proof custom identifier for an +unlisted/proprietary licence). Also removed the now-redundant +`License :: Other/Proprietary License` trove classifier, which setuptools >= 77 +flags as conflicting when a `License-Expression` is present. The built wheel +metadata now carries `License-Expression: LicenseRef-Proprietary` and no +`Classifier: License ::` line. Verified by `python -m pip wheel . --no-deps`, +which builds cleanly; the existing +`tests/test_packaging.py::test_pip_wheel_build_succeeds` guard exercises the same +build and passes. + +### Client.Python-031 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/python/tests/test_tls.py:34`, `clients/python/pyproject.toml:53-56` | +| Status | Resolved | + +**Description:** `tests/test_tls.py` applies a module-level +`pytestmark = pytest.mark.tls`, but the `tls` marker is not registered in +`[tool.pytest.ini_options]` (which declares only `addopts`, `pythonpath`, and +`testpaths`). Every run emits a `PytestUnknownMarkWarning: Unknown +pytest.mark.tls - is this a typo?`. The warning is benign today, but (a) it is +exactly the kind of typo the warning exists to catch, so a future genuine +mistyped marker would be lost in the noise, and (b) if the suite ever adopts +`filterwarnings = ["error"]` (a common hardening step), the unregistered marker +would turn into a hard collection failure. + +**Recommendation:** Register the marker, e.g. +`markers = ["tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1"]` +under `[tool.pytest.ini_options]` in `clients/python/pyproject.toml`. + +**Resolution:** 2026-06-15 — Registered the `tls` marker by adding +`markers = ["tls: loopback TLS tests, opt-in via MXGATEWAY_RUN_TLS_TESTS=1"]` +under `[tool.pytest.ini_options]` in `clients/python/pyproject.toml`. +`python -m pytest` now reports no `PytestUnknownMarkWarning` (full run: 91 +passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked +`tests/test_tls.py` module is the guard — its run is now warning-free.