Compare commits

..

2 Commits

Author SHA1 Message Date
Joseph Doherty dd7ca1634e Mark code-review findings Server-033..037 resolved
Records the resolutions for the five Galaxy snapshot-persistence findings
fixed in bdccdbf, and regenerates the code-reviews index. Server open
findings drop from 7 to 2 (Server-031, Server-032 remain — unrelated
event-channel backpressure findings).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 03:35:56 -04:00
Joseph Doherty bdccdbf6dd Resolve code-review findings Server-033..037
Server-033 (Medium): TryRestoreFromDiskAsync now completes the _firstLoad
gate once the restored snapshot is published, so a browse call racing the
first refresh is served immediately instead of waiting out the 5s bootstrap
budget while an unreachable-database query runs.

Server-034 (Low): GalaxyHierarchySnapshotStore.TryLoadAsync catches
JsonException / IOException / UnauthorizedAccessException and returns null,
honoring the Try contract for a corrupt or unreadable snapshot file.

Server-035 (Low): SaveAsync bounds the write with a linked CancellationToken
(CommandTimeoutSeconds budget) so a stuck disk cannot pin the refresh loop.

Server-036 (Low): PersistSnapshotAsync no longer logs a save cancelled by
gateway shutdown as a persistence failure.

Server-037 (Low): added cache tests for the corrupt-snapshot restore path
and for PersistSnapshot=false, plus a store test for corrupt JSON.

All 100 Galaxy tests pass; gateway builds clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 03:34:35 -04:00
6 changed files with 329 additions and 8 deletions
+10 -2
View File
@@ -17,7 +17,7 @@ Each module's `findings.md` is the source of truth; this file is generated from
| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 20 |
| [Contracts](Contracts/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 15 |
| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 21 |
| [Server](Server/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 30 |
| [Server](Server/findings.md) | Claude Code | 2026-05-22 | `fa491c7` | Reviewed | 2 | 37 |
| [Tests](Tests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 24 |
| [Worker](Worker/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 25 |
| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 30 |
@@ -26,7 +26,10 @@ Each module's `findings.md` is the source of truth; this file is generated from
Findings with status `Open` or `In Progress`, ordered by severity.
_No pending findings._
| ID | Severity | Category | Location | Description |
|---|---|---|---|---|
| Server-031 | Medium | Concurrency & thread safety | `src/MxGateway.Server/Workers/WorkerClient.cs:392-422` (gateway-side heartbeat watchdog); `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:588-617` (worker-side heartbeat loop); `src/MxGateway.Worker/Ipc/WorkerFrameWriter.cs:14,67-76` (shared `_writeLock`) | Surfaced during the 2026-05-20 cross-language e2e re-run against gateway `b794c46`. The .NET phase succeeded through `open-session`/`register`/`bulk-subscribe`/`bulk-read`/`bulk-unsubscribe`/`stream-events`/`write` but then failed on its t… |
| Server-032 | Medium | Error handling & resilience | `src/MxGateway.Server/Workers/WorkerClient.cs:70-77,463-484` (gateway-side `_events` channel); `src/MxGateway.Server/Configuration/EventOptions.cs:8` (default capacity 10,000); `src/MxGateway.Server/Grpc/EventStreamService.cs` (consumer) | Surfaced during the 2026-05-20 cross-language e2e re-run against gateway `b794c46`. The Java phase advised ~55 items (`item-handle 63`) before failing on the next `advise` call with the Server-030 diagnostic `Session ... is not ready. Sess… |
## Closed findings
@@ -94,6 +97,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Server-016 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Sessions/GatewaySession.cs:790-797`, `src/MxGateway.Server/Sessions/SessionManager.cs:237-258` |
| Server-021 | Medium | Resolved | Testing coverage | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:266-664`, `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs` |
| Server-030 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Sessions/GatewaySession.cs:952-980` |
| Server-033 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:265-323` (`TryRestoreFromDiskAsync`), `:84-99` (`_firstLoad` / `WaitForFirstLoadAsync`); `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:141-163` (`WaitForCacheBootstrap`) |
| Tests-003 | Medium | Resolved | Performance & resource management | `src/MxGateway.Tests/Security/Authentication/SqliteAuthStoreTests.cs:170-176`, `src/MxGateway.Tests/Security/Authentication/ApiKeyAdminCliRunnerTests.cs:252-258` |
| Tests-004 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs` |
| Tests-005 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/EventStreamServiceTests.cs:239-261`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` |
@@ -235,6 +239,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
| Server-027 | Low | Resolved | Design-document adherence | `docs/Authorization.md:120-141,176-181` |
| Server-028 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs:13-20`, `src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs` |
| Server-029 | Low | Resolved | Documentation & comments | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:52-58` |
| Server-034 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs:87-115` (`TryLoadAsync`) |
| Server-035 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:176` (call site), `:327-352` (`PersistSnapshotAsync`) |
| Server-036 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:345-348` (`PersistSnapshotAsync` catch) |
| Server-037 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs`, `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs` |
| Tests-007 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:682`, `src/MxGateway.Tests/Gateway/Grpc/GalaxyRepositoryGrpcServiceTests.cs:324`, `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:460`, `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs:233` |
| Tests-008 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs:1-9`, `src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs:1-3`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerAlarmAutoSubscribeTests.cs:1` |
| Tests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:36-37,99,365` |
+100 -3
View File
@@ -4,10 +4,10 @@
|---|---|
| Module | `src/MxGateway.Server` |
| Reviewer | Claude Code |
| Review date | 2026-05-20 |
| Commit reviewed | `a020350` |
| Review date | 2026-05-22 |
| Commit reviewed | `fa491c7` |
| Status | Reviewed |
| Open findings | 0 |
| Open findings | 2 |
## Checklist coverage
@@ -47,6 +47,28 @@ Re-review pass at `a020350` — the cross-module sweep that resolved Server-015
| 9 | Testing coverage | Issues found: Server-028 (`GatewayGrpcScopeResolverTests` does not exercise `WatchDeployEventsRequest` or `MxCommandKind.ReadBulk`; no `GatewaySessionTests` case asserts a `MarkFaulted` during in-flight Close). |
| 10 | Documentation & comments | Issues found: Server-023 (`NotWiredAlarmRpcDispatcher` class XML doc still says "PR A.6/A.7 — default … shipped while the worker-side AlarmClient event subscription is gated on dev-rig validation"; contradicts the cleanup that Server-014/Server-022 applied to the interface, gateway service, and `WorkerAlarmRpcDispatcher`). Issues found: Server-029 (`OpenSession` capability list advertises `bulk-subscribe-commands` but not the now-shipping bulk-read or bulk-write families — clients that gate on capability strings have no signal that those families exist). |
### 2026-05-22 review (commit fa491c7)
Re-review pass at `fa491c7`, scoped to the Galaxy hierarchy snapshot-persistence
change: the new `GalaxyHierarchySnapshot`, `IGalaxyHierarchySnapshotStore` /
`GalaxyHierarchySnapshotStore`, the restore / persist paths added to
`GalaxyHierarchyCache`, the two new `GalaxyRepositoryOptions`, and the
`docs/GalaxyRepository.md` / `docs/GatewayConfiguration.md` updates. Prior
findings (Server-001 through Server-032) are unchanged by this pass.
| # | Category | Result |
|---|---|---|
| 1 | Correctness & logic bugs | No issues found — restore/save sequencing and the shared `BuildEntry` materialization are sound. |
| 2 | mxaccessgw conventions | No issues found — file-scoped namespaces, `sealed`, `Async` suffixes, Options pattern, and XML docs all conform; the snapshot persists Galaxy metadata (names/types), not tag values or secrets. |
| 3 | Concurrency & thread safety | No issues found — `_restoreAttempted` and `_current` are touched only under `_refreshGate`; `_current` is published via `Volatile.Write`; the store serializes its file I/O on a private `SemaphoreSlim`. |
| 4 | Error handling & resilience | Issues found: Server-033 (restore never completes `_firstLoad`, so a cold-start browse waits the full 5s bootstrap budget), Server-034 (`TryLoadAsync` throws on a corrupt file despite the `Try` prefix), Server-036 (a save cancelled at shutdown logs a misleading warning). |
| 5 | Security | No issues found — the snapshot holds non-secret Galaxy metadata, is written under `C:\ProgramData\MxGateway` alongside the auth DB, and restored rows flow the same materialization path as live SQL with no injection surface. |
| 6 | Performance & resource management | Issues found: Server-035 (the snapshot write is awaited on the refresh critical path under `_refreshGate` with no timeout). |
| 7 | Design-document adherence | No issues found — `docs/GalaxyRepository.md` and `docs/GatewayConfiguration.md` were updated in the same commit; `docs/DesignDecisions.md` already defers to `GalaxyRepository.md` as the Galaxy authority. |
| 8 | Code organization & conventions | No issues found — the new options live on `GalaxyRepositoryOptions`, the store is a registered singleton, and the on-disk envelope (`PersistedFile`) is a private nested record. |
| 9 | Testing coverage | Issues found: Server-037 (no test for the corrupt-snapshot restore path or for `PersistSnapshot = false` at the cache level). |
| 10 | Documentation & comments | No issues found — XML docs match behavior; the `GalaxyRepository.md` "On-disk snapshot" section documents the Stale-on-restore lifecycle. |
## Findings
### Server-001
@@ -568,3 +590,78 @@ The diagnostic `"Worker event channel rejected an event."` also does not name th
Add a regression test that advises N items without an active `StreamEvents` consumer, lets the channel fill, and asserts the produced fault message contains the channel-depth diagnostic (#2) — gated so that #3 is not required.
**Resolution:** _(empty until closed; on close, record the fixing commit SHA, the date, and a one-line description of the fix)_
### Server-033
| Field | Value |
|---|---|
| Severity | Medium |
| Category | Error handling & resilience |
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:265-323` (`TryRestoreFromDiskAsync`), `:84-99` (`_firstLoad` / `WaitForFirstLoadAsync`); `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:141-163` (`WaitForCacheBootstrap`) |
| Status | Resolved |
**Description:** `TryRestoreFromDiskAsync` populates `_current` with the on-disk snapshot (status `Stale`, `HasData == true`) but never completes the `_firstLoad` `TaskCompletionSource` — only the live-query paths (cheap / heavy / catch) in `RefreshCoreAsync` do. A `DiscoverHierarchy` or `GetLastDeployTime` call that arrives after gateway start but before the first refresh tick finishes sees `cache.Current` as `Empty` (status `Unknown`) when `WaitForCacheBootstrap` runs its initial check, so it falls through to `await WaitForFirstLoadAsync` with a 5-second budget. Restore then completes within milliseconds and makes the data available, but `_firstLoad` stays pending until the live query returns or fails. When the Galaxy database is unreachable — the exact scenario the snapshot feature exists for — the SQL connect attempt outlasts the 5s budget, so the caller waits the full 5 seconds before the budget elapses and the handler falls through to read the (already-restored) data. The result is correct, but the first browse calls after a cold offline start incur a needless ~5s latency, undercutting the feature's purpose.
**Recommendation:** Call `_firstLoad.TrySetResult()` at the end of `TryRestoreFromDiskAsync` once the restored entry is published — restored data is a valid completed first load. Add a regression test: a cache with a throwing repository plus a populated snapshot store should have `WaitForFirstLoadAsync` complete promptly after `RefreshAsync`, not block on the live query.
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `TryRestoreFromDiskAsync` calls `_firstLoad.TrySetResult()` immediately after publishing the restored entry, so a restored snapshot satisfies the bootstrap gate without waiting on the live query. New test `GalaxyHierarchyCacheTests.RefreshAsync_RestoredSnapshotCompletesFirstLoadBeforeLiveQueryReturns` blocks the repository's deploy-time query and asserts `WaitForFirstLoadAsync` still completes from the snapshot.
### Server-034
| Field | Value |
|---|---|
| Severity | Low |
| Category | Error handling & resilience |
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs:87-115` (`TryLoadAsync`) |
| Status | Resolved |
**Description:** `TryLoadAsync` carries the `Try` prefix and its XML doc says it returns `null` "when none exists, persistence is disabled, or the on-disk file uses an unrecognized schema version." But a corrupt or partially written JSON file makes `JsonSerializer.DeserializeAsync` throw `JsonException`, and an unreadable file (locked, denied ACL) throws `IOException` / `UnauthorizedAccessException` — none of which the method catches. End-to-end behavior is still safe because the sole caller, `GalaxyHierarchyCache.TryRestoreFromDiskAsync`, wraps the call in a `catch (Exception)`; but the store's own `Try`-prefixed contract is violated, and any future caller would be surprised by the throw.
**Recommendation:** Catch `JsonException` and `IOException` (the latter covers the `UnauthorizedAccessException` family) inside `TryLoadAsync`, log a warning, and return `null` — consistent with the unrecognized-schema-version branch already present and with the `Try` naming. A corrupt cache file is an expected failure mode for a disk cache.
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `TryLoadAsync` now has a `catch (Exception) when (exception is JsonException or IOException or UnauthorizedAccessException)` that logs a warning and returns `null`. New test `GalaxyHierarchySnapshotStoreTests.TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull`.
### Server-035
| Field | Value |
|---|---|
| Severity | Low |
| Category | Performance & resource management |
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:176` (call site), `:327-352` (`PersistSnapshotAsync`) |
| Status | Resolved |
**Description:** After a heavy refresh, `RefreshCoreAsync` `await`s `PersistSnapshotAsync` while still holding `_refreshGate`, and the `SaveAsync` write has no timeout. The only caller of `RefreshAsync` is the sequential `GalaxyHierarchyRefreshService` loop, so a write that hangs — e.g. a `SnapshotCachePath` pointed at an unresponsive network share — blocks the gate and stalls all subsequent cache refreshes until gateway shutdown. Impact is bounded: clients keep being served the last entry (which flips to `Stale` after the 5-minute threshold), so this is a degradation rather than an outage, and the default `C:\ProgramData` path is local disk where a hang is unlikely.
**Recommendation:** Bound the snapshot write with a timeout — a linked `CancellationTokenSource` cancelling after, say, the SQL `CommandTimeoutSeconds` budget — so a stuck write fails fast and logs rather than pinning the refresh loop. Moving the write off the gate is an alternative but would need its own write-serialization.
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `SaveAsync` wraps the write in a `CancellationTokenSource.CreateLinkedTokenSource(cancellationToken)` cancelled after `Math.Max(1, CommandTimeoutSeconds)` seconds, so a stuck write fails fast instead of pinning the refresh loop. The timeout-expiry path itself is not unit-tested — exercising it would require a genuinely hanging filesystem.
### Server-036
| Field | Value |
|---|---|
| Severity | Low |
| Category | Error handling & resilience |
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:345-348` (`PersistSnapshotAsync` catch) |
| Status | Resolved |
**Description:** `PersistSnapshotAsync` passes the refresh `CancellationToken` to `SaveAsync` and catches every exception — including the `OperationCanceledException` thrown when that token is cancelled at gateway shutdown — in its general `catch (Exception)`, logging it as `Warning: "Failed to persist the Galaxy hierarchy snapshot to disk."`. A snapshot write interrupted by a normal shutdown is not a failure, but it surfaces as a misleading warning every time the gateway stops mid-write.
**Recommendation:** Let a cancellation-driven `OperationCanceledException` pass without the warning — e.g. add `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { }` before the general catch — matching the cancellation handling already used in `RefreshCoreAsync` and `TryRestoreFromDiskAsync`.
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `PersistSnapshotAsync` has a `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)` ahead of the general catch, so a save aborted by gateway shutdown is silent while a genuine failure (including a write timeout) still logs. New test `GalaxyHierarchyCacheTests.RefreshAsync_WhenSnapshotSaveCancelledAtShutdown_DoesNotLogPersistFailure`.
### Server-037
| Field | Value |
|---|---|
| Severity | Low |
| Category | Testing coverage |
| Location | `src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs`, `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs` |
| Status | Resolved |
**Description:** The new snapshot tests cover the round-trip, missing-file, persistence-disabled, unrecognized-schema, and overwrite cases for the store, and the persist / restore-when-unreachable / promote-on-matching-deploy cases for the cache. Two resilience paths are untested: (1) `GalaxyHierarchyCache.TryRestoreFromDiskAsync`'s `catch` path when the snapshot file is corrupt — the cache must come up `Unavailable` rather than throwing; (2) the cache restore path when `PersistSnapshot = false` (the store yields `null` and the cache stays `Unavailable`). Both are the failure modes most likely to matter operationally.
**Recommendation:** Add a cache test that writes a corrupt snapshot file and asserts `RefreshAsync` with an unreachable repository leaves the cache `Unavailable` without throwing, and a test that confirms a `PersistSnapshot = false` store neither restores nor persists. If Server-034 is fixed, the corrupt-file test also pins the store's null-return.
**Resolution:** Resolved in `bdccdbf` (2026-05-22): added `GalaxyHierarchyCacheTests.RefreshAsync_WhenSnapshotFileCorrupt_ComesUpUnavailableWithoutThrowing` and `RefreshAsync_WhenPersistDisabled_DoesNotRestoreFromDisk`, plus the `TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull` store test added for Server-034.
@@ -306,6 +306,12 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
attributes: snapshot.Attributes);
Volatile.Write(ref _current, restored);
// Restored data is a valid completed first load: unblock callers waiting on
// the bootstrap gate immediately, rather than making them wait out the full
// wait budget for a live query that — when the database is unreachable, the
// scenario this restore exists for — may not return for seconds.
_firstLoad.TrySetResult();
_notifier.Publish(new GalaxyDeployEventInfo(
Sequence: sequence,
ObservedAt: _timeProvider.GetUtcNow(),
@@ -342,6 +348,11 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
new GalaxyHierarchySnapshot(deployTime, savedAt, hierarchy, attributes),
cancellationToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// The refresh was cancelled (gateway shutdown) before the write finished.
// That is not a persistence failure — do not log it as a warning.
}
catch (Exception exception)
{
_logger?.LogWarning(exception, "Failed to persist the Galaxy hierarchy snapshot to disk.");
@@ -27,6 +27,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
};
private readonly string? _path;
private readonly TimeSpan _writeTimeout;
private readonly ILogger<GalaxyHierarchySnapshotStore>? _logger;
private readonly SemaphoreSlim _ioGate = new(1, 1);
@@ -41,6 +42,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
_path = value.PersistSnapshot && !string.IsNullOrWhiteSpace(value.SnapshotCachePath)
? value.SnapshotCachePath
: null;
_writeTimeout = TimeSpan.FromSeconds(Math.Max(1, value.CommandTimeoutSeconds));
_logger = logger;
}
@@ -58,6 +60,12 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
await _ioGate.WaitAsync(cancellationToken).ConfigureAwait(false);
try
{
// Bound the write so a stuck disk — e.g. a SnapshotCachePath on an
// unresponsive network share — cannot stall the caller. On the cache
// refresh path that would otherwise pin the whole refresh loop.
using CancellationTokenSource writeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
writeCts.CancelAfter(_writeTimeout);
string? directory = Path.GetDirectoryName(_path);
if (!string.IsNullOrEmpty(directory))
{
@@ -67,7 +75,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
string tempPath = _path + ".tmp";
await using (FileStream stream = new(tempPath, FileMode.Create, FileAccess.Write, FileShare.None))
{
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, cancellationToken).ConfigureAwait(false);
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, writeCts.Token).ConfigureAwait(false);
}
File.Move(tempPath, _path, overwrite: true);
@@ -111,6 +119,17 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
return file.Snapshot;
}
catch (Exception exception) when (exception is JsonException or IOException or UnauthorizedAccessException)
{
// A corrupt, truncated, locked, or access-denied snapshot file is an
// expected failure mode for a disk cache — honor the Try contract and
// return null rather than throwing.
_logger?.LogWarning(
exception,
"Ignoring Galaxy hierarchy snapshot at {Path}: the file is unreadable or not valid JSON.",
_path);
return null;
}
finally
{
_ioGate.Release();
@@ -1,3 +1,4 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using MxGateway.Server.Galaxy;
using MxGateway.Contracts.Proto.Galaxy;
@@ -208,6 +209,108 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
Assert.Equal(0, repository.GetAttributesCount);
}
/// <summary>
/// Verifies that a restored on-disk snapshot completes the first-load gate
/// immediately, so a browse call racing the first refresh is not blocked for
/// the full bootstrap budget while the live Galaxy query is still running.
/// Regression test for Server-033.
/// </summary>
[Fact]
public async Task RefreshAsync_RestoredSnapshotCompletesFirstLoadBeforeLiveQueryReturns()
{
GalaxyHierarchySnapshotStore store = CreateStore();
await store.SaveAsync(
new GalaxyHierarchySnapshot(
LastDeployTime: new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero),
SavedAt: new DateTimeOffset(2026, 5, 20, 9, 1, 0, TimeSpan.Zero),
Hierarchy: [SampleHierarchyRow()],
Attributes: [SampleAttributeRow()]),
CancellationToken.None);
GalaxyDeployNotifier notifier = new();
BlockingGalaxyRepository repository = new();
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
Task refresh = cache.RefreshAsync(CancellationToken.None);
// The live query is blocked inside the repository; first-load must still
// complete — from the restored snapshot — well within the wait budget.
await cache.WaitForFirstLoadAsync(CancellationToken.None).WaitAsync(TimeSpan.FromSeconds(5));
Assert.True(cache.Current.HasData);
Assert.Equal(GalaxyCacheStatus.Stale, cache.Current.Status);
repository.Release();
await refresh.WaitAsync(TimeSpan.FromSeconds(5));
}
/// <summary>
/// Verifies a corrupt on-disk snapshot does not crash startup: the cache
/// ignores the unreadable file and comes up Unavailable when the database is
/// also unreachable. Regression test for Server-037.
/// </summary>
[Fact]
public async Task RefreshAsync_WhenSnapshotFileCorrupt_ComesUpUnavailableWithoutThrowing()
{
string path = CreateTempPath();
await File.WriteAllTextAsync(path, "{ this is not valid json");
GalaxyHierarchySnapshotStore store = CreateStore(path);
GalaxyDeployNotifier notifier = new();
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
await cache.RefreshAsync(CancellationToken.None);
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
Assert.False(cache.Current.HasData);
}
/// <summary>
/// Verifies that with snapshot persistence disabled the cache does not
/// restore from disk — an unreachable database leaves it Unavailable.
/// Regression test for Server-037.
/// </summary>
[Fact]
public async Task RefreshAsync_WhenPersistDisabled_DoesNotRestoreFromDisk()
{
GalaxyHierarchySnapshotStore store = CreateStore(CreateTempPath(), persist: false);
GalaxyDeployNotifier notifier = new();
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
await cache.RefreshAsync(CancellationToken.None);
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
Assert.False(cache.Current.HasData);
}
/// <summary>
/// Verifies that a snapshot save aborted because the gateway is shutting down
/// (the refresh token is cancelled) is not logged as a persistence failure.
/// Regression test for Server-036.
/// </summary>
[Fact]
public async Task RefreshAsync_WhenSnapshotSaveCancelledAtShutdown_DoesNotLogPersistFailure()
{
using CancellationTokenSource cts = new();
GalaxyDeployNotifier notifier = new();
StubGalaxyRepository repository = new(
deployTime: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc),
hierarchy: [SampleHierarchyRow()],
attributes: [SampleAttributeRow()]);
CancellingSaveStore store = new(cts);
RecordingLogger<GalaxyHierarchyCache> logger = new();
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), logger, store);
await cache.RefreshAsync(cts.Token);
Assert.DoesNotContain(
logger.Entries,
entry => entry.Level == LogLevel.Warning
&& entry.Message.Contains("persist", StringComparison.OrdinalIgnoreCase));
}
private static GalaxyHierarchyRow SampleHierarchyRow() => new()
{
GobjectId = 99,
@@ -228,20 +331,93 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
DataTypeName = "Float",
};
private GalaxyHierarchySnapshotStore CreateStore()
private string CreateTempPath()
{
string path = Path.Combine(
Path.GetTempPath(),
$"mxgw-galaxy-cache-test-{Guid.NewGuid():N}.json");
_tempPaths.Add(path);
return path;
}
private GalaxyHierarchySnapshotStore CreateStore() => CreateStore(CreateTempPath());
private static GalaxyHierarchySnapshotStore CreateStore(string path, bool persist = true)
{
GalaxyRepositoryOptions options = new()
{
PersistSnapshot = true,
PersistSnapshot = persist,
SnapshotCachePath = path,
};
return new GalaxyHierarchySnapshotStore(Options.Create(options));
}
/// <summary><see cref="IGalaxyRepository"/> whose deploy-time query blocks until released.</summary>
private sealed class BlockingGalaxyRepository : IGalaxyRepository
{
private readonly TaskCompletionSource _release = new(TaskCreationOptions.RunContinuationsAsynchronously);
public void Release() => _release.TrySetResult();
public Task<bool> TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(false);
public async Task<DateTime?> GetLastDeployTimeAsync(CancellationToken ct = default)
{
await _release.Task.WaitAsync(ct).ConfigureAwait(false);
throw new InvalidOperationException("Galaxy repository unreachable");
}
public Task<List<GalaxyHierarchyRow>> GetHierarchyAsync(CancellationToken ct = default)
=> throw new InvalidOperationException("GetHierarchyAsync should not be reached");
public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
=> throw new InvalidOperationException("GetAttributesAsync should not be reached");
}
/// <summary>Snapshot store whose <see cref="SaveAsync"/> cancels the token mid-save.</summary>
private sealed class CancellingSaveStore(CancellationTokenSource cts) : IGalaxyHierarchySnapshotStore
{
public Task<GalaxyHierarchySnapshot?> TryLoadAsync(CancellationToken cancellationToken)
=> Task.FromResult<GalaxyHierarchySnapshot?>(null);
public Task SaveAsync(GalaxyHierarchySnapshot snapshot, CancellationToken cancellationToken)
{
cts.Cancel();
cancellationToken.ThrowIfCancellationRequested();
return Task.CompletedTask;
}
}
/// <summary>Minimal <see cref="ILogger{T}"/> that records every emitted log entry.</summary>
private sealed class RecordingLogger<T> : ILogger<T>
{
public List<(LogLevel Level, string Message)> Entries { get; } = [];
public IDisposable BeginScope<TState>(TState state)
where TState : notnull => NullScope.Instance;
public bool IsEnabled(LogLevel logLevel) => true;
public void Log<TState>(
LogLevel logLevel,
EventId eventId,
TState state,
Exception? exception,
Func<TState, Exception?, string> formatter)
{
Entries.Add((logLevel, formatter(state, exception)));
}
private sealed class NullScope : IDisposable
{
public static readonly NullScope Instance = new();
public void Dispose()
{
}
}
}
/// <summary>In-memory <see cref="IGalaxyRepository"/> that returns fixed rowsets.</summary>
private sealed class StubGalaxyRepository(
DateTime? deployTime,
@@ -59,6 +59,16 @@ public sealed class GalaxyHierarchySnapshotStoreTests : IDisposable
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
}
[Fact]
public async Task TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull()
{
string path = CreateTempPath();
await File.WriteAllTextAsync(path, "{ this is not valid json");
GalaxyHierarchySnapshotStore store = CreateStore(path);
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
}
[Fact]
public async Task TryLoadAsync_WhenSchemaVersionUnrecognized_ReturnsNull()
{