From fa491c752b289133814daed856f8f0c7d1ad0a11 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 22 May 2026 02:03:00 -0400 Subject: [PATCH] Persist the Galaxy browse dataset to disk for offline startup The gateway can lose connectivity to the Galaxy database, and the database is often unreachable exactly when the gateway restarts. The hierarchy cache was purely in-memory, so a cold start with no database left clients with an Unavailable browse surface until SQL came back. Add a JSON snapshot store: each successful heavy refresh writes the raw hierarchy and attribute rowsets to disk atomically (temp file + rename), and the first refresh after startup restores that snapshot before any SQL runs. Restored data is served as Stale until a live query confirms it; a live query that observes the same time_of_last_deploy promotes it to Healthy with no heavy re-query. Persistence is on by default (MxGateway:Galaxy:PersistSnapshot) and writes to C:\ProgramData\MxGateway\galaxy-snapshot.json. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/GalaxyRepository.md | 32 +++ docs/GatewayConfiguration.md | 6 +- .../Galaxy/GalaxyHierarchyCache.cs | 201 +++++++++++++++--- .../Galaxy/GalaxyHierarchySnapshot.cs | 24 +++ .../Galaxy/GalaxyHierarchySnapshotStore.cs | 122 +++++++++++ .../Galaxy/GalaxyRepositoryOptions.cs | 17 ++ ...xyRepositoryServiceCollectionExtensions.cs | 1 + .../Galaxy/IGalaxyHierarchySnapshotStore.cs | 28 +++ src/MxGateway.Server/appsettings.json | 4 +- .../Galaxy/GalaxyHierarchyCacheTests.cs | 169 ++++++++++++++- .../GalaxyHierarchySnapshotStoreTests.cs | 167 +++++++++++++++ 11 files changed, 739 insertions(+), 32 deletions(-) create mode 100644 src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshot.cs create mode 100644 src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs create mode 100644 src/MxGateway.Server/Galaxy/IGalaxyHierarchySnapshotStore.cs create mode 100644 src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs diff --git a/docs/GalaxyRepository.md b/docs/GalaxyRepository.md index 7353c53..8b784bf 100644 --- a/docs/GalaxyRepository.md +++ b/docs/GalaxyRepository.md @@ -89,6 +89,36 @@ load to complete before returning. If the first load fails or times out, the client gets `Unavailable` with a short reason. Once any load completes (success or failure), this wait is skipped on subsequent calls. +### On-disk snapshot + +The gateway may lose connectivity to the Galaxy database — and the database is +often unreachable right when the gateway itself restarts. To keep browse +working across that gap, the cache persists its dataset to disk: + +- After every successful **heavy** refresh (a deploy change), the raw + hierarchy and attribute rowsets are written to + `MxGateway:Galaxy:SnapshotCachePath` + (default `C:\ProgramData\MxGateway\galaxy-snapshot.json`). The write is + atomic — a temp file plus rename — so a crash mid-write cannot corrupt the + snapshot. Cheap no-change ticks write nothing; the file is already current. +- On the **first** refresh after startup, before any SQL runs, the cache + reloads that file. The restored data is served with `Stale` status — + it is last-known data, not live — so clients can browse immediately even + when the Galaxy database is unreachable. +- The first live query then reconciles: if it observes the **same** + `time_of_last_deploy` the snapshot was saved at, the entry is promoted to + `Healthy` with no heavy re-query (the snapshot is provably current); if it + observes a newer deploy, the heavy queries run and replace the snapshot; if + the database is still unreachable, the entry stays `Stale`. + +`is_alarm` / `is_historized` filters, paging, and the dashboard summary all +work against a restored snapshot exactly as against a live pull — the restore +path runs the same materialization. Persistence is disabled by setting +`MxGateway:Galaxy:PersistSnapshot` to `false`; the snapshot file is then +neither written nor read, and a cold start with an unreachable database comes +up `Unavailable` as before. The on-disk file is a cache, not a system of +record: deleting it only forces the next cold start to wait for live SQL. + ## Deploy Notifications `WatchDeployEvents` is a server-streaming RPC backed by @@ -291,6 +321,8 @@ Bound to `MxGateway:Galaxy` via `GalaxyRepositoryOptions`. |--------|---------|-------------| | `MxGateway:Galaxy:ConnectionString` | `Server=localhost;Database=ZB;Integrated Security=True;TrustServerCertificate=True;Encrypt=False;` | SQL Server connection string for the Galaxy Repository. Integrated Security against `localhost` is the dev default; production deployments should override this through the standard double-underscore environment variable form, e.g. `MxGateway__Galaxy__ConnectionString`. | | `MxGateway:Galaxy:CommandTimeoutSeconds` | `60` | Per-command SQL timeout. Applies to all three RPCs. | +| `MxGateway:Galaxy:PersistSnapshot` | `true` | Persists each successful browse dataset to disk and reloads it at startup. See [On-disk snapshot](#on-disk-snapshot). | +| `MxGateway:Galaxy:SnapshotCachePath` | `C:\ProgramData\MxGateway\galaxy-snapshot.json` | File path for the persisted browse snapshot. Ignored when `PersistSnapshot` is `false`. | The connection string is not treated as a secret in dev (`Integrated Security`), but production deployments that use SQL authentication should set diff --git a/docs/GatewayConfiguration.md b/docs/GatewayConfiguration.md index 1602d18..d38a09c 100644 --- a/docs/GatewayConfiguration.md +++ b/docs/GatewayConfiguration.md @@ -60,7 +60,9 @@ paths, timeouts, queue sizes, enum values, or protocol values are invalid. "Galaxy": { "ConnectionString": "Server=localhost;Database=ZB;Integrated Security=True;TrustServerCertificate=True;Encrypt=False;", "CommandTimeoutSeconds": 60, - "DashboardRefreshIntervalSeconds": 30 + "DashboardRefreshIntervalSeconds": 30, + "PersistSnapshot": true, + "SnapshotCachePath": "C:\\ProgramData\\MxGateway\\galaxy-snapshot.json" }, "Alarms": { "Enabled": false, @@ -170,6 +172,8 @@ at startup. | `MxGateway:Galaxy:ConnectionString` | `Server=localhost;Database=ZB;Integrated Security=True;TrustServerCertificate=True;Encrypt=False;` | SQL Server connection string for the Galaxy Repository (`ZB`) used by the `GalaxyRepository` browse RPCs. Override in production via `MxGateway__Galaxy__ConnectionString`. | | `MxGateway:Galaxy:CommandTimeoutSeconds` | `60` | Per-command SQL timeout for all Galaxy browse RPCs. | | `MxGateway:Galaxy:DashboardRefreshIntervalSeconds` | `30` | Interval between background refreshes of the dashboard Galaxy summary cache. SQL is hit at most once per interval regardless of dashboard render rate. | +| `MxGateway:Galaxy:PersistSnapshot` | `true` | Persists the latest successful Galaxy browse dataset to disk. When `true`, the cache reloads that snapshot at startup so clients can still browse last-known data while the Galaxy database is unreachable. The restored data is served with `Stale` status until a live query confirms it. | +| `MxGateway:Galaxy:SnapshotCachePath` | `C:\ProgramData\MxGateway\galaxy-snapshot.json` | File path for the persisted Galaxy browse snapshot. Ignored when `PersistSnapshot` is `false`. The snapshot is written atomically (temp file plus rename). | See [Galaxy Repository Browse](./GalaxyRepository.md) for the RPC surface and behavior. diff --git a/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs b/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs index d2bfb3f..4ebadcb 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs @@ -12,6 +12,10 @@ namespace MxGateway.Server.Galaxy; /// refresh and reused across requests. Refreshes are deploy-time gated: every tick /// queries galaxy.time_of_last_deploy (cheap), and the heavy hierarchy + /// attributes rowsets are pulled only when that timestamp has advanced. +/// Each successful heavy refresh is persisted to disk through +/// ; the first refresh restores that +/// snapshot (as ) so clients can browse +/// last-known data when the Galaxy database is unreachable on a cold start. /// public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache { @@ -19,27 +23,35 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache private readonly IGalaxyRepository _repository; private readonly IGalaxyDeployNotifier _notifier; + private readonly IGalaxyHierarchySnapshotStore? _snapshotStore; private readonly TimeProvider _timeProvider; private readonly ILogger? _logger; private readonly TaskCompletionSource _firstLoad = new(TaskCreationOptions.RunContinuationsAsynchronously); private readonly SemaphoreSlim _refreshGate = new(1, 1); private GalaxyHierarchyCacheEntry _current = GalaxyHierarchyCacheEntry.Empty; + private bool _restoreAttempted; /// Initializes a new instance of the class. /// Galaxy Repository client for SQL queries. /// Galaxy deploy event notifier. /// Provider for current time; defaults to system time. /// Optional logger for diagnostic output. + /// + /// Optional on-disk snapshot store. When supplied, the cache persists each + /// successful refresh and restores the last snapshot on first load. + /// public GalaxyHierarchyCache( IGalaxyRepository repository, IGalaxyDeployNotifier notifier, TimeProvider? timeProvider = null, - ILogger? logger = null) + ILogger? logger = null, + IGalaxyHierarchySnapshotStore? snapshotStore = null) { _repository = repository; _notifier = notifier; _timeProvider = timeProvider ?? TimeProvider.System; _logger = logger; + _snapshotStore = snapshotStore; } /// Gets the current Galaxy hierarchy cache entry with projected status. @@ -88,6 +100,15 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache private async Task RefreshCoreAsync(CancellationToken cancellationToken) { + // First refresh only: seed the cache from the on-disk snapshot before + // querying SQL, so a cold start with an unreachable Galaxy database can + // still serve last-known browse data. Runs under the refresh gate. + if (!_restoreAttempted) + { + _restoreAttempted = true; + await TryRestoreFromDiskAsync(cancellationToken).ConfigureAwait(false); + } + GalaxyHierarchyCacheEntry previous = Volatile.Read(ref _current); DateTimeOffset queriedAt = _timeProvider.GetUtcNow(); @@ -130,41 +151,17 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache List hierarchy = hierarchyTask.Result; List attributes = attributesTask.Result; - IReadOnlyList objects = BuildObjects(hierarchy, attributes); - GalaxyHierarchyIndex index = GalaxyHierarchyIndex.Build(objects); - int areaCount = hierarchy.Count(row => row.IsArea); - int historized = attributes.Count(row => row.IsHistorized); - int alarms = attributes.Count(row => row.IsAlarm); - DashboardGalaxySummary dashboardSummary = BuildDashboardSummary( + long nextSequence = previous.Sequence + 1; + GalaxyHierarchyCacheEntry next = BuildEntry( status: GalaxyCacheStatus.Healthy, + sequence: nextSequence, lastQueriedAt: queriedAt, lastSuccessAt: queriedAt, lastDeployTime: deployTime, lastError: null, hierarchy: hierarchy, - objectCount: hierarchy.Count, - areaCount: areaCount, - attributeCount: attributes.Count, - historizedAttributeCount: historized, - alarmAttributeCount: alarms); - - long nextSequence = previous.Sequence + 1; - GalaxyHierarchyCacheEntry next = new( - Status: GalaxyCacheStatus.Healthy, - Sequence: nextSequence, - LastQueriedAt: queriedAt, - LastSuccessAt: queriedAt, - LastDeployTime: deployTime, - LastError: null, - Objects: objects, - Index: index, - DashboardSummary: dashboardSummary, - ObjectCount: hierarchy.Count, - AreaCount: areaCount, - AttributeCount: attributes.Count, - HistorizedAttributeCount: historized, - AlarmAttributeCount: alarms); + attributes: attributes); Volatile.Write(ref _current, next); _firstLoad.TrySetResult(); @@ -175,6 +172,8 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache TimeOfLastDeploy: deployTime, ObjectCount: hierarchy.Count, AttributeCount: attributes.Count)); + + await PersistSnapshotAsync(deployTime, queriedAt, hierarchy, attributes, cancellationToken).ConfigureAwait(false); } catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { @@ -205,6 +204,150 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache } } + /// + /// Materializes a complete from raw + /// hierarchy and attribute rowsets. Shared by the live refresh path and the + /// on-disk restore path so both produce an identical object list, index, and + /// dashboard summary. + /// + private static GalaxyHierarchyCacheEntry BuildEntry( + GalaxyCacheStatus status, + long sequence, + DateTimeOffset? lastQueriedAt, + DateTimeOffset? lastSuccessAt, + DateTimeOffset? lastDeployTime, + string? lastError, + IReadOnlyList hierarchy, + IReadOnlyList attributes) + { + IReadOnlyList objects = BuildObjects(hierarchy, attributes); + GalaxyHierarchyIndex index = GalaxyHierarchyIndex.Build(objects); + + int areaCount = hierarchy.Count(row => row.IsArea); + int historized = attributes.Count(row => row.IsHistorized); + int alarms = attributes.Count(row => row.IsAlarm); + DashboardGalaxySummary dashboardSummary = BuildDashboardSummary( + status: status, + lastQueriedAt: lastQueriedAt, + lastSuccessAt: lastSuccessAt, + lastDeployTime: lastDeployTime, + lastError: lastError, + hierarchy: hierarchy, + objectCount: hierarchy.Count, + areaCount: areaCount, + attributeCount: attributes.Count, + historizedAttributeCount: historized, + alarmAttributeCount: alarms); + + return new GalaxyHierarchyCacheEntry( + Status: status, + Sequence: sequence, + LastQueriedAt: lastQueriedAt, + LastSuccessAt: lastSuccessAt, + LastDeployTime: lastDeployTime, + LastError: lastError, + Objects: objects, + Index: index, + DashboardSummary: dashboardSummary, + ObjectCount: hierarchy.Count, + AreaCount: areaCount, + AttributeCount: attributes.Count, + HistorizedAttributeCount: historized, + AlarmAttributeCount: alarms); + } + + /// + /// Seeds the cache from the on-disk snapshot when no live data has loaded yet. + /// The restored entry is marked — it is + /// last-known data, not live. A later refresh that observes the same deploy + /// time promotes it to healthy; one that observes a newer deploy replaces it. + /// + private async Task TryRestoreFromDiskAsync(CancellationToken cancellationToken) + { + if (_snapshotStore is null) + { + return; + } + + if (Volatile.Read(ref _current).HasData) + { + return; + } + + GalaxyHierarchySnapshot? snapshot; + try + { + snapshot = await _snapshotStore.TryLoadAsync(cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (Exception exception) + { + _logger?.LogWarning(exception, "Failed to restore the Galaxy hierarchy from the on-disk snapshot."); + return; + } + + if (snapshot is null) + { + return; + } + + long sequence = Volatile.Read(ref _current).Sequence + 1; + GalaxyHierarchyCacheEntry restored = BuildEntry( + status: GalaxyCacheStatus.Stale, + sequence: sequence, + lastQueriedAt: snapshot.SavedAt, + lastSuccessAt: snapshot.SavedAt, + lastDeployTime: snapshot.LastDeployTime, + lastError: null, + hierarchy: snapshot.Hierarchy, + attributes: snapshot.Attributes); + Volatile.Write(ref _current, restored); + + _notifier.Publish(new GalaxyDeployEventInfo( + Sequence: sequence, + ObservedAt: _timeProvider.GetUtcNow(), + TimeOfLastDeploy: snapshot.LastDeployTime, + ObjectCount: snapshot.Hierarchy.Count, + AttributeCount: snapshot.Attributes.Count)); + + _logger?.LogInformation( + "Restored Galaxy hierarchy from on-disk snapshot saved {SavedAt:o}: {ObjectCount} objects, {AttributeCount} attributes (status Stale until the Galaxy database confirms).", + snapshot.SavedAt, + snapshot.Hierarchy.Count, + snapshot.Attributes.Count); + } + + /// + /// Persists a successful refresh to disk. Persistence failures are logged and + /// swallowed — a cache that cannot write its backup is still fully usable. + /// + private async Task PersistSnapshotAsync( + DateTimeOffset? deployTime, + DateTimeOffset savedAt, + IReadOnlyList hierarchy, + IReadOnlyList attributes, + CancellationToken cancellationToken) + { + if (_snapshotStore is null) + { + return; + } + + try + { + await _snapshotStore.SaveAsync( + new GalaxyHierarchySnapshot(deployTime, savedAt, hierarchy, attributes), + cancellationToken).ConfigureAwait(false); + } + catch (Exception exception) + { + _logger?.LogWarning(exception, "Failed to persist the Galaxy hierarchy snapshot to disk."); + } + } + private static IReadOnlyList BuildObjects( IReadOnlyList hierarchy, IReadOnlyList attributes) diff --git a/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshot.cs b/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshot.cs new file mode 100644 index 0000000..184545d --- /dev/null +++ b/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshot.cs @@ -0,0 +1,24 @@ +namespace MxGateway.Server.Galaxy; + +/// +/// A serializable point-in-time copy of the Galaxy Repository browse data. +/// Holds the raw hierarchy and attribute rowsets — not the materialized +/// protobuf objects — so the restore path runs the exact same +/// materialization as a live refresh. Persisted by +/// after a successful refresh +/// and reloaded at startup when the Galaxy database is unreachable. +/// +/// +/// The galaxy.time_of_last_deploy the rowsets were pulled at, or +/// when the Galaxy table reported no deploy. A later +/// live refresh that observes this same timestamp can promote the restored +/// entry to healthy without re-running the heavy queries. +/// +/// UTC wall-clock when the snapshot was written to disk. +/// The persisted object-hierarchy rowset. +/// The persisted attribute rowset. +public sealed record GalaxyHierarchySnapshot( + DateTimeOffset? LastDeployTime, + DateTimeOffset SavedAt, + IReadOnlyList Hierarchy, + IReadOnlyList Attributes); diff --git a/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs b/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs new file mode 100644 index 0000000..7960f4a --- /dev/null +++ b/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs @@ -0,0 +1,122 @@ +using System.Text.Json; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; + +namespace MxGateway.Server.Galaxy; + +/// +/// JSON-file implementation of . +/// Writes the on-disk snapshot atomically (temp file + rename) so a crash +/// mid-write can never leave a torn file, and ignores files whose schema +/// version it does not recognize. When +/// is +/// both operations are no-ops. +/// +public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore +{ + /// + /// On-disk format version. Bump this whenever the persisted shape changes + /// in a way an older or newer gateway cannot read; a mismatched file is + /// ignored rather than misparsed. + /// + private const int CurrentSchemaVersion = 1; + + private static readonly JsonSerializerOptions SerializerOptions = new() + { + WriteIndented = false, + }; + + private readonly string? _path; + private readonly ILogger? _logger; + private readonly SemaphoreSlim _ioGate = new(1, 1); + + /// Initializes a new instance of the class. + /// Galaxy repository options carrying the snapshot path and enable flag. + /// Optional logger for diagnostic output. + public GalaxyHierarchySnapshotStore( + IOptions options, + ILogger? logger = null) + { + GalaxyRepositoryOptions value = options.Value; + _path = value.PersistSnapshot && !string.IsNullOrWhiteSpace(value.SnapshotCachePath) + ? value.SnapshotCachePath + : null; + _logger = logger; + } + + /// + public async Task SaveAsync(GalaxyHierarchySnapshot snapshot, CancellationToken cancellationToken) + { + ArgumentNullException.ThrowIfNull(snapshot); + if (_path is null) + { + return; + } + + PersistedFile file = new(CurrentSchemaVersion, snapshot); + + await _ioGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + string? directory = Path.GetDirectoryName(_path); + if (!string.IsNullOrEmpty(directory)) + { + Directory.CreateDirectory(directory); + } + + string tempPath = _path + ".tmp"; + await using (FileStream stream = new(tempPath, FileMode.Create, FileAccess.Write, FileShare.None)) + { + await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, cancellationToken).ConfigureAwait(false); + } + + File.Move(tempPath, _path, overwrite: true); + _logger?.LogDebug( + "Persisted Galaxy hierarchy snapshot to {Path} ({ObjectCount} objects, {AttributeCount} attributes).", + _path, + snapshot.Hierarchy.Count, + snapshot.Attributes.Count); + } + finally + { + _ioGate.Release(); + } + } + + /// + public async Task TryLoadAsync(CancellationToken cancellationToken) + { + if (_path is null || !File.Exists(_path)) + { + return null; + } + + await _ioGate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + PersistedFile? file; + await using (FileStream stream = new(_path, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + file = await JsonSerializer.DeserializeAsync( + stream, SerializerOptions, cancellationToken).ConfigureAwait(false); + } + + if (file is null || file.SchemaVersion != CurrentSchemaVersion || file.Snapshot is null) + { + _logger?.LogWarning( + "Ignoring Galaxy hierarchy snapshot at {Path}: unrecognized or empty schema version.", + _path); + return null; + } + + return file.Snapshot; + } + finally + { + _ioGate.Release(); + } + } + + /// On-disk envelope: a schema version plus the snapshot payload. + private sealed record PersistedFile(int SchemaVersion, GalaxyHierarchySnapshot? Snapshot); +} diff --git a/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs b/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs index 7007c0c..3e2d016 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs @@ -27,4 +27,21 @@ public sealed class GalaxyRepositoryOptions /// cache. SQL is hit at most once per interval regardless of dashboard render rate. /// public int DashboardRefreshIntervalSeconds { get; init; } = 30; + + /// Default on-disk path for the persisted Galaxy browse snapshot. + public const string DefaultSnapshotCachePath = + @"C:\ProgramData\MxGateway\galaxy-snapshot.json"; + + /// + /// Whether the gateway persists the latest successful Galaxy browse dataset to + /// disk. When enabled, the cache reloads that snapshot at startup so clients can + /// still browse last-known data while the Galaxy database is unreachable. + /// + public bool PersistSnapshot { get; init; } = true; + + /// + /// File path for the persisted Galaxy browse snapshot. Ignored when + /// is . + /// + public string SnapshotCachePath { get; init; } = DefaultSnapshotCachePath; } diff --git a/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs b/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs index 070470d..d7a9e26 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs @@ -19,6 +19,7 @@ public static class GalaxyRepositoryServiceCollectionExtensions services.AddSingleton(sp => sp.GetRequiredService()); services.AddSingleton(); + services.AddSingleton(); services.AddSingleton(); services.AddHostedService(); diff --git a/src/MxGateway.Server/Galaxy/IGalaxyHierarchySnapshotStore.cs b/src/MxGateway.Server/Galaxy/IGalaxyHierarchySnapshotStore.cs new file mode 100644 index 0000000..cc3b9bf --- /dev/null +++ b/src/MxGateway.Server/Galaxy/IGalaxyHierarchySnapshotStore.cs @@ -0,0 +1,28 @@ +namespace MxGateway.Server.Galaxy; + +/// +/// Persists the latest Galaxy Repository browse dataset to disk and reloads +/// it at startup. Lets serve last-known +/// browse data when the Galaxy database is unreachable on a cold start. +/// +public interface IGalaxyHierarchySnapshotStore +{ + /// + /// Writes to disk, replacing any previous + /// snapshot atomically. A no-op when snapshot persistence is disabled. + /// + /// The browse dataset to persist. + /// Token to cancel the asynchronous operation. + Task SaveAsync(GalaxyHierarchySnapshot snapshot, CancellationToken cancellationToken); + + /// + /// Reads the persisted Galaxy browse dataset. + /// + /// Token to cancel the asynchronous operation. + /// + /// The persisted snapshot, or when none exists, + /// persistence is disabled, or the on-disk file uses an unrecognized + /// schema version. + /// + Task TryLoadAsync(CancellationToken cancellationToken); +} diff --git a/src/MxGateway.Server/appsettings.json b/src/MxGateway.Server/appsettings.json index 4d4da73..718be37 100644 --- a/src/MxGateway.Server/appsettings.json +++ b/src/MxGateway.Server/appsettings.json @@ -65,7 +65,9 @@ "Galaxy": { "ConnectionString": "Server=localhost;Database=ZB;Integrated Security=True;TrustServerCertificate=True;Encrypt=False;", "CommandTimeoutSeconds": 60, - "DashboardRefreshIntervalSeconds": 30 + "DashboardRefreshIntervalSeconds": 30, + "PersistSnapshot": true, + "SnapshotCachePath": "C:\\ProgramData\\MxGateway\\galaxy-snapshot.json" }, "Alarms": { "Enabled": true, diff --git a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs index 9ba783e..a5202d5 100644 --- a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs +++ b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs @@ -1,11 +1,14 @@ +using Microsoft.Extensions.Options; using MxGateway.Server.Galaxy; using MxGateway.Contracts.Proto.Galaxy; using MxGateway.Tests.TestSupport; namespace MxGateway.Tests.Galaxy; -public sealed class GalaxyHierarchyCacheTests +public sealed class GalaxyHierarchyCacheTests : IDisposable { + private readonly List _tempPaths = []; + /// /// Verifies cache returns empty entry before any refresh occurs. /// @@ -121,6 +124,170 @@ public sealed class GalaxyHierarchyCacheTests Assert.Same(root, index.ObjectViewsById[1].Object); } + /// + /// Verifies a successful refresh writes the browse dataset to the on-disk + /// snapshot store so a later cold start can restore it. + /// + [Fact] + public async Task RefreshAsync_WhenSuccessful_PersistsSnapshotToDisk() + { + GalaxyDeployNotifier notifier = new(); + StubGalaxyRepository repository = new( + deployTime: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc), + hierarchy: [SampleHierarchyRow()], + attributes: [SampleAttributeRow()]); + GalaxyHierarchySnapshotStore store = CreateStore(); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store); + + await cache.RefreshAsync(CancellationToken.None); + + Assert.Equal(GalaxyCacheStatus.Healthy, cache.Current.Status); + GalaxyHierarchySnapshot? persisted = await store.TryLoadAsync(CancellationToken.None); + Assert.NotNull(persisted); + Assert.Equal(99, Assert.Single(persisted.Hierarchy).GobjectId); + Assert.Equal("PV", Assert.Single(persisted.Attributes).AttributeName); + } + + /// + /// Verifies that when the Galaxy database is unreachable on first refresh but a + /// snapshot exists on disk, the cache serves that data with Stale status + /// rather than coming up empty. + /// + [Fact] + public async Task RefreshAsync_WhenDatabaseUnreachableButSnapshotOnDisk_RestoresStaleData() + { + GalaxyHierarchySnapshotStore store = CreateStore(); + await store.SaveAsync( + new GalaxyHierarchySnapshot( + LastDeployTime: new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero), + SavedAt: new DateTimeOffset(2026, 5, 20, 9, 1, 0, TimeSpan.Zero), + Hierarchy: [SampleHierarchyRow()], + Attributes: [SampleAttributeRow()]), + CancellationToken.None); + + GalaxyDeployNotifier notifier = new(); + ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable")); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store); + + await cache.RefreshAsync(CancellationToken.None); + + Assert.Equal(GalaxyCacheStatus.Stale, cache.Current.Status); + Assert.True(cache.Current.HasData); + Assert.Equal(1, cache.Current.ObjectCount); + Assert.Equal(1, cache.Current.AttributeCount); + Assert.NotNull(notifier.Latest); + } + + /// + /// Verifies that when the disk snapshot's deploy time still matches the live + /// Galaxy database, the cache promotes the restored data to Healthy + /// without re-running the heavy hierarchy and attribute queries. + /// + [Fact] + public async Task RefreshAsync_WhenSnapshotDeployMatchesLive_PromotesToHealthyWithoutHeavyQuery() + { + DateTime deployTime = new(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc); + GalaxyHierarchySnapshotStore store = CreateStore(); + await store.SaveAsync( + new GalaxyHierarchySnapshot( + LastDeployTime: new DateTimeOffset(deployTime, TimeSpan.Zero), + SavedAt: new DateTimeOffset(2026, 5, 20, 9, 1, 0, TimeSpan.Zero), + Hierarchy: [SampleHierarchyRow()], + Attributes: [SampleAttributeRow()]), + CancellationToken.None); + + GalaxyDeployNotifier notifier = new(); + StubGalaxyRepository repository = new(deployTime); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store); + + await cache.RefreshAsync(CancellationToken.None); + + Assert.Equal(GalaxyCacheStatus.Healthy, cache.Current.Status); + Assert.Equal(1, cache.Current.ObjectCount); + Assert.Equal(0, repository.GetHierarchyCount); + Assert.Equal(0, repository.GetAttributesCount); + } + + private static GalaxyHierarchyRow SampleHierarchyRow() => new() + { + GobjectId = 99, + TagName = "Pump_001", + ContainedName = "Pump", + BrowseName = "Pump", + CategoryId = 10, + TemplateChain = ["AppPump"], + }; + + private static GalaxyAttributeRow SampleAttributeRow() => new() + { + GobjectId = 99, + TagName = "Pump_001", + AttributeName = "PV", + FullTagReference = "Pump_001.PV", + MxDataType = 5, + DataTypeName = "Float", + }; + + private GalaxyHierarchySnapshotStore CreateStore() + { + string path = Path.Combine( + Path.GetTempPath(), + $"mxgw-galaxy-cache-test-{Guid.NewGuid():N}.json"); + _tempPaths.Add(path); + GalaxyRepositoryOptions options = new() + { + PersistSnapshot = true, + SnapshotCachePath = path, + }; + return new GalaxyHierarchySnapshotStore(Options.Create(options)); + } + + /// In-memory that returns fixed rowsets. + private sealed class StubGalaxyRepository( + DateTime? deployTime, + List? hierarchy = null, + List? attributes = null) : IGalaxyRepository + { + private readonly List _hierarchy = hierarchy ?? []; + private readonly List _attributes = attributes ?? []; + + public int GetHierarchyCount { get; private set; } + + public int GetAttributesCount { get; private set; } + + public Task TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(true); + + public Task GetLastDeployTimeAsync(CancellationToken ct = default) => Task.FromResult(deployTime); + + public Task> GetHierarchyAsync(CancellationToken ct = default) + { + GetHierarchyCount++; + return Task.FromResult(_hierarchy); + } + + public Task> GetAttributesAsync(CancellationToken ct = default) + { + GetAttributesCount++; + return Task.FromResult(_attributes); + } + } + + public void Dispose() + { + foreach (string path in _tempPaths) + { + try + { + File.Delete(path); + File.Delete(path + ".tmp"); + } + catch (IOException) + { + // Best-effort cleanup of test scratch files. + } + } + } + private sealed class ThrowingGalaxyRepository(Exception toThrow) : IGalaxyRepository { /// Gets the number of times was called. diff --git a/src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs b/src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs new file mode 100644 index 0000000..c28546f --- /dev/null +++ b/src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs @@ -0,0 +1,167 @@ +using Microsoft.Extensions.Options; +using MxGateway.Server.Galaxy; + +namespace MxGateway.Tests.Galaxy; + +/// +/// Covers : the on-disk persistence +/// that lets the Galaxy browse cache survive a cold start while the Galaxy +/// database is unreachable. +/// +public sealed class GalaxyHierarchySnapshotStoreTests : IDisposable +{ + private readonly List _tempPaths = []; + + [Fact] + public async Task SaveAsync_ThenTryLoadAsync_RoundTripsRows() + { + string path = CreateTempPath(); + GalaxyHierarchySnapshotStore store = CreateStore(path); + GalaxyHierarchySnapshot snapshot = SampleSnapshot(); + + await store.SaveAsync(snapshot, CancellationToken.None); + GalaxyHierarchySnapshot? loaded = await store.TryLoadAsync(CancellationToken.None); + + Assert.NotNull(loaded); + Assert.Equal(snapshot.LastDeployTime, loaded.LastDeployTime); + Assert.Equal(snapshot.SavedAt, loaded.SavedAt); + + GalaxyHierarchyRow row = Assert.Single(loaded.Hierarchy); + Assert.Equal(7, row.GobjectId); + Assert.Equal("Pump_001", row.TagName); + Assert.Equal(["AppPump", "Pump"], row.TemplateChain); + + Assert.Equal(2, loaded.Attributes.Count); + GalaxyAttributeRow withDimension = loaded.Attributes[0]; + Assert.Equal("PV", withDimension.AttributeName); + Assert.Equal(8, withDimension.ArrayDimension); + Assert.True(withDimension.IsAlarm); + Assert.Null(loaded.Attributes[1].ArrayDimension); + } + + [Fact] + public async Task TryLoadAsync_WhenNoFileExists_ReturnsNull() + { + GalaxyHierarchySnapshotStore store = CreateStore(CreateTempPath()); + + Assert.Null(await store.TryLoadAsync(CancellationToken.None)); + } + + [Fact] + public async Task SaveAsync_WhenPersistenceDisabled_WritesNothing() + { + string path = CreateTempPath(); + GalaxyHierarchySnapshotStore store = CreateStore(path, persist: false); + + await store.SaveAsync(SampleSnapshot(), CancellationToken.None); + + Assert.False(File.Exists(path)); + Assert.Null(await store.TryLoadAsync(CancellationToken.None)); + } + + [Fact] + public async Task TryLoadAsync_WhenSchemaVersionUnrecognized_ReturnsNull() + { + string path = CreateTempPath(); + await File.WriteAllTextAsync(path, """{"SchemaVersion":999,"Snapshot":null}"""); + GalaxyHierarchySnapshotStore store = CreateStore(path); + + Assert.Null(await store.TryLoadAsync(CancellationToken.None)); + } + + [Fact] + public async Task SaveAsync_OverwritesAnEarlierSnapshot() + { + string path = CreateTempPath(); + GalaxyHierarchySnapshotStore store = CreateStore(path); + + await store.SaveAsync(SampleSnapshot(), CancellationToken.None); + GalaxyHierarchySnapshot second = SampleSnapshot() with + { + Hierarchy = [], + Attributes = [], + }; + await store.SaveAsync(second, CancellationToken.None); + + GalaxyHierarchySnapshot? loaded = await store.TryLoadAsync(CancellationToken.None); + Assert.NotNull(loaded); + Assert.Empty(loaded.Hierarchy); + Assert.Empty(loaded.Attributes); + } + + private static GalaxyHierarchySnapshot SampleSnapshot() => new( + LastDeployTime: new DateTimeOffset(2026, 5, 20, 9, 30, 0, TimeSpan.Zero), + SavedAt: new DateTimeOffset(2026, 5, 20, 9, 31, 0, TimeSpan.Zero), + Hierarchy: + [ + new GalaxyHierarchyRow + { + GobjectId = 7, + TagName = "Pump_001", + ContainedName = "Pump", + BrowseName = "Pump", + CategoryId = 10, + TemplateChain = ["AppPump", "Pump"], + }, + ], + Attributes: + [ + new GalaxyAttributeRow + { + GobjectId = 7, + TagName = "Pump_001", + AttributeName = "PV", + FullTagReference = "Pump_001.PV[]", + MxDataType = 5, + DataTypeName = "Float", + IsArray = true, + ArrayDimension = 8, + IsAlarm = true, + }, + new GalaxyAttributeRow + { + GobjectId = 7, + TagName = "Pump_001", + AttributeName = "Mode", + FullTagReference = "Pump_001.Mode", + MxDataType = 3, + DataTypeName = "Integer", + ArrayDimension = null, + }, + ]); + + private static GalaxyHierarchySnapshotStore CreateStore(string path, bool persist = true) + { + GalaxyRepositoryOptions options = new() + { + PersistSnapshot = persist, + SnapshotCachePath = path, + }; + return new GalaxyHierarchySnapshotStore(Options.Create(options)); + } + + private string CreateTempPath() + { + string path = Path.Combine( + Path.GetTempPath(), + $"mxgw-galaxy-snapshot-{Guid.NewGuid():N}.json"); + _tempPaths.Add(path); + return path; + } + + public void Dispose() + { + foreach (string path in _tempPaths) + { + try + { + File.Delete(path); + File.Delete(path + ".tmp"); + } + catch (IOException) + { + // Best-effort cleanup of test scratch files. + } + } + } +}