From 9d49cdcc58376edddd8406e8b5ea2a3033dac9fc Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 13 Apr 2026 15:40:44 -0400 Subject: [PATCH] Track Galaxy Platform and AppEngine runtime state via ScanState probes and proactively invalidate descendant variable quality on Stopped transitions so operators can detect a stopped runtime host before downstream clients read stale data and so the bridge delivers a uniform bad-quality signal instead of relying on MxAccess per-tag fan-out Co-Authored-By: Claude Opus 4.6 (1M context) --- gr/queries/hierarchy.sql | 2 + runtimestatus.md | 553 ++++++++++++++++++ .../Configuration/ConfigurationValidator.cs | 8 + .../Configuration/MxAccessConfiguration.cs | 17 + .../Domain/GalaxyObjectInfo.cs | 16 + .../Domain/GalaxyRuntimeState.cs | 29 + .../Domain/GalaxyRuntimeStatus.cs | 72 +++ .../GalaxyRepositoryService.cs | 6 +- .../MxAccess/GalaxyRuntimeProbeManager.cs | 404 +++++++++++++ .../OpcUa/LmxNodeManager.cs | 188 +++++- .../OpcUa/LmxOpcUaServer.cs | 12 +- .../OpcUa/OpcUaServerHost.cs | 10 +- src/ZB.MOM.WW.LmxOpcUa.Host/OpcUaService.cs | 3 +- .../Status/HealthCheckService.cs | 21 +- .../Status/StatusData.cs | 47 ++ .../Status/StatusReportService.cs | 57 +- src/ZB.MOM.WW.LmxOpcUa.Host/appsettings.json | 4 +- .../GalaxyRuntimeProbeManagerTests.cs | 396 +++++++++++++ 18 files changed, 1831 insertions(+), 14 deletions(-) create mode 100644 runtimestatus.md create mode 100644 src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeState.cs create mode 100644 src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeStatus.cs create mode 100644 src/ZB.MOM.WW.LmxOpcUa.Host/MxAccess/GalaxyRuntimeProbeManager.cs create mode 100644 tests/ZB.MOM.WW.LmxOpcUa.Tests/MxAccess/GalaxyRuntimeProbeManagerTests.cs diff --git a/gr/queries/hierarchy.sql b/gr/queries/hierarchy.sql index 74fc08d..524faa0 100644 --- a/gr/queries/hierarchy.sql +++ b/gr/queries/hierarchy.sql @@ -68,6 +68,8 @@ SELECT DISTINCT THEN 1 ELSE 0 END AS is_area, + td.category_id AS category_id, + 
g.hosted_by_gobject_id AS hosted_by_gobject_id, ISNULL( STUFF(( SELECT '|' + tc.template_tag_name diff --git a/runtimestatus.md b/runtimestatus.md new file mode 100644 index 0000000..66a9f2d --- /dev/null +++ b/runtimestatus.md @@ -0,0 +1,553 @@ +# Plan: Galaxy Runtime Status (Platform + AppEngine Stopped/Started Detection) + +## Context + +Today the bridge has no operator-visible signal for "is Galaxy Platform X or AppEngine Y stopped or running?". The dashboard shows: + +- **MXAccess state** — one bit of truth about whether the bridge can talk to the local MxAccess runtime at all. +- **Data change dispatch rate** — aggregate throughput across every advised attribute. + +Neither catches the case an operator actually cares about: a single Platform or AppEngine in a multi-host Galaxy has stopped (operator stopped it from the IDE, the node crashed, network cut, process died, someone toggled OffScan for maintenance). The bridge keeps serving cached values, downstream OPC UA clients see stale reads, and nobody notices until somebody specifically goes looking at the affected equipment. + +Galaxy exposes `.ScanState` as a boolean system attribute on every deployed `$WinPlatform` **and** `$AppEngine`. `true` means the object is on scan and executing; anything else means not running. AppEngine state is independently observable through MxAccess (even a stopped Engine's parent Platform can still route the query) so a single probe mechanism covers both host types. + +The goal is to advise `.ScanState` for every deployed `$WinPlatform` and `$AppEngine`, surface per-host runtime state on the dashboard, drive a `Degraded` health check rule when any is down, and publish the state into the OPC UA address space so external clients can subscribe alongside the value data they already consume. + +## Design + +### Probe tag: `.ScanState` + +`ScanState` is a boolean system attribute on every deployed `$WinPlatform` and `$AppEngine`. 
The classification rule: + +``` +isRunning = status.Success && vtq.Value is bool b && b +``` + +Everything else → **Stopped**. The `ItemStatus` fields (`category`, `detail`) are still captured into `LastError` for operator diagnostics, but they don't branch the state machine. + +#### On-change delivery semantic + +MxAccess `AdviseSupervisory` delivers the current value at subscription time and then fires `OnDataChange` **only when the value changes**. `ScanState` is discrete — for a healthy host, the initial advise callback reports `true` and nothing follows until the state actually changes. There is no periodic heartbeat on the subscription. + +Implications: + +- **No starvation-based Running → Stopped transition.** A Running host will legitimately go minutes or hours without an update. The stale-threshold check for the Running state is dropped entirely. +- **Error callbacks drive the Running → Stopped transition.** MxAccess delivers a data-change callback with `ItemStatus[0].success == false` and `detail == 2 (MX_E_PlatformCommunicationError)` when a host becomes unreachable. We trust this signal — it's the broker's job to surface it, and in practice it fires quickly. +- **Stale threshold only applies to the Unknown state.** If a probe is advised but never receives a first callback (initial resolution failure, host never deployed, MxAccess routing broken), the Unknown → Stopped transition fires after `UnknownResolutionTimeoutSeconds`. This catches "the probe never came online" without tripping on healthy stable hosts. + +Subscription mechanics: + +- `AdviseSupervisory` on `.ScanState`. Supervisory variant avoids user-login requirements for bridge-owned probes — matches the pattern the node manager already uses for its own subscriptions. +- Probes are bridge-owned, not ref-counted against client subscriptions. They live for the lifetime of the address space between rebuilds. 
+- On rebuild, the probe set is diffed against the new host list and the minimum number of `AdviseSupervisory`/`Unadvise` calls are issued (see `Sync` in the probe manager). + +### Host discovery + +Galaxy Repository already has the data — we just need to surface it to the runtime layer. + +`hierarchy.sql` currently selects every deployed object where `template_definition.category_id IN (1, 3, 4, 10, 11, 13, 17, 24, 26)`. Category `1 = $WinPlatform` and `3 = $AppEngine` are already in the set. Add `template_definition.category_id` as a new column on the query so the repository loader can tag each `GalaxyObjectInfo` with its Galaxy category, and the probe manager can filter for categories 1 and 3. + +**Schema change:** add `CategoryId: int` to `GalaxyObjectInfo`, populated from `hierarchy.sql`. Small schema change, keeps the probe enumeration aligned with whatever the rest of the address space sees at each rebuild. + +### Runtime host state machine + +``` +┌─ Unknown ─┐ (initial state; advise issued, no callback yet) +│ │ +│ │ ScanState == true +│ ▼ +│ Running ◄───────────────────┐ +│ │ │ +│ │ │ ScanState == true +│ │ ScanState != true │ (recovery callback) +│ │ (false / error / │ +│ │ bad status) │ +│ ▼ │ +│ Stopped ──────────────────────┘ +│ +└─► Stopped (Unknown → Stopped after UnknownResolutionTimeoutSeconds + if no initial callback ever arrives) +``` + +Three states: + +- **Unknown** — probe advised but no callback yet. Initial state after bridge startup or a rebuild until the first `OnDataChange` for that host. If this state persists longer than `UnknownResolutionTimeoutSeconds` (default 15s), the manager's periodic check flips it to Stopped — captures the "probe never resolved" case. +- **Running** — last probe callback delivered `ScanState = true` with `ItemStatus[0].success == true`. Stays in this state until a callback changes it. No starvation-based timeout. +- **Stopped** — any of: + 1. Last probe callback had `ScanState != true` (explicit off-scan). 
+ 2. Last probe callback had `ItemStatus[0].success == false` (unreachable host). + 3. Unknown state timed out (initial resolution never completed). + 4. Initial `AdviseSupervisory` reported `ResolutionStatus` of `invalidReference` or `noGalaxyRepository`. + +### MxAccess transport down → force Unknown + +When the local MxAccess client is not connected (`IMxAccessClient.State != ConnectionState.Connected`), every probe's transport is effectively offline regardless of the underlying host state. The probe manager **forces every entry to Unknown** in its snapshot output while MxAccess is disconnected. Rationale: + +- Telling the operator that all hosts are `Stopped` is misleading — the actual problem is the local transport, which the existing Connection panel already surfaces prominently. +- Unknown is the right semantic: we don't know the host state because we can't see them right now. +- When MxAccess reconnects, the broker re-delivers probe subscriptions and the state machine resumes normally. + +Implementation: `GetSnapshot()` checks `_client.State` and rewrites `State = Unknown` (leaving the underlying `_stateByProbe` map intact for when the transport comes back). `HealthCheckService` already rolls to Unhealthy via the MxAccess-not-connected rule before the runtime status rule fires, so this doesn't create a confusing health-rollup story. + +### New types + +All in `src/ZB.MOM.WW.LmxOpcUa.Host/Domain/`: + +```csharp +public enum GalaxyRuntimeState { Unknown, Running, Stopped } + +public sealed class GalaxyRuntimeStatus +{ + public string ObjectName { get; set; } = ""; // gobject.tag_name + public int GobjectId { get; set; } + public string Kind { get; set; } = ""; // "$WinPlatform" or "$AppEngine" + public GalaxyRuntimeState State { get; set; } + public DateTime? LastStateCallbackTime { get; set; } // UTC of most recent probe callback + public DateTime? LastStateChangeTime { get; set; } // UTC of last Running↔Stopped transition + public bool? 
LastScanState { get; set; } // last ScanState value; null before first update + public string? LastError { get; set; } // MxStatus.detail description when !success + public long GoodUpdateCount { get; set; } // callbacks where ScanState == true + public long FailureCount { get; set; } // callbacks where ScanState != true or !success +} +``` + +Why two timestamps (`LastStateCallbackTime` vs `LastStateChangeTime`): on-change-only delivery means they'll match for most entries, but a callback that arrives with a different error detail while the host is already Stopped updates the callback time and `LastError` without touching `LastStateChangeTime`. The dashboard's "Since" column (see Dashboard panel) uses `LastStateChangeTime` so operators see "Stopped since 08:17:02Z" regardless of how many intervening error callbacks have refined the diagnostic detail. + +Naming note: "Galaxy runtime" is the generic term covering both `$WinPlatform` and `$AppEngine` — the dashboard and config use this neutral phrasing so the feature doesn't look like it only covers Platforms. + +### Probe manager + +New class `MxAccess/GalaxyRuntimeProbeManager.cs`, owned by `LmxNodeManager`: + +```csharp +internal sealed class GalaxyRuntimeProbeManager : IDisposable +{ + public GalaxyRuntimeProbeManager( + IMxAccessClient client, + int unknownResolutionTimeoutSeconds, + Action onHostStopped, // invoked with GobjectId on Running → Stopped + Action onHostRunning); // invoked with GobjectId on Stopped → Running + + // Called after address-space build / rebuild. Adds probes for new hosts, + // removes them for hosts no longer in the hierarchy. Idempotent. + // Caller supplies the full hierarchy; the manager filters for category_id + // 1 ($WinPlatform) and 3 ($AppEngine). + // Blocks on sequential AddItem/AdviseSupervisory SDK calls — see wiring notes. 
+ public void Sync(IReadOnlyList<GalaxyObjectInfo> hierarchy); + + // Invoked by LmxNodeManager's OnTagValueChanged callback when the address + // matches a probe tag reference. Returns true when the event was consumed + // by a probe so the data-change dispatch queue can skip it. + public bool HandleProbeUpdate(string tagRef, Vtq vtq, MxStatusProxy status); + + // Called from the MxAccess connection monitor callback (MonitorIntervalSeconds + // cadence) to advance time-based transitions: + // 1. Unknown → Stopped when UnknownResolutionTimeoutSeconds has elapsed. + // 2. Nothing for Running — no starvation check (on-change-only semantics). + public void Tick(); + + // Snapshot respects MxAccess transport state — returns all Unknown when + // the transport is disconnected, regardless of the underlying per-host state. + public IReadOnlyList<GalaxyRuntimeStatus> GetSnapshot(); + + public int ActiveProbeCount { get; } + + // Unadvise + RemoveItem on every active probe. Called from LmxNodeManager.Dispose + // before the MxAccess client teardown. Idempotent — safe to call multiple times. + public void Dispose(); +} +``` + +The two `Action` callbacks are how the probe manager triggers the subtree quality invalidation documented below — the owning `LmxNodeManager` passes references to its own `MarkHostVariablesBadQuality` and `ClearHostVariablesBadQuality` methods at construction time. The probe manager calls them synchronously on state transitions, from whichever thread delivered the probe callback (the MxAccess dispatch thread). The node manager methods acquire their own lock internally — the probe manager does not hold its own lock across the callback invocation to avoid inverted-lock-order deadlocks. + +Internals: + +- `Dictionary<string, GalaxyRuntimeStatus>` keyed by probe tag reference (`.ScanState`). +- Reverse `Dictionary<int, string>` from `GobjectId` to probe tag for `Sync` to diff against a fresh hierarchy. +- One lock guarding both maps. Operations are microsecond-scale. 
+- `Sync` filters `hierarchy` for `CategoryId == 1 || CategoryId == 3`, then compares the filtered set against the active probe set: + - Added hosts → `client.AddItem` + `AdviseSupervisory`; insert `GalaxyRuntimeStatus { State = Unknown }`. + - Removed hosts → `Unadvise` + `RemoveItem`; drop entry. + - Unchanged hosts → leave in place, preserving their state machine across the rebuild. +- `HandleProbeUpdate` is the per-callback entry point. It evaluates the `isRunning` predicate, updates `LastStateCallbackTime`, transitions state, logs at `Information` level on state changes only (not every tick), and stores the `ItemStatus` detail into `LastError` on failure. +- `Tick` runs at the existing dispatch thread cadence. For each Unknown entry, checks `LastStateCallbackTime == null && (now - _createdAt[id]) > unknownResolutionTimeoutSeconds` and flips to Stopped if so. Healthy Running entries are not touched. +- `GetSnapshot` short-circuits to "all Unknown" when `_client.State != ConnectionState.Connected`. + +### LmxNodeManager wiring + +`LmxNodeManager` constructs a `GalaxyRuntimeProbeManager` when `MxAccessConfiguration.RuntimeStatusProbesEnabled` is true. In `BuildAddressSpace` and the subtree rebuild path, after the existing loops complete, call `_probeManager.Sync(hierarchy)`. `Sync` blocks while it issues `AddItem` + `AdviseSupervisory` sequentially for each new host — for a galaxy with ~50 runtime hosts this adds roughly 500ms–1s to the address-space build on top of the existing several-second build time. Kept synchronous deliberately: the simpler correctness model is worth the startup hit, and `ActiveProbeCount` is guaranteed to be accurate the moment the build completes. + +Route the existing `OnTagValueChanged` callback through `_probeManager.HandleProbeUpdate` first — if it returns `true`, the event was consumed by a bridge-owned probe and the dispatch queue skips the normal variable-update path. 
+ +**Tick() cadence — piggyback on the MxAccess connection monitor.** The dispatch thread wakes on `_dataChangeSignal`, which only fires when tag values change. In the degenerate case where no probe ever resolves (MxAccess routing broken, bad probe tag, etc.), the dispatch loop never wakes and the Unknown → Stopped timeout would never fire. To avoid adding a new thread or timer, hook `_probeManager.Tick()` into the callback path that the existing `MxAccess.MonitorIntervalSeconds` watcher already runs — the same cadence that drives the connection-level probe-tag staleness check. A single call site covers both. + +If the monitor is not accessible from `LmxNodeManager` during implementation (it lives at a different layer in the MxAccess client), fall back to Option A from the design discussion: change the dispatch loop's `WaitOne()` call to a timed `WaitOne(500ms)` so it wakes periodically regardless of data changes. Single-line change, but requires verifying no assumptions in the existing loop break from the periodic wake-ups. + +### Service shutdown — explicit probe cleanup + +The probe manager's `Sync` handles Unadvise on diff removal when a host leaves the hierarchy. Service shutdown is a separate path that needs explicit handling: when `LmxNodeManager` is disposed, the active probe subscriptions must be torn down before the MxAccess client is closed — otherwise we rely on the client's broader shutdown to cover supervisory subscriptions, which depends on disposal ordering and may or may not clean up cleanly. + +`GalaxyRuntimeProbeManager` implements `IDisposable`. `Dispose()` walks the active probe map, calls `Unadvise` + `RemoveItem` on each entry, and clears the maps. Idempotent — calling it twice is a no-op. `LmxNodeManager.Dispose` calls `_probeManager?.Dispose()` **before** the existing teardown steps that touch the MxAccess client. 
+ +### Subtree quality invalidation on Stopped transition + +**Operational context for this section** — observed behavior from production: when an AppEngine or Platform goes OffScan, MxAccess fans out per-tag `OnDataChange` callbacks for every advised tag hosted by that runtime object, each carrying bad quality. Two symptoms result: + +1. **OPC UA client freeze** — the dispatch handler processes the flood in one cycle, pushes thousands of OPC UA value-change notifications to subscribed clients in one `Publish` response, and the client visibly stalls handling the volume. +2. **Incomplete quality flip** — some OPC UA variables retain their last good value with Good quality even after the host is down, either because the dispatch queue drops updates, or because some tags aren't in the subscribed set at the moment of the flood, or because of an edge case in the quality mapper. Operationally: clients read plausible-looking stale data from a dead host. + +The probe-driven Stopped transition is the authoritative, on-time signal we control. On that transition, the bridge proactively walks every OPC UA variable node hosted by the Stopped host and sets its `StatusCode` to `BadOutOfService`. This is independent of whether MxAccess also delivers per-tag bad-quality updates — the two signals are belt-and-suspenders for correctness. Even if the dispatch queue drops half the per-tag updates, the subtree walk guarantees the end state is uniformly Bad for every variable under the dead host. + +On the recovery `Stopped → Running` transition, the bridge walks the same set and clears the override — sets `StatusCode` back to `Good` so the cached values are visible again. Subsequent real MxAccess updates arrive on-change and overwrite value + status as normal. Trade-off: for a host that's been down a long time, some tags may show Good quality on a stale cached value for a short window after recovery, until MxAccess delivers the next on-change update for that tag. 
This matches existing bridge behavior for any slow-changing attribute and is preferable to leaving variables stuck at BadOutOfService indefinitely waiting for an update that may never come. + +**What's included in the "subtree"** — the set of variables whose owning Galaxy object is hosted (transitively) by the Stopped host. For AppEngines, this is every variable whose object's `host_gobject_id` chain reaches the Engine. For Platforms, it's every variable on every Engine hosted by the Platform, plus every object hosted directly on the Platform. This is **not** browse-tree containment — an object can live in one Area (browse parent) but be hosted by an Engine on a different Platform (runtime parent), and the host relationship is what determines the fate of its live data. + +Implementation plan for the host-to-variables mapping: + +1. Extend `hierarchy.sql` to return `gobject.host_gobject_id` as a new column if it exists. Verify during implementation — if the column is not present on this Galaxy schema version, fall back to `contained_by_gobject_id` as an approximation (less precise for edge cases where browse containment differs from runtime hosting, but sufficient for typical Galaxy topologies). +2. Extend `GalaxyObjectInfo` with `HostGobjectId: int`. +3. During `BuildAddressSpace`, as each variable is created, compute its owning host by walking `HostGobjectId` up the chain until hitting a `$WinPlatform` or `$AppEngine` (or reaching the root). Append the variable to a `Dictionary>` keyed by the host's `GobjectId`. +4. On `BuildSubtree` (incremental rebuild), the same logic runs for newly added variables. Variables that leave the hierarchy are removed from the map. The map lives next to `_nodeMap` on `LmxNodeManager`. + +New public methods on `LmxNodeManager`: + +```csharp +// Called by probe manager on Running → Stopped. Walks every variable hosted by +// gobjectId and sets its StatusCode to BadOutOfService. Safe to call multiple times. 
+// Does nothing when gobjectId has no hosted variables. +public void MarkHostVariablesBadQuality(int gobjectId); + +// Called by probe manager on Stopped → Running. Walks every variable hosted by +// gobjectId and resets StatusCode to Good. Values are left at whatever the last +// MxAccess-delivered value was; subsequent on-change updates will refresh them. +public void ClearHostVariablesBadQuality(int gobjectId); +``` + +Both methods acquire the standard node manager `Lock`, iterate the hosted list, set `StatusCode` + call `ClearChangeMasks(ctx, false)` per variable, and release the lock. The OPC UA subscription publisher picks up the change masks on its next tick and pushes notifications to subscribed clients — so operators see a single uniform quality flip per variable rather than two (one from our walk, one from the MxAccess per-tag delivery). + +### Dispatch suppression — deferred pending observation + +The subtree invalidation above addresses the **data-correctness** symptom (some variables not flipping to bad quality). The **client freeze** symptom is a separate problem: even if the quality state is correct, the bridge is still processing a thundering herd of per-tag bad-quality MxAccess callbacks through the dispatch queue, which in turn push thousands of OPC UA value-change notifications to subscribed clients. + +A stronger fix would be **dispatch suppression**: once the probe manager transitions a host to Stopped, filter out incoming MxAccess per-tag updates for any tag owned by that host before they hit the dispatch queue. The subtree walk has already captured the state; the redundant per-tag updates are pure noise. + +**This is deliberately NOT part of phase 1.** Reasons: + +- The subtree walk may make the freeze disappear entirely. 
If the dispatch queue processes the flood but the notifications it pushes are now duplicates of change masks the walk already set, the SDK may coalesce them into a single publish cycle and the client sees one notification batch rather than thousands. We want to observe whether this is the case before building suppression. +- If the freeze persists after subtree invalidation ships, we have a real measurement of the residual problem to inform the suppression design (which hosts, which tags, how much batching, whether to also coalesce at the OPC UA publisher level). +- The suppression path has a subtle failure mode: if the probe is briefly wrong (race where the probe says Stopped but the host actually recovered), we'd drop legitimate updates for a few seconds until the probe catches up. For an on-change-only probe this is bounded, but the plan should justify the trade-off against real observed data. + +Phase 2 decision gate: after shipping phase 1 and observing the post-subtree-walk behavior against a real AppEngine stop, decide whether dispatch suppression is still needed and design it against the real measurement. + +### OPC UA address space exposure + +Per-host status should be readable by OPC UA clients, not just the dashboard. Add child variable nodes under each `$WinPlatform` / `$AppEngine` object node in the address space. **All bridge-synthetic nodes use a `$` prefix** so they can never collide with user-defined attributes on extended templates: + +- `.$RuntimeState` (`String`) — `Unknown` / `Running` / `Stopped`. +- `.$LastCallbackTime` (`DateTime`) — most recent probe callback regardless of transition. +- `.$LastScanState` (`Boolean`) — last `ScanState` value received; null before first update. +- `.$LastStateChangeTime` (`DateTime`) — most recent Running↔Stopped transition, backs the dashboard "Since" column. +- `.$FailureCount` (`Int64`) +- `.$LastError` (`String`) — last non-success MxStatus detail, empty string when null. 
+ +These read from the probe manager's snapshot (bridge-synthetic, no MxAccess round-trip) and are updated via `ChangeBits.Value` signalling when the state transitions. Read-only. + +Note: the underlying `.ScanState` Galaxy attribute will already appear in the address space via the normal hierarchy-build path, so downstream clients will see both the raw attribute (`ns=3;s=DevPlatform.ScanState`) and the synthesized state rollup (`ns=3;s=DevPlatform.$RuntimeState`). Intentional — the raw attribute is the ground truth, the rollup adds state-change timestamps and the Unknown/Running/Stopped trichotomy. + +Namespace placement: under the existing host object node in the Galaxy namespace (`ns=3`), browseable at `DevPlatform/$RuntimeState` etc. No new namespace needed. + +### Dashboard + +#### Runtime Status panel + +New `RuntimeStatusInfo` class on `StatusData`: + +```csharp +public class RuntimeStatusInfo +{ + public int Total { get; set; } + public int RunningCount { get; set; } + public int StoppedCount { get; set; } + public int UnknownCount { get; set; } + public List Hosts { get; set; } = new(); +} +``` + +Populated in `StatusReportService` via a new `LmxNodeManager.RuntimeStatuses` accessor. Renders between the Galaxy Info panel and the Historian panel. + +Panel color: +- **Green** — all hosts Running. +- **Yellow** — at least one Unknown, zero Stopped. +- **Red** — at least one Stopped. +- **Gray** — MxAccess disconnected (all hosts Unknown; the Connection panel is the primary signal). 
+ +HTML layout: +``` +┌ Galaxy Runtime ───────────────────────────────────────────────────────┐ +│ 5 of 6 hosts running (3 platforms, 3 engines) │ +│ ┌─────────────────┬──────────────┬─────────┬──────────────────────┐ │ +│ │ Name │ Kind │ State │ Since │ │ +│ ├─────────────────┼──────────────┼─────────┼──────────────────────┤ │ +│ │ DevPlatform │ $WinPlatform │ Running │ 2026-04-13T08:15:02Z │ │ +│ │ DevAppEngine │ $AppEngine │ Running │ 2026-04-13T08:15:04Z │ │ +│ │ PlatformA │ $WinPlatform │ Running │ 2026-04-13T08:15:03Z │ │ +│ │ EngineA_1 │ $AppEngine │ Running │ 2026-04-13T08:15:05Z │ │ +│ │ EngineA_2 │ $AppEngine │ Stopped │ 2026-04-13T14:28:03Z │ │ +│ │ PlatformB │ $WinPlatform │ Running │ 2026-04-13T08:15:04Z │ │ +│ └─────────────────┴──────────────┴─────────┴──────────────────────┘ │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +The "Since" column backs on `LastStateChangeTime` and its meaning depends on the row's current state: "Running since X" reads as "has been on scan since X", "Stopped since X" reads as "has been off scan since X". For Unknown rows, display "Advised since X" instead (the probe was registered at X but has not yet received its first callback). + +#### Subscriptions panel — break out bridge probe count + +The existing Subscriptions panel shows `Active: N` — the total advised-item count from `IMxAccessClient.ActiveSubscriptionCount`. After this ships, that number will include the bridge-owned runtime probes (one per Platform + one per AppEngine), which would look like a silent jump to operators watching for capacity planning purposes. 
+ +Fix: expose a new `ActiveProbeSubscriptionCount` property on `LmxNodeManager` (wired from `GalaxyRuntimeProbeManager.ActiveProbeCount`) and render as a second line on the Subscriptions panel: + +``` +┌ Subscriptions ──────────────────────────────┐ +│ Active: 1247 │ +│ Probes: 6 (bridge-owned runtime status) │ +└──────────────────────────────────────────────┘ +``` + +The `Active` total continues to include probes (no subtraction) so the count still matches whatever MxAccess actually holds — the breakout line tells operators which slice is bridge-internal. + +### HealthCheckService rule + +New rule in `HealthCheckService.CheckHealth`: + +``` +Rule 2e: Any Galaxy runtime host in Stopped state → Degraded + - Yellow panel + - Message: "N of M hosts stopped: Host1, Host2" +``` + +Rationale: the bridge is still able to talk to the local MxAccess runtime and serve cached values for the hosts that are up, so this is `Degraded` rather than `Unhealthy`. A stopped host is recoverable — the operator fixes it and the probe automatically transitions back to `Running`. + +Rule ordering matters: this rule checks after the MxAccess-connected check (Rule 1), so when MxAccess is disconnected the service is Unhealthy on Rule 1 and the runtime-host rule never runs — avoids the confusing "MxAccess down AND Galaxy runtime degraded" double message. + +### Configuration + +New fields on `MxAccessConfiguration` (not a new config class — this is a runtime concern of the MxAccess bridge): + +```csharp +public class MxAccessConfiguration +{ + // ...existing fields... + + /// + /// Enables per-host runtime status probing via AdviseSupervisory on + /// <ObjectName>.ScanState for every deployed $WinPlatform + /// and $AppEngine. Default enabled when a deployed ArchestrA Platform + /// is present. Set false for bridges that don't need multi-host + /// visibility and want to minimize subscription count. 
+ /// + public bool RuntimeStatusProbesEnabled { get; set; } = true; + + /// + /// Maximum seconds to wait for the initial probe callback before marking + /// an Unknown host as Stopped. Only applies to the Unknown → Stopped + /// transition; Running hosts do not time out (ScanState is delivered + /// on-change only, so a stable healthy host may go indefinitely without + /// a callback). Default 15s. + /// + public int RuntimeStatusUnknownTimeoutSeconds { get; set; } = 15; +} +``` + +No new top-level config section. Validator emits a warning if the timeout is shorter than 5 seconds (below the reasonable floor for MxAccess initial-resolution latency). + +## Critical Files + +### Modified +- `src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyObjectInfo.cs` — add `CategoryId: int` and `HostGobjectId: int` +- `src/ZB.MOM.WW.LmxOpcUa.Host/GalaxyRepository/GalaxyRepositoryService.cs` — include `template_definition.category_id` and `gobject.host_gobject_id` in `HierarchySql` and the reader (falling back to `contained_by_gobject_id` if host column is unavailable) +- `gr/queries/hierarchy.sql` — same column additions (documentation query) +- `src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/MxAccessConfiguration.cs` — add `RuntimeStatusProbesEnabled` + `RuntimeStatusUnknownTimeoutSeconds` +- `src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxNodeManager.cs` — construct probe manager, wire `OnTagValueChanged` and the MxAccess monitor callback, build `_hostedVariables: Dictionary>` during address-space construction, expose `RuntimeStatuses` / `ActiveProbeSubscriptionCount` / `MarkHostVariablesBadQuality` / `ClearHostVariablesBadQuality` +- `src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusData.cs` — add `RuntimeStatusInfo`; add `ProbeSubscriptionCount` field on `SubscriptionInfo` +- `src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusReportService.cs` — populate from node manager, render Runtime Status panel + Probes line +- `src/ZB.MOM.WW.LmxOpcUa.Host/Status/HealthCheckService.cs` — new Rule 2e (after Rule 1 to avoid 
double-messaging when MxAccess is down) +- `src/ZB.MOM.WW.LmxOpcUa.Host/appsettings.json` — new MxAccess fields with defaults +- `src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/ConfigurationValidator.cs` — timeout floor warning +- `docs/MxAccessBridge.md` — document the probe pattern and on-change semantics +- `docs/StatusDashboard.md` — add `RuntimeStatusInfo` field table and Probes line +- `docs/Configuration.md` — add the two new MxAccess fields + +### New +- `src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeStatus.cs` +- `src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeState.cs` +- `src/ZB.MOM.WW.LmxOpcUa.Host/MxAccess/GalaxyRuntimeProbeManager.cs` +- `tests/ZB.MOM.WW.LmxOpcUa.Tests/MxAccess/GalaxyRuntimeProbeManagerTests.cs` + +## Execution order + +1. **DTO + enum** — `GalaxyRuntimeState`, `GalaxyRuntimeStatus`. +2. **Hierarchy schema** — add `CategoryId` to `GalaxyObjectInfo`, extend `HierarchySql` to select `td.category_id` as a new column, update `GalaxyRepositoryService` reader. +3. **Config** — add the two new `MxAccessConfiguration` fields and validator rule. +4. **Probe manager class + unit tests (TDD)** — write `GalaxyRuntimeProbeManagerTests.cs` first. Fake `IMxAccessClient` with scripted `OnTagValueChanged` invocations, configurable `State`, and a fake clock. Exercise the full matrix in the test plan below. +5. **Ship tests green before touching node manager.** +6. **Host-to-variables mapping in node manager** — add `_hostedVariables: Dictionary<int, List<BaseDataVariableState>>` populated during `BuildAddressSpace`. For each variable node, walk its owning object's `HostedByGobjectId` chain up to the nearest `$WinPlatform` or `$AppEngine` and append to that host's list. On rebuild (`BuildSubtree`), incrementally maintain the map. Expose `MarkHostVariablesBadQuality(int gobjectId)` and `ClearHostVariablesBadQuality(int gobjectId)` public methods that take the node manager `Lock`, iterate the hosted list, set/clear `StatusCode`, and call `ClearChangeMasks(ctx, false)` per variable. +7.
**Node manager wiring** — construct `GalaxyRuntimeProbeManager`, pass `MarkHostVariablesBadQuality` / `ClearHostVariablesBadQuality` as its `onHostStopped` / `onHostRunning` callbacks, call `Sync` after `BuildAddressSpace` / rebuild, route `OnTagValueChanged` through `HandleProbeUpdate`, hook `Tick()` into the MxAccess connection-monitor callback path (fall back to timed `WaitOne(500ms)` on the dispatch loop if the monitor isn't reachable from the node manager). Add `RuntimeStatuses` and `ActiveProbeSubscriptionCount` accessors. Call `_probeManager?.Dispose()` from `LmxNodeManager.Dispose` **before** the existing MxAccess client teardown steps. +8. **OPC UA synthetic nodes** — under each `$WinPlatform` and `$AppEngine` node in BuildAddressSpace, add the six `$`-prefixed variables backed by lambdas that read from the probe manager snapshot. +9. **Dashboard** — `RuntimeStatusInfo` on `StatusData`, `BuildRuntimeStatusInfo` in `StatusReportService`, render Runtime Status panel, add Probes line to Subscriptions panel. Status tests asserting both. +10. **Health check** — new Rule 2e with test: Degraded when any host is stopped, message names the stopped hosts. +11. **Integration tests** — `LmxNodeManagerBuildTests` additions with a fake repository containing mixed `$WinPlatform` and `$AppEngine` hierarchy entries; verify `Sync` is called, synthetic nodes are created on both host types, `_hostedVariables` map is populated, and `MarkHostVariablesBadQuality` / `ClearHostVariablesBadQuality` flip status codes on the correct subset. +12. **Docs** — `MxAccessBridge.md`, `StatusDashboard.md`, `Configuration.md`. +13. **Deploy** — backup, deploy both instances, verify via dashboard. +14. **Live verification** — see Verification section below. + +## Test plan + +### `GalaxyRuntimeProbeManagerTests.cs` — unit tests with fake client + fake clock + +**State transitions** +- Fresh manager → empty snapshot. 
+- `Sync` with one Platform + one Engine → snapshot contains two entries in `Unknown`, `Kind` set correctly. +- First `ScanState = true` update → Unknown → Running, `LastUpdateTime` and `LastScanState = true` set, `GoodUpdateCount == 1`. +- Second `ScanState = true` update → still Running, counter increments. +- `ScanState = false` update → Running → Stopped, `LastScanState = false`, `FailureCount == 1`. +- `ItemStatus[0].success = false, detail = 2` update → Running → Stopped, `LastError` contains `MX_E_PlatformCommunicationError`. +- Null value delivered → Running → Stopped defensively, `LastError` explains null-value rejection. +- Recovery `ScanState = true` after Stopped → Stopped → Running, `LastStateChangeTime` updated, `LastError` cleared. +- Platform and AppEngine transitions behave identically (parameterized test). + +**Unknown resolution timeout** +- No callback + clock advances past timeout → Unknown → Stopped. +- Good update just before timeout → Unknown → Running (no subsequent Stopped). +- Good update after timeout already flipped Unknown → Stopped → Stopped → Running (recovery path still works). +- `Tick` on a Running entry with no recent update → still Running (no starvation check — this is the critical on-change-semantic guarantee). + +**MxAccess transport gating** +- Client `State = Disconnected` → `GetSnapshot` returns all entries with `State = Unknown` regardless of underlying state. +- Client flips Connected → Disconnected → underlying state preserved internally; snapshot reports Unknown. +- Client flips Disconnected → Connected → snapshot reflects underlying state again. +- Incoming `HandleProbeUpdate` while client is Disconnected → still updates the underlying state machine (so the snapshot is correct when transport comes back). + +**Sync diff behavior** +- Sync with new Platform → Advise called once, counter = 1. +- Sync with new Engine → Advise called once, counter = 1. 
+- Sync twice with same hosts → Advise called once total (idempotent on unchanged entries). +- Sync then Sync with a Platform removed → Unadvise called, snapshot loses entry. +- Sync with different host set → Advise for new, Unadvise for old, unchanged preserved. +- Sync filters out non-runtime categories (areas, user objects) — hierarchy with 10 mixed categories and 2 runtime hosts produces exactly 2 probes. + +**Event routing** +- `HandleProbeUpdate(probeAddr, ...)` → returns `true`, updates state. +- `HandleProbeUpdate(nonProbeAddr, ...)` → returns `false`, no state change. +- Concurrent `Sync` + `HandleProbeUpdate` under lock → no corruption (thread-safety smoke test). +- Callback arriving after Sync removed the entry → `HandleProbeUpdate` returns false (entry not found), no crash. + +**Counters** +- `ActiveProbeCount == 2` after Sync with 1 Platform + 1 Engine. +- `ActiveProbeCount` decrements when a host is removed via Sync. +- `ActiveProbeCount == 0` on a fresh manager with no Sync called yet. + +**Dispose** +- Dispose on a fresh manager → no-op, no Unadvise calls on the fake client. +- Dispose after Sync with 3 hosts → 3 Unadvise + 3 RemoveItem calls on the fake client. +- Dispose twice → second call is idempotent, no extra Unadvise calls. +- HandleProbeUpdate after Dispose → returns false defensively (no crash, no state change). +- Sync after Dispose → no-op or throws ObjectDisposedException (pick one; test documents whichever is chosen). + +**Subtree invalidation callbacks** +- Construct probe manager with spy callbacks tracking `(gobjectId, kind)` tuples for each call. +- Running → Stopped transition → `onHostStopped` invoked exactly once with the correct GobjectId, `onHostRunning` never called. +- Stopped → Running transition → `onHostRunning` invoked exactly once with the correct GobjectId, `onHostStopped` never called. 
+- Unknown → Running (initial callback) → no invocation of either callback (only Running↔Stopped transitions trigger them, not fresh Unknown→Running). +- Unknown → Stopped (via timeout) → `onHostStopped` invoked once. +- Multiple consecutive callbacks with `ScanState=true` while already Running → no extra `onHostRunning` invocations. +- Multiple consecutive error callbacks while already Stopped → no extra `onHostStopped` invocations. +- Callback throws exception → probe manager logs a warning, updates its internal state regardless, does not propagate. + +### `LmxNodeManagerBuildTests` additions + +- Build address space with a `$WinPlatform` in the fake hierarchy → probe manager receives a `Sync` call with one entry. +- Build address space with a mix (1 Platform + 2 AppEngines + 5 user objects) → probe manager Sync receives exactly 3 runtime hosts. +- Build + rebuild with different host set → probe manager's `Sync` called twice with correct diff. +- Address space contains synthetic `$RuntimeState` variable under each host object node. +- `ActiveProbeSubscriptionCount` reflects probe count after build. + +### Host-to-variables mapping + subtree invalidation tests + +- Build address space with 1 `$AppEngine` hosting 2 user objects with 3 attributes each → `_hostedVariables[engineId]` contains 6 variable nodes. +- Build address space with 1 `$WinPlatform` hosting 2 `$AppEngine`s, each hosting 3 user objects with 2 attributes each → `_hostedVariables[platformId]` contains the 2 Engine nodes + 12 attribute variables; `_hostedVariables[engineId]` contains its 6 attribute variables. (Platform and Engine entries both exist; a single variable can appear in both lists.) +- Rebuild with a different set → the map is rebuilt from scratch; old entries are released. +- `MarkHostVariablesBadQuality(engineId)` → every variable in `_hostedVariables[engineId]` has `StatusCode = BadOutOfService` after the call; variables hosted by other engines are unchanged. 
+- `ClearHostVariablesBadQuality(engineId)` → every variable in that host's list has `StatusCode = Good` after the call. +- `MarkHostVariablesBadQuality` on a GobjectId with no entry in the map → no-op, no crash. +- `MarkHostVariablesBadQuality` followed by a fresh MxAccess update on one of the variables → the update's Value + Status overwrites the forced Bad (confirms no "override layer" confusion; the simple StatusCode set is naturally overwritten by the normal dispatch path). +- `MarkHostVariablesBadQuality` acquires the node manager `Lock` (verify no deadlock when called from a thread that also needs the lock). + +### End-to-end subtree invalidation integration test + +- Fake repository with 1 Engine hosting 10 attributes. All on advise. All have some recent value with Good status. +- Simulate probe callback delivering `ScanState = false` for the Engine → probe manager flips to Stopped, invokes `onHostStopped`, which in turn walks the 10 variables and flips them to BadOutOfService. +- Assert all 10 variables now report StatusCode = BadOutOfService after a `client.Read` round-trip. +- Simulate probe callback delivering `ScanState = true` again → probe manager flips to Running, `onHostRunning` clears the override, all 10 variables now report StatusCode = Good. + +### `StatusReportServiceTests` additions + +- HTML contains `

<h2>Galaxy Runtime</h2>

` when at least one runtime host is present. +- HTML rendering distinguishes `$WinPlatform` and `$AppEngine` rows in the Kind column. +- JSON exposes `RuntimeStatus.Total`, `RuntimeStatus.RunningCount`, `RuntimeStatus.StoppedCount`, `RuntimeStatus.Hosts[]`. +- Subscriptions panel HTML contains a `Probes:` line when `ProbeSubscriptionCount > 0`. +- No Runtime Status panel when the fake repository has zero runtime hosts. +- When fake MxAccess client is `Disconnected`, all host rows render `Unknown` regardless of state passed in. + +### `HealthCheckServiceTests` additions + +- All hosts running → Healthy. +- One host stopped → Degraded, message mentions the stopped host name. +- All hosts stopped → Degraded (not Unhealthy — cached values still served). +- MxAccess disconnected + one host stopped → Unhealthy via Rule 1 (runtime status rule doesn't fire). + +## Verification + +1. `dotnet build` clean on both Host and plugin. +2. `dotnet test tests/ZB.MOM.WW.LmxOpcUa.Tests --filter "FullyQualifiedName~GalaxyRuntimeProbe|FullyQualifiedName~Status|FullyQualifiedName~HealthCheck"` → all pass. +3. Deploy to instance1 (default `RuntimeStatusProbesEnabled: true`, `RuntimeStatusUnknownTimeoutSeconds: 15`). Dashboard shows `Galaxy Runtime: 2 of 2 hosts running` (DevPlatform + DevAppEngine) immediately after startup, all green. Subscriptions panel shows `Probes: 2`. +4. Stop `DevAppEngine` from the IDE (SMC `SetToOffScan` or the engine's stop action, leaving its parent Platform running). Verify: + - Dashboard panel turns red within ~1s of the action. + - DevAppEngine row shows `Stopped` with the last good timestamp. + - DevPlatform row remains `Running` — confirms the engines are independently observable. + - Overall Health rolls up to `Degraded`. + - CLI `read ns=3;s=DevAppEngine.$RuntimeState` returns `"Stopped"`. + - Log has an `Information` line "Galaxy runtime DevAppEngine ($AppEngine) transitioned Running → Stopped". 
+ - **Subtree invalidation**: CLI `read ns=3;s=TestMachine_001.MachineID` and any other tag under an object hosted by DevAppEngine returns status code `BadOutOfService` (or whatever specific code the Mark method uses). Every descendant tag, not just a sample — sweep-test via a browse + read across the whole address space. Operators also observe this on the dashboard Alarms / subscribed-variable reads if they're watching any particular value. + - **Client-freeze observation**: subscribe an OPC UA client to a handful of variables under DevAppEngine before step 4, then trigger the stop. Note whether the client handles the resulting notification batch cleanly (ideal) or visibly stalls (residual problem that dispatch suppression would need to address in phase 2). Document the observed behavior in the phase-2 decision gate for dispatch suppression. +5. Start `DevAppEngine` again (`SetToOnScan`). Verify: + - Dashboard flips back to green within ~1s. + - CLI read of `$RuntimeState` returns `"Running"`. + - Log has a "Galaxy runtime DevAppEngine ($AppEngine) transitioned Stopped → Running" line. + - **Subtree recovery**: descendant tags previously showing `BadOutOfService` now show `Good` status. Values may initially be stale (whatever was cached at stop time) until fresh on-change MxAccess updates arrive; this matches the design trade-off documented in the Subtree Quality Invalidation section. +6. Stop `DevPlatform` entirely (full platform stop). Verify: + - Both DevPlatform and DevAppEngine flip to `Stopped` (the Platform takes the Engine down with it). + - Log records both transitions. + - CLI reads of `$RuntimeState` for both hosts return `"Stopped"`. + - The underlying raw `ScanState` attribute reads may return BadCommunicationError — operator sees the distinction between the cached rollup and the live raw attribute. +7. Simulate MxAccess transport loss — e.g., stop the ArchestrA runtime on the local node or kill the probe connection. 
Verify: + - Every host row in the Runtime Status panel renders `Unknown` (not Stopped) while the Connection panel reports `Disconnected`. + - Overall Health is `Unhealthy` via Rule 1, NOT `Degraded` via Rule 2e (the rules should not double-message). + - After MxAccess reconnects, the runtime rows revert to their actual underlying states. +8. Deploy to instance2 with same config. Both instances should show consistent state since they observe the same local ArchestrA runtime. +9. Smoke-test: disable probes via `RuntimeStatusProbesEnabled: false`, restart, verify Runtime Status panel absent from HTML, `Probes:` line absent from Subscriptions panel, no probe subscriptions advised (log and `ActiveSubscriptionCount` delta) — backward compatibility path for deployments that don't want the feature. + +10. **Unresolvable-probe-tag behavior verification** — temporarily add a bogus tag to the probe set to discover how MxAccess surfaces resolution failures. The simplest way is to force the probe manager to advise a made-up `NoSuchPlatform_999.ScanState` reference during a test boot, then observe: + - Does MxAccess deliver a data-change callback with `ItemStatus[0].success = false` and a resolution-failure detail? If yes, the host row transitions Unknown → Stopped within ~1s via the error-callback path, and `LastError` carries the detail. Tighten the plan's language to say "MxAccess surfaces resolution failures as error callbacks" and optionally tighten `RuntimeStatusUnknownTimeoutSeconds` downward. + - Or does MxAccess silently drop the advise with no callback at all? If yes, the bogus host stays Unknown until `RuntimeStatusUnknownTimeoutSeconds` elapses, then flips to Stopped via the Unknown-timeout backstop. Tighten the plan's language to say "MxAccess does not surface resolution failures; the Unknown-timeout is the only detection path" and leave the default timeout as-is. 
+ - Document the observed behavior in `docs/MxAccessBridge.md` alongside the probe pattern section so operators know which detection path their deployment relies on. + - Remove the bogus tag and restart before handing over. + +## Open questions (phase 2/3 scope — not blocking phase 1) + +1. **Dispatch suppression for Stopped hosts** (phase 2 decision gate) — once phase 1 ships with subtree invalidation, observe whether the client-freeze symptom persists. If it does, design dispatch suppression: filter MxAccess per-tag updates before they hit the dispatch queue when the owning host is Stopped. Requires a `tagRef → owning-host GobjectId` map (which `_hostedVariables` already implies, inverted). Trade-off is dropping legitimate updates during brief probe/reality mismatch windows. Decide after real measurement. + +2. **Should the probe manager expose transition events?** Synthetic OPC UA event notifier on each host object that fires when `$RuntimeState` transitions. Phase 2 stretch — operators get per-host polling via the dashboard panel today; events would let clients subscribe without polling. + +3. **Multi-node Galaxies** — Platform on a remote node shows up in the hierarchy but probes fire through the local MxAccess runtime's node. The probe semantics should still work because MxAccess routes inter-Platform queries transparently, but worth confirming during step 4 if the environment has a multi-node Galaxy. + +4. **Is `ScanState` writable?** Some Galaxy system attributes are writable via MxAccess (SetScan method on the object) which would let an operator start/stop a host through the OPC UA bridge. Phase 3 possibility — would require a gating security classification since it's a runtime control action, not a data write. 
diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/ConfigurationValidator.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/ConfigurationValidator.cs index d2dcf19..6002612 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/ConfigurationValidator.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/ConfigurationValidator.cs @@ -73,6 +73,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration "MxAccess.MonitorInterval={MonitorInterval}s, AutoReconnect={AutoReconnect}, ProbeTag={ProbeTag}, ProbeStaleThreshold={ProbeStale}s", config.MxAccess.MonitorIntervalSeconds, config.MxAccess.AutoReconnect, config.MxAccess.ProbeTag ?? "(none)", config.MxAccess.ProbeStaleThresholdSeconds); + Log.Information( + "MxAccess.RuntimeStatusProbesEnabled={Enabled}, RuntimeStatusUnknownTimeoutSeconds={Timeout}s", + config.MxAccess.RuntimeStatusProbesEnabled, config.MxAccess.RuntimeStatusUnknownTimeoutSeconds); if (string.IsNullOrWhiteSpace(config.MxAccess.ClientName)) { @@ -80,6 +83,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration valid = false; } + if (config.MxAccess.RuntimeStatusUnknownTimeoutSeconds < 5) + Log.Warning( + "MxAccess.RuntimeStatusUnknownTimeoutSeconds={Timeout} is below the recommended floor of 5s; initial probe resolution may time out before MxAccess has delivered the first callback", + config.MxAccess.RuntimeStatusUnknownTimeoutSeconds); + // Galaxy Repository Log.Information( "GalaxyRepository.ConnectionString={ConnectionString}, ChangeDetectionInterval={ChangeInterval}s, CommandTimeout={CmdTimeout}s, ExtendedAttributes={ExtendedAttributes}", diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/MxAccessConfiguration.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/MxAccessConfiguration.cs index a8da486..db1ba77 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/MxAccessConfiguration.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Configuration/MxAccessConfiguration.cs @@ -55,5 +55,22 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Configuration /// Gets or sets the number of 
seconds a probe value may remain unchanged before the connection is considered stale. /// public int ProbeStaleThresholdSeconds { get; set; } = 60; + + /// + /// Gets or sets a value indicating whether the bridge advises <ObjectName>.ScanState for every + /// deployed $WinPlatform and $AppEngine, reporting per-host runtime state on the status + /// dashboard and proactively invalidating OPC UA variable quality when a host transitions to Stopped. + /// Enabled by default. Disable to return to legacy behavior where host runtime state is invisible and + /// MxAccess's per-tag bad-quality fan-out is the only stop signal. + /// + public bool RuntimeStatusProbesEnabled { get; set; } = true; + + /// + /// Gets or sets the maximum seconds to wait for the initial probe callback before marking a host as + /// Stopped. Only applies to the Unknown → Stopped transition. Because ScanState is delivered + /// on-change only, a stably Running host does not time out — no starvation check runs on Running + /// entries. Default 15s. + /// + public int RuntimeStatusUnknownTimeoutSeconds { get; set; } = 15; } } \ No newline at end of file diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyObjectInfo.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyObjectInfo.cs index 60e6f77..82884e9 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyObjectInfo.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyObjectInfo.cs @@ -44,5 +44,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Domain /// to decide whether an object's alarms should be monitored. /// public List TemplateChain { get; set; } = new(); + + /// + /// Gets or sets the Galaxy template category id for this object. Category 1 is $WinPlatform, + /// 3 is $AppEngine, 13 is $Area, 10 is $UserDefined, and so on. Populated from + /// template_definition.category_id by hierarchy.sql and consumed by the runtime + /// status probe manager to identify hosts that should receive a ScanState probe. 
+ /// + public int CategoryId { get; set; } + + /// + /// Gets or sets the Galaxy object id of this object's runtime host, populated from + /// gobject.hosted_by_gobject_id. Walk this chain upward to find the nearest + /// $WinPlatform or $AppEngine ancestor for subtree quality invalidation when + /// a runtime host is reported Stopped. Zero for root objects that have no host. + /// + public int HostedByGobjectId { get; set; } } } \ No newline at end of file diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeState.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeState.cs new file mode 100644 index 0000000..60096d0 --- /dev/null +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeState.cs @@ -0,0 +1,29 @@ +namespace ZB.MOM.WW.LmxOpcUa.Host.Domain +{ + /// + /// Runtime state of a deployed Galaxy runtime host ($WinPlatform or $AppEngine) as + /// observed by the bridge via its ScanState probe. + /// + public enum GalaxyRuntimeState + { + /// + /// Probe advised but no callback received yet. Transitions to + /// on the first successful ScanState = true callback, or to + /// once the unknown-resolution timeout elapses. + /// + Unknown, + + /// + /// Last probe callback reported ScanState = true with a successful item status. + /// The host is on scan and executing. + /// + Running, + + /// + /// Last probe callback reported ScanState != true, or a failed item status, or + /// the initial probe never resolved before the unknown timeout elapsed. The host is + /// off scan or unreachable. 
+ /// + Stopped + } +} diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeStatus.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeStatus.cs new file mode 100644 index 0000000..5654e74 --- /dev/null +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Domain/GalaxyRuntimeStatus.cs @@ -0,0 +1,72 @@ +using System; + +namespace ZB.MOM.WW.LmxOpcUa.Host.Domain +{ + /// + /// Point-in-time runtime state of a single Galaxy runtime host ($WinPlatform or $AppEngine) + /// as tracked by the GalaxyRuntimeProbeManager. Surfaced on the status dashboard and + /// consumed by HealthCheckService so operators can detect a stopped host before + /// downstream clients notice the stale data. + /// + public sealed class GalaxyRuntimeStatus + { + /// + /// Gets or sets the Galaxy tag_name of the host (e.g., DevPlatform or + /// DevAppEngine). + /// + public string ObjectName { get; set; } = ""; + + /// + /// Gets or sets the Galaxy gobject_id of the host. + /// + public int GobjectId { get; set; } + + /// + /// Gets or sets the Galaxy template category name — $WinPlatform or + /// $AppEngine. Used by the dashboard to group hosts by kind. + /// + public string Kind { get; set; } = ""; + + /// + /// Gets or sets the current runtime state. + /// + public GalaxyRuntimeState State { get; set; } + + /// + /// Gets or sets the UTC timestamp of the most recent probe callback, whether it + /// reported success or failure. before the first callback. + /// + public DateTime? LastStateCallbackTime { get; set; } + + /// + /// Gets or sets the UTC timestamp of the most recent transition. + /// Backs the dashboard "Since" column. in the initial Unknown + /// state before any transition. + /// + public DateTime? LastStateChangeTime { get; set; } + + /// + /// Gets or sets the last ScanState value received from the probe, or + /// before the first update or when the last callback carried + /// a non-success item status (no value delivered). + /// + public bool? 
LastScanState { get; set; } + + /// + /// Gets or sets the detail message from the most recent failure callback, cleared on + /// the next successful ScanState = true delivery. + /// + public string? LastError { get; set; } + + /// + /// Gets or sets the cumulative number of callbacks where ScanState = true. + /// + public long GoodUpdateCount { get; set; } + + /// + /// Gets or sets the cumulative number of callbacks where ScanState != true + /// or the item status reported failure. + /// + public long FailureCount { get; set; } + } +} diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/GalaxyRepository/GalaxyRepositoryService.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/GalaxyRepository/GalaxyRepositoryService.cs index f45797a..a50fad6 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/GalaxyRepository/GalaxyRepositoryService.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/GalaxyRepository/GalaxyRepositoryService.cs @@ -50,7 +50,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.GalaxyRepository while (await reader.ReadAsync(ct)) { - var templateChainRaw = reader.IsDBNull(6) ? "" : reader.GetString(6); + var templateChainRaw = reader.IsDBNull(8) ? "" : reader.GetString(8); var templateChain = string.IsNullOrEmpty(templateChainRaw) ? 
new List() : templateChainRaw.Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries) @@ -66,6 +66,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.GalaxyRepository BrowseName = reader.GetString(3), ParentGobjectId = Convert.ToInt32(reader.GetValue(4)), IsArea = Convert.ToInt32(reader.GetValue(5)) == 1, + CategoryId = Convert.ToInt32(reader.GetValue(6)), + HostedByGobjectId = Convert.ToInt32(reader.GetValue(7)), TemplateChain = templateChain }); } @@ -234,6 +236,8 @@ SELECT DISTINCT THEN 1 ELSE 0 END AS is_area, + td.category_id AS category_id, + g.hosted_by_gobject_id AS hosted_by_gobject_id, ISNULL( STUFF(( SELECT '|' + tc.template_tag_name diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/MxAccess/GalaxyRuntimeProbeManager.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/MxAccess/GalaxyRuntimeProbeManager.cs new file mode 100644 index 0000000..af3779d --- /dev/null +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/MxAccess/GalaxyRuntimeProbeManager.cs @@ -0,0 +1,404 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Serilog; +using ZB.MOM.WW.LmxOpcUa.Host.Domain; + +namespace ZB.MOM.WW.LmxOpcUa.Host.MxAccess +{ + /// + /// Advises <ObjectName>.ScanState on every deployed $WinPlatform and + /// $AppEngine, tracks their runtime state (Unknown / Running / Stopped), and notifies + /// the owning node manager on Running↔Stopped transitions so it can proactively flip every + /// OPC UA variable hosted by that object to BadOutOfService (and clear on recovery). + /// + /// + /// State machine semantics are documented in runtimestatus.md. Key facts: + /// + /// ScanState is delivered on-change only — no periodic heartbeat. A stably + /// Running host may go hours without a callback. + /// Running → Stopped is driven by explicit error callbacks or ScanState = false, + /// NEVER by starvation. The only starvation check applies to the initial Unknown state. 
+ /// When the MxAccess transport is disconnected, returns every + /// entry with regardless of the underlying state, + /// because we can't observe anything through a dead transport. + /// The stop/start callbacks fire synchronously from whichever thread delivered the + /// probe update. The manager releases its own lock before invoking them to avoid + /// lock-inversion deadlocks with the node manager's Lock. + /// + /// + public sealed class GalaxyRuntimeProbeManager : IDisposable + { + private static readonly ILogger Log = Serilog.Log.ForContext(); + + private const int CategoryWinPlatform = 1; + private const int CategoryAppEngine = 3; + private const string KindWinPlatform = "$WinPlatform"; + private const string KindAppEngine = "$AppEngine"; + private const string ProbeAttribute = ".ScanState"; + + private readonly IMxAccessClient _client; + private readonly TimeSpan _unknownTimeout; + private readonly Action? _onHostStopped; + private readonly Action? _onHostRunning; + private readonly Func _clock; + + // Key: probe tag reference (e.g. "DevAppEngine.ScanState"). + // Value: the current runtime status for that host, kept in sync on every probe callback + // and queried via GetSnapshot for dashboard rendering. + private readonly Dictionary _byProbe = + new Dictionary(StringComparer.OrdinalIgnoreCase); + + // Reverse index: gobject_id -> probe tag, so Sync() can diff new/removed hosts efficiently. + private readonly Dictionary _probeByGobjectId = new Dictionary(); + + private readonly object _lock = new object(); + private bool _disposed; + + /// + /// Initializes a new probe manager. and + /// are invoked synchronously on Running↔Stopped + /// transitions so the owning node manager can invalidate / restore the hosted subtree. + /// + public GalaxyRuntimeProbeManager( + IMxAccessClient client, + int unknownTimeoutSeconds, + Action? onHostStopped = null, + Action? 
onHostRunning = null) + : this(client, unknownTimeoutSeconds, onHostStopped, onHostRunning, () => DateTime.UtcNow) + { + } + + internal GalaxyRuntimeProbeManager( + IMxAccessClient client, + int unknownTimeoutSeconds, + Action? onHostStopped, + Action? onHostRunning, + Func clock) + { + _client = client ?? throw new ArgumentNullException(nameof(client)); + _unknownTimeout = TimeSpan.FromSeconds(Math.Max(1, unknownTimeoutSeconds)); + _onHostStopped = onHostStopped; + _onHostRunning = onHostRunning; + _clock = clock ?? throw new ArgumentNullException(nameof(clock)); + } + + /// + /// Gets the number of active probe subscriptions. Surfaced on the dashboard Subscriptions + /// panel so operators can see bridge-owned probe count separately from the total. + /// + public int ActiveProbeCount + { + get + { + lock (_lock) + return _byProbe.Count; + } + } + + /// + /// Diffs the supplied hierarchy against the active probe set, advising new hosts and + /// unadvising removed ones. The hierarchy is filtered to runtime host categories + /// ($WinPlatform, $AppEngine) — non-host rows are ignored. Idempotent: a second call + /// with the same hierarchy performs no Advise / Unadvise work. + /// + /// + /// Sync is synchronous on MxAccess: is + /// awaited for each new host, so for a galaxy with N runtime hosts the call blocks for + /// ~N round-trips. This is acceptable because it only runs during address-space build + /// and rebuild, not on the hot path. + /// + public async Task SyncAsync(IReadOnlyList hierarchy) + { + if (_disposed || hierarchy == null) + return; + + // Filter to runtime hosts and project to the expected probe tag name. + var desired = new Dictionary(); + foreach (var obj in hierarchy) + { + if (obj.CategoryId != CategoryWinPlatform && obj.CategoryId != CategoryAppEngine) + continue; + if (string.IsNullOrWhiteSpace(obj.TagName)) + continue; + var probe = obj.TagName + ProbeAttribute; + var kind = obj.CategoryId == CategoryWinPlatform ? 
KindWinPlatform : KindAppEngine; + desired[obj.GobjectId] = (probe, kind, obj); + } + + // Compute diffs under lock, release lock before issuing SDK calls (which can block). + List toSubscribe; + List toUnsubscribe; + lock (_lock) + { + toSubscribe = new List(); + toUnsubscribe = new List(); + + foreach (var kvp in desired) + { + if (_probeByGobjectId.TryGetValue(kvp.Key, out var existingProbe)) + { + // Already tracked: ensure the status entry is aligned (tag rename path is + // intentionally not supported — if the probe changed, treat it as remove+add). + if (!string.Equals(existingProbe, kvp.Value.Probe, StringComparison.OrdinalIgnoreCase)) + { + toUnsubscribe.Add(existingProbe); + _byProbe.Remove(existingProbe); + _probeByGobjectId.Remove(kvp.Key); + + toSubscribe.Add(kvp.Value.Probe); + _byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind); + _probeByGobjectId[kvp.Key] = kvp.Value.Probe; + } + } + else + { + toSubscribe.Add(kvp.Value.Probe); + _byProbe[kvp.Value.Probe] = MakeInitialStatus(kvp.Value.Obj, kvp.Value.Kind); + _probeByGobjectId[kvp.Key] = kvp.Value.Probe; + } + } + + // Remove hosts that are no longer in the desired set. + var toRemove = _probeByGobjectId.Keys.Where(id => !desired.ContainsKey(id)).ToList(); + foreach (var id in toRemove) + { + var probe = _probeByGobjectId[id]; + toUnsubscribe.Add(probe); + _byProbe.Remove(probe); + _probeByGobjectId.Remove(id); + } + } + + // Apply the diff outside the lock. 
+ foreach (var probe in toSubscribe) + { + try + { + await _client.SubscribeAsync(probe, OnProbeValueChanged); + Log.Information("Galaxy runtime probe advised: {Probe}", probe); + } + catch (Exception ex) + { + Log.Warning(ex, "Failed to advise galaxy runtime probe {Probe}", probe); + } + } + + foreach (var probe in toUnsubscribe) + { + try + { + await _client.UnsubscribeAsync(probe); + } + catch (Exception ex) + { + Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during sync", probe); + } + } + } + + /// + /// Routes an OnTagValueChanged callback to the probe state machine. Returns + /// when matches a bridge-owned probe + /// (in which case the owning node manager should skip its normal variable-update path). + /// + public bool HandleProbeUpdate(string tagRef, Vtq vtq) + { + if (_disposed || string.IsNullOrEmpty(tagRef)) + return false; + + GalaxyRuntimeStatus? status; + int fromToGobjectId = 0; + GalaxyRuntimeState? transitionTo = null; + + lock (_lock) + { + if (!_byProbe.TryGetValue(tagRef, out status)) + return false; // not a probe — let the caller handle it normally + + var now = _clock(); + var isRunning = vtq.Quality.IsGood() && vtq.Value is bool b && b; + status.LastStateCallbackTime = now; + status.LastScanState = vtq.Value as bool?; + + if (isRunning) + { + status.GoodUpdateCount++; + status.LastError = null; + if (status.State != GalaxyRuntimeState.Running) + { + status.State = GalaxyRuntimeState.Running; + status.LastStateChangeTime = now; + transitionTo = GalaxyRuntimeState.Running; + fromToGobjectId = status.GobjectId; + } + } + else + { + status.FailureCount++; + status.LastError = BuildErrorDetail(vtq); + if (status.State != GalaxyRuntimeState.Stopped) + { + status.State = GalaxyRuntimeState.Stopped; + status.LastStateChangeTime = now; + transitionTo = GalaxyRuntimeState.Stopped; + fromToGobjectId = status.GobjectId; + } + } + } + + // Invoke transition callbacks outside the lock to avoid inverting the node manager's + // lock 
order when it subsequently takes its own Lock to flip hosted variables. + if (transitionTo == GalaxyRuntimeState.Stopped) + { + Log.Information("Galaxy runtime {Probe} transitioned Running → Stopped ({Err})", + tagRef, status?.LastError ?? "(no detail)"); + try { _onHostStopped?.Invoke(fromToGobjectId); } + catch (Exception ex) { Log.Warning(ex, "onHostStopped callback threw for {Probe}", tagRef); } + } + else if (transitionTo == GalaxyRuntimeState.Running) + { + Log.Information("Galaxy runtime {Probe} transitioned → Running", tagRef); + try { _onHostRunning?.Invoke(fromToGobjectId); } + catch (Exception ex) { Log.Warning(ex, "onHostRunning callback threw for {Probe}", tagRef); } + } + + return true; + } + + /// + /// Periodic tick — flips Unknown entries to Stopped once their registration has been + /// outstanding for longer than the configured timeout without ever receiving a first + /// callback. Does nothing to Running or Stopped entries. + /// + public void Tick() + { + if (_disposed) + return; + + var transitions = new List(); + lock (_lock) + { + var now = _clock(); + foreach (var entry in _byProbe.Values) + { + if (entry.State != GalaxyRuntimeState.Unknown) + continue; + + // LastStateChangeTime is set at creation to "now" so the timeout is measured + // from when the probe was advised. 
+ if (entry.LastStateChangeTime.HasValue + && now - entry.LastStateChangeTime.Value > _unknownTimeout) + { + entry.State = GalaxyRuntimeState.Stopped; + entry.LastStateChangeTime = now; + entry.FailureCount++; + entry.LastError = "Probe never received an initial callback within the unknown-resolution timeout"; + transitions.Add(entry.GobjectId); + } + } + } + + foreach (var gobjectId in transitions) + { + Log.Warning("Galaxy runtime gobject {GobjectId} timed out in Unknown state → Stopped", gobjectId); + try { _onHostStopped?.Invoke(gobjectId); } + catch (Exception ex) { Log.Warning(ex, "onHostStopped callback threw during tick for {GobjectId}", gobjectId); } + } + } + + /// + /// Returns a read-only snapshot of every tracked host. When the MxAccess transport is + /// disconnected, every entry is rewritten to Unknown on the way out so operators aren't + /// misled by cached per-host state — the Connection panel is the primary signal in that + /// case. The underlying _byProbe map is not modified. + /// + public IReadOnlyList GetSnapshot() + { + var transportDown = _client.State != ConnectionState.Connected; + + lock (_lock) + { + var result = new List(_byProbe.Count); + foreach (var entry in _byProbe.Values) + result.Add(Clone(entry, forceUnknown: transportDown)); + // Stable ordering by name so dashboard rows don't jitter between refreshes. 
+ result.Sort((a, b) => string.CompareOrdinal(a.ObjectName, b.ObjectName)); + return result; + } + } + + /// + public void Dispose() + { + List probes; + lock (_lock) + { + if (_disposed) + return; + _disposed = true; + probes = _byProbe.Keys.ToList(); + _byProbe.Clear(); + _probeByGobjectId.Clear(); + } + + foreach (var probe in probes) + { + try + { + _client.UnsubscribeAsync(probe).GetAwaiter().GetResult(); + } + catch (Exception ex) + { + Log.Debug(ex, "Failed to unadvise galaxy runtime probe {Probe} during Dispose", probe); + } + } + } + + private void OnProbeValueChanged(string tagRef, Vtq vtq) + { + HandleProbeUpdate(tagRef, vtq); + } + + private GalaxyRuntimeStatus MakeInitialStatus(GalaxyObjectInfo obj, string kind) + { + return new GalaxyRuntimeStatus + { + ObjectName = obj.TagName, + GobjectId = obj.GobjectId, + Kind = kind, + State = GalaxyRuntimeState.Unknown, + LastStateChangeTime = _clock() + }; + } + + private static GalaxyRuntimeStatus Clone(GalaxyRuntimeStatus src, bool forceUnknown) + { + return new GalaxyRuntimeStatus + { + ObjectName = src.ObjectName, + GobjectId = src.GobjectId, + Kind = src.Kind, + State = forceUnknown ? GalaxyRuntimeState.Unknown : src.State, + LastStateCallbackTime = src.LastStateCallbackTime, + LastStateChangeTime = src.LastStateChangeTime, + LastScanState = src.LastScanState, + LastError = forceUnknown ? null : src.LastError, + GoodUpdateCount = src.GoodUpdateCount, + FailureCount = src.FailureCount + }; + } + + private static string BuildErrorDetail(Vtq vtq) + { + if (vtq.Quality.IsBad()) + return $"bad quality ({vtq.Quality})"; + if (vtq.Quality.IsUncertain()) + return $"uncertain quality ({vtq.Quality})"; + if (vtq.Value is bool b && !b) + return "ScanState = false (OffScan)"; + return $"unexpected value: {vtq.Value ?? 
"(null)"}"; + } + } +} diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxNodeManager.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxNodeManager.cs index 59ddd27..4db5d69 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxNodeManager.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxNodeManager.cs @@ -10,6 +10,7 @@ using Serilog; using ZB.MOM.WW.LmxOpcUa.Host.Domain; using ZB.MOM.WW.LmxOpcUa.Host.Historian; using ZB.MOM.WW.LmxOpcUa.Host.Metrics; +using ZB.MOM.WW.LmxOpcUa.Host.MxAccess; namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa { @@ -32,6 +33,19 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa private readonly AlarmObjectFilter? _alarmObjectFilter; private int _alarmFilterIncludedObjectCount; private readonly bool _anonymousCanWrite; + + // Host → list of OPC UA variable nodes transitively hosted by that host. Populated during + // BuildAddressSpace by walking each variable's owning object's hosted_by_gobject_id chain + // up to the nearest $WinPlatform or $AppEngine. A variable that lives under a nested host + // (e.g. a user object under an Engine under a Platform) appears in BOTH the Engine's and + // the Platform's list. Used by MarkHostVariablesBadQuality / ClearHostVariablesBadQuality + // when the galaxy runtime probe reports a host transition. + private readonly Dictionary> _hostedVariables = + new Dictionary>(); + + // Runtime status probe manager — null when MxAccessConfiguration.RuntimeStatusProbesEnabled + // is false. Built at construction time and synced to the hierarchy on every BuildAddressSpace. + private readonly GalaxyRuntimeProbeManager? _galaxyRuntimeProbeManager; private readonly AutoResetEvent _dataChangeSignal = new(false); private readonly Dictionary> _gobjectToTagRefs = new(); private readonly HistoryContinuationPointManager _historyContinuations = new(); @@ -106,7 +120,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa NodeId? writeTuneRoleId = null, NodeId? writeConfigureRoleId = null, NodeId? alarmAckRoleId = null, - AlarmObjectFilter? 
alarmObjectFilter = null) + AlarmObjectFilter? alarmObjectFilter = null, + bool runtimeStatusProbesEnabled = false, + int runtimeStatusUnknownTimeoutSeconds = 15) : base(server, configuration, namespaceUri) { _namespaceUri = namespaceUri; @@ -121,6 +137,15 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa _writeConfigureRoleId = writeConfigureRoleId; _alarmAckRoleId = alarmAckRoleId; + if (runtimeStatusProbesEnabled) + { + _galaxyRuntimeProbeManager = new GalaxyRuntimeProbeManager( + _mxAccessClient, + runtimeStatusUnknownTimeoutSeconds, + MarkHostVariablesBadQuality, + ClearHostVariablesBadQuality); + } + // Wire up data change delivery _mxAccessClient.OnTagValueChanged += OnMxAccessDataChange; @@ -190,6 +215,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa public IReadOnlyList AlarmFilterPatterns => _alarmObjectFilter?.RawPatterns ?? Array.Empty(); + /// + /// Gets a snapshot of the runtime host states (Platforms + AppEngines). Returns an empty + /// list when runtime status probing is disabled. The snapshot respects MxAccess transport + /// state — when the client is disconnected, every entry is returned as + /// . + /// + public IReadOnlyList RuntimeStatuses => + _galaxyRuntimeProbeManager?.GetSnapshot() ?? (IReadOnlyList)Array.Empty(); + + /// + /// Gets the number of bridge-owned runtime status probe subscriptions. Surfaced on the + /// dashboard Subscriptions panel to distinguish probe overhead from client subscriptions. + /// + public int ActiveRuntimeProbeCount => _galaxyRuntimeProbeManager?.ActiveProbeCount ?? 0; + /// /// Gets the runtime historian health snapshot, or when the historian /// plugin is not loaded. 
Surfaced on the status dashboard so operators can detect query @@ -261,6 +301,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa _alarmDescTags.Clear(); _nodeMap.Clear(); _gobjectToTagRefs.Clear(); + _hostedVariables.Clear(); VariableNodeCount = 0; ObjectNodeCount = 0; @@ -464,12 +505,20 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa if (_alarmTrackingEnabled) SubscribeAlarmTags(); + BuildHostedVariablesMap(hierarchy); + + // Sync the galaxy runtime probe set against the rebuilt hierarchy. This runs + // synchronously on the calling thread and issues AdviseSupervisory per host — + // expected 500ms-1s additional startup latency for a large multi-host galaxy. + _galaxyRuntimeProbeManager?.SyncAsync(hierarchy).GetAwaiter().GetResult(); + _lastHierarchy = new List(hierarchy); _lastAttributes = new List(attributes); Log.Information( - "Address space built: {Objects} objects, {Variables} variables, {Mappings} tag references, {Alarms} alarm tags", - ObjectNodeCount, VariableNodeCount, _nodeIdToTagReference.Count, _alarmInAlarmTags.Count); + "Address space built: {Objects} objects, {Variables} variables, {Mappings} tag references, {Alarms} alarm tags, {Hosts} runtime hosts", + ObjectNodeCount, VariableNodeCount, _nodeIdToTagReference.Count, _alarmInAlarmTags.Count, + _hostedVariables.Count); } } @@ -499,6 +548,120 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa return includedIds; } + /// + /// Builds the _hostedVariables dictionary from the completed address space. For each + /// Galaxy object, walks its HostedByGobjectId chain up to the nearest $WinPlatform + /// or $AppEngine and appends every variable the object owns to that host's list. An + /// object under an Engine under a Platform appears in BOTH lists so stopping the Platform + /// invalidates every descendant Engine's variables as well. 
+ /// + private void BuildHostedVariablesMap(List hierarchy) + { + _hostedVariables.Clear(); + if (hierarchy == null || hierarchy.Count == 0) + return; + + var byId = new Dictionary(hierarchy.Count); + foreach (var obj in hierarchy) + byId[obj.GobjectId] = obj; + + foreach (var obj in hierarchy) + { + if (!_gobjectToTagRefs.TryGetValue(obj.GobjectId, out var tagRefs) || tagRefs.Count == 0) + continue; + + // Collect every variable node owned by this object from the tag→variable map. + var ownedVariables = new List(tagRefs.Count); + foreach (var tagRef in tagRefs) + if (_tagToVariableNode.TryGetValue(tagRef, out var v)) + ownedVariables.Add(v); + + if (ownedVariables.Count == 0) + continue; + + // Walk HostedByGobjectId up the chain, appending to every Platform/Engine encountered. + // Visited set defends against cycles in misconfigured galaxies. + var visited = new HashSet(); + var cursor = obj; + var depth = 0; + while (cursor != null && depth < 32 && visited.Add(cursor.GobjectId)) + { + if (cursor.CategoryId == 1 || cursor.CategoryId == 3) + { + if (!_hostedVariables.TryGetValue(cursor.GobjectId, out var list)) + { + list = new List(); + _hostedVariables[cursor.GobjectId] = list; + } + list.AddRange(ownedVariables); + } + + if (cursor.HostedByGobjectId == 0 || + !byId.TryGetValue(cursor.HostedByGobjectId, out var next)) + break; + cursor = next; + depth++; + } + } + } + + /// + /// Flips every OPC UA variable hosted by the given Galaxy runtime object (Platform or + /// AppEngine) to . Invoked by the runtime probe + /// manager's Running → Stopped callback. Safe to call with an unknown gobject id — no-op. + /// + /// The runtime host's gobject_id. + public void MarkHostVariablesBadQuality(int gobjectId) + { + List? 
variables; + lock (Lock) + { + if (!_hostedVariables.TryGetValue(gobjectId, out variables)) + return; + + var now = DateTime.UtcNow; + foreach (var variable in variables) + { + variable.StatusCode = StatusCodes.BadOutOfService; + variable.Timestamp = now; + variable.ClearChangeMasks(SystemContext, false); + } + } + + Log.Information( + "Marked {Count} variable(s) BadOutOfService for stopped host gobject_id={GobjectId}", + variables.Count, gobjectId); + } + + /// + /// Resets every OPC UA variable hosted by the given Galaxy runtime object to + /// . Invoked by the runtime probe manager's Stopped → Running + /// callback. Values are left as-is; subsequent MxAccess on-change updates will refresh them + /// as tags change naturally. + /// + /// The runtime host's gobject_id. + public void ClearHostVariablesBadQuality(int gobjectId) + { + List? variables; + lock (Lock) + { + if (!_hostedVariables.TryGetValue(gobjectId, out variables)) + return; + + var now = DateTime.UtcNow; + foreach (var variable in variables) + { + variable.StatusCode = StatusCodes.Good; + variable.Timestamp = now; + variable.ClearChangeMasks(SystemContext, false); + } + } + + Log.Information( + "Cleared bad-quality override on {Count} variable(s) for recovered host gobject_id={GobjectId}", + variables.Count, gobjectId); + } + private void SubscribeAlarmTags() { foreach (var kvp in _alarmInAlarmTags) @@ -2116,6 +2279,14 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa if (_dispatchDisposed) return; + // Runtime status probes are bridge-owned subscriptions whose only job is to drive the + // host state machine; they are NOT in _tagToVariableNode, so the normal dispatch path + // would drop them anyway. Route probe addresses directly to the probe manager and skip + // the dispatch queue entirely. 
+ if (_galaxyRuntimeProbeManager != null + && _galaxyRuntimeProbeManager.HandleProbeUpdate(address, vtq)) + return; + Interlocked.Increment(ref _totalMxChangeEvents); _pendingDataChanges[address] = vtq; try @@ -2162,6 +2333,12 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa if (!_dispatchRunning) break; + // Drive time-based probe state transitions on every dispatch tick. The dispatch + // loop already wakes every 100ms via the WaitOne timeout, so this gives us a + // ~10Hz cadence for the Unknown → Stopped timeout without introducing a new + // thread or timer. No-op when the probe manager is disabled. + _galaxyRuntimeProbeManager?.Tick(); + var keys = _pendingDataChanges.Keys.ToList(); if (keys.Count == 0) { @@ -2376,6 +2553,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa { _dispatchDisposed = true; _mxAccessClient.OnTagValueChanged -= OnMxAccessDataChange; + // Dispose the runtime probe manager before the MxAccess client teardown so its + // Unadvise calls reach a live client. Disposing the node manager normally runs + // BEFORE the node manager's containing OpcUaServerHost releases the MxAccess + // client, so the probes close cleanly. + _galaxyRuntimeProbeManager?.Dispose(); StopDispatchThread(); _dataChangeSignal.Dispose(); } diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxOpcUaServer.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxOpcUaServer.cs index 27508f7..1b16d2b 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxOpcUaServer.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/LmxOpcUaServer.cs @@ -37,11 +37,16 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa private NodeId? _writeOperateRoleId; private NodeId? _writeTuneRoleId; + private readonly bool _runtimeStatusProbesEnabled; + private readonly int _runtimeStatusUnknownTimeoutSeconds; + public LmxOpcUaServer(string galaxyName, IMxAccessClient mxAccessClient, PerformanceMetrics metrics, IHistorianDataSource? historianDataSource = null, bool alarmTrackingEnabled = false, AuthenticationConfiguration? 
authConfig = null, IUserAuthenticationProvider? authProvider = null, RedundancyConfiguration? redundancyConfig = null, string? applicationUri = null, - AlarmObjectFilter? alarmObjectFilter = null) + AlarmObjectFilter? alarmObjectFilter = null, + bool runtimeStatusProbesEnabled = false, + int runtimeStatusUnknownTimeoutSeconds = 15) { _galaxyName = galaxyName; _mxAccessClient = mxAccessClient; @@ -53,6 +58,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa _authProvider = authProvider; _redundancyConfig = redundancyConfig ?? new RedundancyConfiguration(); _applicationUri = applicationUri; + _runtimeStatusProbesEnabled = runtimeStatusProbesEnabled; + _runtimeStatusUnknownTimeoutSeconds = runtimeStatusUnknownTimeoutSeconds; } /// @@ -89,7 +96,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa NodeManager = new LmxNodeManager(server, configuration, namespaceUri, _mxAccessClient, _metrics, _historianDataSource, _alarmTrackingEnabled, _authConfig.AnonymousCanWrite, _writeOperateRoleId, _writeTuneRoleId, _writeConfigureRoleId, _alarmAckRoleId, - _alarmObjectFilter); + _alarmObjectFilter, + _runtimeStatusProbesEnabled, _runtimeStatusUnknownTimeoutSeconds); var nodeManagers = new List { NodeManager }; return new MasterNodeManager(server, configuration, null, nodeManagers.ToArray()); diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/OpcUaServerHost.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/OpcUaServerHost.cs index ad96a87..58b97bf 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/OpcUaServerHost.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUa/OpcUaServerHost.cs @@ -45,7 +45,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa IUserAuthenticationProvider? authProvider = null, SecurityProfileConfiguration? securityConfig = null, RedundancyConfiguration? redundancyConfig = null, - AlarmObjectFilter? alarmObjectFilter = null) + AlarmObjectFilter? alarmObjectFilter = null, + MxAccessConfiguration? 
mxAccessConfig = null) { _config = config; _mxAccessClient = mxAccessClient; @@ -56,8 +57,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa _securityConfig = securityConfig ?? new SecurityProfileConfiguration(); _redundancyConfig = redundancyConfig ?? new RedundancyConfiguration(); _alarmObjectFilter = alarmObjectFilter; + _mxAccessConfig = mxAccessConfig ?? new MxAccessConfiguration(); } + private readonly MxAccessConfiguration _mxAccessConfig; + /// /// Gets the active node manager that holds the published Galaxy namespace. /// @@ -239,7 +243,9 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.OpcUa _server = new LmxOpcUaServer(_config.GalaxyName, _mxAccessClient, _metrics, _historianDataSource, _config.AlarmTrackingEnabled, _authConfig, _authProvider, _redundancyConfig, applicationUri, - _alarmObjectFilter); + _alarmObjectFilter, + _mxAccessConfig.RuntimeStatusProbesEnabled, + _mxAccessConfig.RuntimeStatusUnknownTimeoutSeconds); await _application.Start(_server); Log.Information( diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUaService.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUaService.cs index ec3ee19..ccc8aea 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUaService.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/OpcUaService.cs @@ -245,7 +245,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host string.Join(", ", _config.OpcUa.AlarmFilter.ObjectFilters)); ServerHost = new OpcUaServerHost(_config.OpcUa, effectiveMxClient, Metrics, _historianDataSource, - _config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter); + _config.Authentication, authProvider, _config.Security, _config.Redundancy, alarmObjectFilter, + _config.MxAccess); // Step 9-10: Query hierarchy, start server, build address space DateTime? 
initialDeployTime = null; diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Status/HealthCheckService.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Status/HealthCheckService.cs index 07a277b..252c69c 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/Status/HealthCheckService.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Status/HealthCheckService.cs @@ -1,3 +1,4 @@ +using System.Linq; using ZB.MOM.WW.LmxOpcUa.Host.Domain; using ZB.MOM.WW.LmxOpcUa.Host.Metrics; @@ -21,7 +22,8 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status ConnectionState connectionState, PerformanceMetrics? metrics, HistorianStatusInfo? historian = null, - AlarmStatusInfo? alarms = null) + AlarmStatusInfo? alarms = null, + RuntimeStatusInfo? runtime = null) { // Rule 1: Not connected → Unhealthy if (connectionState != ConnectionState.Connected) @@ -98,6 +100,23 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status Color = "yellow" }; + // Rule 2e: Any Galaxy runtime host (Platform/AppEngine) is Stopped → Degraded. + // Runs after the transport check so that MxAccess-disconnected remains Unhealthy via + // Rule 1 without also firing the runtime rule — avoids a double-message when the + // transport is the root cause of every host going Unknown/Stopped. 
+ if (runtime != null && runtime.StoppedCount > 0) + { + var stoppedNames = string.Join(", ", + runtime.Hosts.Where(h => h.State == Domain.GalaxyRuntimeState.Stopped).Select(h => h.ObjectName)); + return new HealthInfo + { + Status = "Degraded", + Message = + $"Galaxy runtime has {runtime.StoppedCount} of {runtime.Total} host(s) stopped: {stoppedNames}", + Color = "yellow" + }; + } + // Rule 3: All good return new HealthInfo { diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusData.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusData.cs index d33587f..c4f0aba 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusData.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusData.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using ZB.MOM.WW.LmxOpcUa.Host.Domain; using ZB.MOM.WW.LmxOpcUa.Host.Metrics; namespace ZB.MOM.WW.LmxOpcUa.Host.Status @@ -59,12 +60,49 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status /// public EndpointsInfo Endpoints { get; set; } = new(); + /// + /// Gets or sets the Galaxy runtime host state (Platforms + AppEngines). + /// + public RuntimeStatusInfo RuntimeStatus { get; set; } = new(); + /// /// Gets or sets footer details such as the snapshot timestamp and service version. /// public FooterInfo Footer { get; set; } = new(); } + /// + /// Dashboard model summarizing per-host Galaxy runtime state. + /// + public class RuntimeStatusInfo + { + /// + /// Gets or sets the total number of tracked runtime hosts ($WinPlatform + $AppEngine). + /// + public int Total { get; set; } + + /// + /// Gets or sets the count of hosts currently reported Running. + /// + public int RunningCount { get; set; } + + /// + /// Gets or sets the count of hosts currently reported Stopped. + /// + public int StoppedCount { get; set; } + + /// + /// Gets or sets the count of hosts whose state is still Unknown (either awaiting initial + /// probe resolution or transported-through-disconnected). 
+ /// + public int UnknownCount { get; set; } + + /// + /// Gets or sets the per-host state in stable alphabetical order. + /// + public List Hosts { get; set; } = new(); + } + /// /// Dashboard model describing the OPC UA server's listening endpoints and active security profiles. /// @@ -156,8 +194,17 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status { /// /// Gets or sets the number of active tag subscriptions mirrored from MXAccess into OPC UA. + /// This total includes bridge-owned runtime status probes; see for the + /// subset attributable to probes. /// public int ActiveCount { get; set; } + + /// + /// Gets or sets the count of bridge-owned runtime status probes included in + /// . Surfaced on the dashboard so operators can distinguish probe + /// overhead from client-driven subscription load. + /// + public int ProbeCount { get; set; } } /// diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusReportService.cs b/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusReportService.cs index 1fe7935..930a72b 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusReportService.cs +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/Status/StatusReportService.cs @@ -88,10 +88,11 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status ReconnectCount = _mxAccessClient?.ReconnectCount ?? 0, ActiveSessions = _serverHost?.ActiveSessionCount ?? 0 }, - Health = _healthCheck.CheckHealth(connectionState, _metrics, historianInfo, alarmInfo), + Health = _healthCheck.CheckHealth(connectionState, _metrics, historianInfo, alarmInfo, BuildRuntimeStatusInfo()), Subscriptions = new SubscriptionInfo { - ActiveCount = _mxAccessClient?.ActiveSubscriptionCount ?? 0 + ActiveCount = _mxAccessClient?.ActiveSubscriptionCount ?? 0, + ProbeCount = _nodeManager?.ActiveRuntimeProbeCount ?? 
0 }, Galaxy = new GalaxyInfo { @@ -114,6 +115,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status Alarms = alarmInfo, Redundancy = BuildRedundancyInfo(), Endpoints = BuildEndpointsInfo(), + RuntimeStatus = BuildRuntimeStatusInfo(), Footer = new FooterInfo { Timestamp = DateTime.UtcNow, @@ -192,6 +194,26 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status return info; } + private RuntimeStatusInfo BuildRuntimeStatusInfo() + { + var hosts = _nodeManager?.RuntimeStatuses?.ToList() ?? new List(); + var info = new RuntimeStatusInfo + { + Total = hosts.Count, + Hosts = hosts + }; + foreach (var host in hosts) + { + switch (host.State) + { + case GalaxyRuntimeState.Running: info.RunningCount++; break; + case GalaxyRuntimeState.Stopped: info.StoppedCount++; break; + default: info.UnknownCount++; break; + } + } + return info; + } + private RedundancyInfo? BuildRedundancyInfo() { if (_redundancyConfig == null || !_redundancyConfig.Enabled) @@ -300,7 +322,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status // Subscriptions panel sb.AppendLine("

Subscriptions

"); - sb.AppendLine($"

Active: {data.Subscriptions.ActiveCount}

"); + sb.AppendLine($"

Active: {data.Subscriptions.ActiveCount}

"); + if (data.Subscriptions.ProbeCount > 0) + sb.AppendLine( + $"

Probes: {data.Subscriptions.ProbeCount} (bridge-owned runtime status)

"); sb.AppendLine("
"); // Data Change Dispatch panel @@ -318,6 +343,32 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status sb.AppendLine($"

Last Rebuild: {data.Galaxy.LastRebuildTime:O}

"); sb.AppendLine(""); + // Galaxy Runtime panel — per-host Platform + AppEngine state + if (data.RuntimeStatus.Total > 0) + { + var rtColor = data.RuntimeStatus.StoppedCount > 0 ? "red" + : data.RuntimeStatus.UnknownCount > 0 ? "yellow" + : "green"; + sb.AppendLine($"

Galaxy Runtime

"); + sb.AppendLine( + $"

{data.RuntimeStatus.RunningCount} of {data.RuntimeStatus.Total} hosts running" + + $" ({data.RuntimeStatus.StoppedCount} stopped, {data.RuntimeStatus.UnknownCount} unknown)

"); + sb.AppendLine(""); + foreach (var host in data.RuntimeStatus.Hosts) + { + var since = host.LastStateChangeTime?.ToString("O") ?? "-"; + var err = WebUtility.HtmlEncode(host.LastError ?? ""); + sb.AppendLine( + $"" + + $"" + + $"" + + $"" + + $""); + } + sb.AppendLine("
NameKindStateSinceLast Error
{WebUtility.HtmlEncode(host.ObjectName)}{WebUtility.HtmlEncode(host.Kind)}{host.State}{since}{err}
"); + sb.AppendLine("
"); + } + // Historian panel var anyClusterNodeFailed = data.Historian.NodeCount > 0 && data.Historian.HealthyNodeCount < data.Historian.NodeCount; diff --git a/src/ZB.MOM.WW.LmxOpcUa.Host/appsettings.json b/src/ZB.MOM.WW.LmxOpcUa.Host/appsettings.json index 9506e39..73cd7fc 100644 --- a/src/ZB.MOM.WW.LmxOpcUa.Host/appsettings.json +++ b/src/ZB.MOM.WW.LmxOpcUa.Host/appsettings.json @@ -23,7 +23,9 @@ "MonitorIntervalSeconds": 5, "AutoReconnect": true, "ProbeTag": null, - "ProbeStaleThresholdSeconds": 60 + "ProbeStaleThresholdSeconds": 60, + "RuntimeStatusProbesEnabled": true, + "RuntimeStatusUnknownTimeoutSeconds": 15 }, "GalaxyRepository": { "ConnectionString": "Server=localhost;Database=ZB;Integrated Security=true;", diff --git a/tests/ZB.MOM.WW.LmxOpcUa.Tests/MxAccess/GalaxyRuntimeProbeManagerTests.cs b/tests/ZB.MOM.WW.LmxOpcUa.Tests/MxAccess/GalaxyRuntimeProbeManagerTests.cs new file mode 100644 index 0000000..146b106 --- /dev/null +++ b/tests/ZB.MOM.WW.LmxOpcUa.Tests/MxAccess/GalaxyRuntimeProbeManagerTests.cs @@ -0,0 +1,396 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Shouldly; +using Xunit; +using ZB.MOM.WW.LmxOpcUa.Host.Domain; +using ZB.MOM.WW.LmxOpcUa.Host.MxAccess; +using ZB.MOM.WW.LmxOpcUa.Tests.Helpers; + +namespace ZB.MOM.WW.LmxOpcUa.Tests.MxAccess +{ + /// + /// Exhaustive coverage of the runtime host probe manager: state machine, sync diff, + /// transport gating, unknown-resolution timeout, and transition callbacks. 
+ /// + public class GalaxyRuntimeProbeManagerTests + { + // ---------- State transitions ---------- + + [Fact] + public async Task Sync_WithMixedRuntimeHosts_AddsProbesAndEntriesInUnknown() + { + var (client, clock) = (new FakeMxAccessClient(), new Clock()); + var (stopSpy, runSpy) = (new List(), new List()); + using var sut = Sut(client, 15, stopSpy, runSpy, clock); + + await sut.SyncAsync(new[] + { + Platform(10, "DevPlatform"), + Engine(20, "DevAppEngine"), + UserObject(30, "TestMachine_001") + }); + + sut.ActiveProbeCount.ShouldBe(2); + var snap = sut.GetSnapshot(); + snap.Select(s => s.ObjectName).ShouldBe(new[] { "DevAppEngine", "DevPlatform" }); + snap.All(s => s.State == GalaxyRuntimeState.Unknown).ShouldBeTrue(); + snap.First(s => s.ObjectName == "DevPlatform").Kind.ShouldBe("$WinPlatform"); + snap.First(s => s.ObjectName == "DevAppEngine").Kind.ShouldBe("$AppEngine"); + stopSpy.ShouldBeEmpty(); + runSpy.ShouldBeEmpty(); + } + + [Fact] + public async Task HandleProbeUpdate_FirstGoodCallback_TransitionsUnknownToRunning() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + + var handled = sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + + handled.ShouldBeTrue(); + var entry = sut.GetSnapshot().Single(); + entry.State.ShouldBe(GalaxyRuntimeState.Running); + entry.LastScanState.ShouldBe(true); + entry.GoodUpdateCount.ShouldBe(1); + entry.FailureCount.ShouldBe(0); + entry.LastError.ShouldBeNull(); + runSpy.ShouldBe(new[] { 20 }); + stopSpy.ShouldBeEmpty(); + } + + [Fact] + public async Task HandleProbeUpdate_ScanStateFalse_TransitionsRunningToStopped() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + stopSpy.Clear(); runSpy.Clear(); + + 
sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(false)); + + var entry = sut.GetSnapshot().Single(); + entry.State.ShouldBe(GalaxyRuntimeState.Stopped); + entry.LastScanState.ShouldBe(false); + entry.FailureCount.ShouldBe(1); + entry.LastError!.ShouldContain("OffScan"); + stopSpy.ShouldBe(new[] { 20 }); + runSpy.ShouldBeEmpty(); + } + + [Fact] + public async Task HandleProbeUpdate_BadQualityCallback_TransitionsRunningToStopped() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Platform(10, "DevPlatform") }); + sut.HandleProbeUpdate("DevPlatform.ScanState", Vtq.Good(true)); + stopSpy.Clear(); + + sut.HandleProbeUpdate("DevPlatform.ScanState", Vtq.Bad(Quality.BadCommFailure)); + + var entry = sut.GetSnapshot().Single(); + entry.State.ShouldBe(GalaxyRuntimeState.Stopped); + entry.LastError!.ShouldContain("bad quality"); + stopSpy.ShouldBe(new[] { 10 }); + } + + [Fact] + public async Task HandleProbeUpdate_RecoveryAfterStopped_FiresRunningCallback() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(false)); + runSpy.Clear(); stopSpy.Clear(); + + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + + runSpy.ShouldBe(new[] { 20 }); + stopSpy.ShouldBeEmpty(); + var entry = sut.GetSnapshot().Single(); + entry.State.ShouldBe(GalaxyRuntimeState.Running); + entry.LastError.ShouldBeNull(); + } + + [Fact] + public async Task HandleProbeUpdate_RepeatedRunning_DoesNotRefire() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + 
sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + + runSpy.Count.ShouldBe(1); // only the Unknown → Running call fires the callback + sut.GetSnapshot().Single().GoodUpdateCount.ShouldBe(3); + } + + [Fact] + public async Task HandleProbeUpdate_NonProbeAddress_ReturnsFalse() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + + var handled = sut.HandleProbeUpdate("UnrelatedObject.Value", Vtq.Good(42)); + + handled.ShouldBeFalse(); + sut.GetSnapshot().Single().GoodUpdateCount.ShouldBe(0); + } + + // ---------- Unknown-resolution timeout ---------- + + [Fact] + public async Task Tick_UnknownBeyondTimeout_TransitionsToStopped() + { + var clock = new Clock { Now = new DateTime(2026, 4, 13, 10, 0, 0, DateTimeKind.Utc) }; + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy, clock); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + + // 16 seconds later — past the 15s timeout + clock.Now = clock.Now.AddSeconds(16); + sut.Tick(); + + var entry = sut.GetSnapshot().Single(); + entry.State.ShouldBe(GalaxyRuntimeState.Stopped); + entry.LastError!.ShouldContain("unknown-resolution"); + stopSpy.ShouldBe(new[] { 20 }); + } + + [Fact] + public async Task Tick_UnknownWithinTimeout_DoesNotTransition() + { + var clock = new Clock { Now = new DateTime(2026, 4, 13, 10, 0, 0, DateTimeKind.Utc) }; + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy, clock); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + + clock.Now = clock.Now.AddSeconds(10); + sut.Tick(); + + sut.GetSnapshot().Single().State.ShouldBe(GalaxyRuntimeState.Unknown); + stopSpy.ShouldBeEmpty(); + } + + [Fact] + public async Task Tick_RunningHostWithOldCallback_DoesNotTransition() + { + // Critical 
on-change-semantic test: a stably Running host may go minutes or hours + // without a callback. Tick must NOT time it out on a starvation basis. + var clock = new Clock { Now = new DateTime(2026, 4, 13, 10, 0, 0, DateTimeKind.Utc) }; + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy, clock); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + + clock.Now = clock.Now.AddHours(2); // 2 hours of silence + sut.Tick(); + + sut.GetSnapshot().Single().State.ShouldBe(GalaxyRuntimeState.Running); + } + + // ---------- Transport gating ---------- + + [Fact] + public async Task GetSnapshot_WhenTransportDisconnected_ForcesEveryEntryToUnknown() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] + { + Platform(10, "DevPlatform"), + Engine(20, "DevAppEngine") + }); + sut.HandleProbeUpdate("DevPlatform.ScanState", Vtq.Good(true)); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(false)); + + client.State = ConnectionState.Disconnected; + + sut.GetSnapshot().All(s => s.State == GalaxyRuntimeState.Unknown).ShouldBeTrue(); + + // Underlying state is preserved — restore transport and snapshot reflects reality again. 
+ client.State = ConnectionState.Connected; + var restored = sut.GetSnapshot(); + restored.First(s => s.ObjectName == "DevPlatform").State.ShouldBe(GalaxyRuntimeState.Running); + restored.First(s => s.ObjectName == "DevAppEngine").State.ShouldBe(GalaxyRuntimeState.Stopped); + } + + // ---------- Sync diff ---------- + + [Fact] + public async Task Sync_WithHostRemoved_UnadvisesProbeAndDropsEntry() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] + { + Platform(10, "DevPlatform"), + Engine(20, "DevAppEngine") + }); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + + await sut.SyncAsync(new[] { Platform(10, "DevPlatform") }); + + sut.ActiveProbeCount.ShouldBe(1); + sut.GetSnapshot().Single().ObjectName.ShouldBe("DevPlatform"); + } + + [Fact] + public async Task Sync_WithUnchangedHostSet_PreservesExistingState() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)); + runSpy.Clear(); + + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + + sut.GetSnapshot().Single().State.ShouldBe(GalaxyRuntimeState.Running); + runSpy.ShouldBeEmpty(); // no re-fire on no-op resync + } + + [Fact] + public async Task Sync_FiltersNonRuntimeCategories() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + using var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] + { + Platform(10, "DevPlatform"), + UserObject(30, "TestMachine_001"), + AreaObject(40, "DEV"), + Engine(20, "DevAppEngine"), + UserObject(31, "TestMachine_002") + }); + + sut.ActiveProbeCount.ShouldBe(2); // only the platform + the engine + } + + // ---------- Dispose ---------- + + [Fact] + public async Task Dispose_UnadvisesEveryActiveProbe() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + var sut = 
Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] + { + Platform(10, "DevPlatform"), + Engine(20, "DevAppEngine") + }); + + sut.Dispose(); + + sut.ActiveProbeCount.ShouldBe(0); + // After dispose, a Sync is a no-op. + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + sut.ActiveProbeCount.ShouldBe(0); + } + + [Fact] + public void Dispose_OnFreshManager_NoOp() + { + var client = new FakeMxAccessClient(); + var sut = Sut(client, 15, new List<int>(), new List<int>()); + + Should.NotThrow(() => sut.Dispose()); + Should.NotThrow(() => sut.Dispose()); + } + + [Fact] + public async Task HandleProbeUpdate_AfterDispose_ReturnsFalse() + { + var (client, stopSpy, runSpy) = NewSpyHarness(); + var sut = Sut(client, 15, stopSpy, runSpy); + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + sut.Dispose(); + + sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true)).ShouldBeFalse(); + } + + // ---------- Callback exception safety ---------- + + [Fact] + public async Task TransitionCallback_ThrowsException_DoesNotCorruptState() + { + var client = new FakeMxAccessClient(); + Action<int> badCallback = _ => throw new InvalidOperationException("boom"); + using var sut = new GalaxyRuntimeProbeManager(client, 15, badCallback, badCallback); + + await sut.SyncAsync(new[] { Engine(20, "DevAppEngine") }); + + Should.NotThrow(() => sut.HandleProbeUpdate("DevAppEngine.ScanState", Vtq.Good(true))); + sut.GetSnapshot().Single().State.ShouldBe(GalaxyRuntimeState.Running); + } + + // ---------- Helpers ---------- + + private static GalaxyRuntimeProbeManager Sut( + FakeMxAccessClient client, + int timeoutSeconds, + List<int> stopSpy, + List<int> runSpy, + Clock? 
clock = null) + { + clock ??= new Clock(); + return new GalaxyRuntimeProbeManager( + client, timeoutSeconds, + stopSpy.Add, + runSpy.Add, + () => clock.Now); + } + + private static (FakeMxAccessClient client, List<int> stopSpy, List<int> runSpy) NewSpyHarness() + { + return (new FakeMxAccessClient(), new List<int>(), new List<int>()); + } + + private static GalaxyObjectInfo Platform(int id, string name) => new() + { + GobjectId = id, + TagName = name, + CategoryId = 1, + HostedByGobjectId = 0 + }; + + private static GalaxyObjectInfo Engine(int id, string name) => new() + { + GobjectId = id, + TagName = name, + CategoryId = 3, + HostedByGobjectId = 10 + }; + + private static GalaxyObjectInfo UserObject(int id, string name) => new() + { + GobjectId = id, + TagName = name, + CategoryId = 10, + HostedByGobjectId = 20 + }; + + private static GalaxyObjectInfo AreaObject(int id, string name) => new() + { + GobjectId = id, + TagName = name, + CategoryId = 13, + IsArea = true, + HostedByGobjectId = 20 + }; + + private sealed class Clock + { + public DateTime Now { get; set; } = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + } + } +}