From 39f9fd8946b42f7ea767f24e7f35daf662669772 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 1 May 2026 09:06:45 -0400 Subject: [PATCH] =?UTF-8?q?probe:=20BREAKTHROUGH=20=E2=80=94=20alarms=20fl?= =?UTF-8?q?ow=20via=20canonical=20\Node\Galaxy!Area,=20blocked=20by=20Date?= =?UTF-8?q?Time=20marshaling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings that turn the alarm capture path on: 1. Subscription expression: \\<Node>\Galaxy!<Area> is the canonical AlarmClient subscription format per ArchestrA docs: \\Node\Provider!Area!Filter, with Provider literally "Galaxy" (not the Galaxy name) and Node being the machine name. For this rig: \\DESKTOP-6JL3KKO\Galaxy!DEV catches alarms. 2. InitializeConsumer before RegisterConsumer — discovered earlier; bug-fix for PR A.5's AlarmClientConsumer. With these in place, GetHighPriAlarm returned a record on every poll for 60s straight (117/117 calls). But every call throws ArgumentOutOfRangeException: Not a valid Win32 FileTime, because AlarmRecord has five DateTime fields (ar_Time / ar_OrigTime / ar_AckTime / ar_RtnTime / ar_SubTime) and AVEVA writes sentinel FILETIME values for unset ones (e.g., ar_AckTime on an unacknowledged alarm). The aaAlarmManagedClient.dll auto-marshals FILETIME -> DateTime and rejects out-of-range values. GetStatistics still reports total=0 active=0 even with GetHighPriAlarm returning records — those two APIs have different views. The active read API for current alarms is GetHighPriAlarm, not GetStatistics's change array. So the consumer chain works. The blocking issue is now extracting the payload past the AVEVA-shipped DateTime auto-marshaling. Three approaches for the next PR: 1. Patch aaAlarmManagedClient.dll via ildasm/ilasm round-trip. 2. Define a custom [ComImport] interface with safe-blittable types and Marshal.QueryInterface to it. 3. Use IDispatch late binding to bypass strong-typed marshaling. Option 2 is cleanest; needs the AlarmClient COM IID. 
Probe changes: - Subscription expression set to \\<MachineName>\Galaxy!DEV. - GetHighPriAlarm tally counters (ok-with-record vs throw). - 117 throws / 0 ok-with-record over 60s confirms alarms are flowing continuously while the user's flip script runs. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/AlarmClientDiscovery.md | 63 +++++++++++++++++++ .../AlarmClientWmProbeTests.cs | 36 ++++++++--- 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/docs/AlarmClientDiscovery.md b/docs/AlarmClientDiscovery.md index 39e4484..0a05b7f 100644 --- a/docs/AlarmClientDiscovery.md +++ b/docs/AlarmClientDiscovery.md @@ -335,6 +335,69 @@ alarms truly only flow through the historian event-storage path, A.2 needs to consume from `aahEventStorage` instead — a fundamental architecture pivot. +## BREAKTHROUGH — seventh probe run, 2026-05-01 + +Two changes finally produced a signal: + +1. **Subscription scope:** `\\<Node>\Galaxy!<Area>` is the + canonical AlarmClient subscription format (per ArchestrA Alarm + Client docs at `archestra6.rssing.com/chan-12008125/article13.html`): + `\\Node\Provider!Area!Filter`, where Node is the *machine* name, + Provider is **literally `Galaxy`**, and Area is a hosted area + object. For this rig (`\\DESKTOP-6JL3KKO\Galaxy!DEV`) the DEV + area — the platform's primary area — is the right scope. Earlier + `\Galaxy!`, `\Galaxy!TestArea`, `\\.\Galaxy!`, etc., all returned + rc=0 but matched no traffic — they were not the canonical form. +2. **`InitializeConsumer` before `RegisterConsumer`** — already + discovered earlier; bug-fix for PR A.5's `AlarmClientConsumer`. 
+ +With both in place, `GetHighPriAlarm` returned a record on every +poll for 60s straight (117/117 calls), but threw +`ArgumentOutOfRangeException: Not a valid Win32 FileTime` instead +of returning successfully — the AlarmRecord struct contains five +DateTime fields (`ar_Time`, `ar_OrigTime`, `ar_AckTime`, +`ar_RtnTime`, `ar_SubTime`) and AVEVA writes sentinel/invalid +FILETIME values for unset ones (e.g., `ar_AckTime` for an +unacknowledged alarm). The .NET interop that AVEVA ships +(`aaAlarmManagedClient.dll`) auto-converts FILETIME→DateTime and +rejects out-of-range values. + +`GetStatistics` continues to report `total=0 active=0` even with +GetHighPriAlarm returning records — those two API surfaces have +genuinely different views in AVEVA's data model. + +So: **alarms flow through `aaAlarmManagedClient.AlarmClient` once +the subscription expression is canonical**. The blocking issue is +extracting the payload past the .NET interop's DateTime +auto-marshaling. + +## Remaining work to capture alarm payloads + +Define a custom COM interop that uses `long` (FILETIME-as-int64) +instead of `DateTime` for the timestamp fields. Approach options: + +1. **Patch the AVEVA-shipped `aaAlarmManagedClient.dll`** — ildasm + the assembly, replace `DateTime` with `long` on AlarmRecord's + timestamp fields, ilasm back. Brittle across AVEVA upgrades. +2. **Write our own `[ComImport]` interface** — declare + `IRawAlarmConsumer` ourselves with safe-blittable types, + discover the underlying COM IID (via reflection on + `AlarmClient`'s `[Guid]` attribute), and `(IRawAlarmConsumer) + alarmClient` cast. Cleaner; requires the IID. +3. **Use `IDispatch` late binding** — dispatch-Invoke bypasses + strong-typed marshaling. Verbose but doesn't need IIDs. + +For PR A.2's worker integration, option 2 is the least +disruptive. Once the interop is custom, `AlarmClient.Subscribe` + +`GetHighPriAlarm` + `GetAlarmExtendedRec` form a viable +polling-style alarm consumer. 
+ +The polling-vs-WM_APP-callback question from earlier is now +moot: `GetStatistics`'s `positions[]/handles[]` arrays remained +empty even when alarms were demonstrably present. The active +read API for current alarms is `GetHighPriAlarm`, not +`GetStatistics`'s change array. + ### Implications for A.2 implementation The A.2 PR's value is unmeasurable until at least one alarm diff --git a/src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs b/src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs index 44ac80e..6e156c4 100644 --- a/src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs +++ b/src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs @@ -35,13 +35,20 @@ public sealed class AlarmClientWmProbeTests : IDisposable // Try multiple subscription expressions sequentially (each Subscribe call // adds to the consumer's scope). The "everything" form varies by AVEVA // version — we shotgun common forms. + // Canonical AlarmClient subscription format (per ArchestrA docs): + // \\Node\Provider!Area!Filter + // - Node: machine name (NOT galaxy name; "Galaxy" is the literal provider) + // - Provider: literal "Galaxy" + // - Area: area object the engine hosts the alarm under + // Note: each Subscribe call REPLACES the prior subscription on the + // consumer, so we test exactly one expression per probe run. + private static readonly string MachineName = Environment.MachineName; private static readonly string[] SubscriptionExpressions = { - @"\Galaxy!", // documented "all groups under Galaxy provider" - @"\Galaxy!*", // wildcard variant - @"\\Galaxy!", // double-backslash UNC-style - @"\Galaxy!TestArea", // explicit area where TestMachine_001 lives - @"\\.\Galaxy!", // local-host prefix + // DEV is the top-level area on the Platform (TestArea is contained + // within DEV). Alarms typically publish at the platform's primary + // area. If TestArea-only doesn't catch them, DEV should. 
+ $@"\\{MachineName}\Galaxy!DEV", }; private const string SubscriptionExpression = @"\Galaxy!"; private static readonly TimeSpan PumpDuration = TimeSpan.FromSeconds(60); @@ -352,6 +359,8 @@ public sealed class AlarmClientWmProbeTests : IDisposable } Log($"Pump duration {PumpDuration.TotalSeconds:F0}s elapsed; deregistering."); + Log($"GetHighPriAlarm tally: ok-with-record={getHighPriOk} threw={getHighPriThrow} " + + $"(throws indicate alarm-record marshaling failure; ok=empty record)."); try { int dereg = client.DeregisterConsumer(); Log($"DeregisterConsumer -> {dereg}"); } catch (Exception ex) { Log($"DeregisterConsumer threw: {ex.GetType().Name}: {ex.Message}"); } @@ -375,6 +384,8 @@ public sealed class AlarmClientWmProbeTests : IDisposable private string lastProvidersSummary = string.Empty; private string lastHighPriSummary = string.Empty; private string lastSfStatsSummary = string.Empty; + private int getHighPriOk = 0; + private int getHighPriThrow = 0; /// /// Try every read API the AlarmClient exposes and log when its @@ -406,25 +417,32 @@ public sealed class AlarmClientWmProbeTests : IDisposable private void PollAllChannels(AlarmClient client, int seq) { - // Channel A: GetHighPriAlarm — direct peek of highest-priority alarm. + // Channel A: GetHighPriAlarm — peek highest-priority alarm. Track + // outcome state (record/empty/throw) and log every transition AND + // total counts at end. The throw correlates with an alarm being + // present (AVEVA fills timestamps with sentinel FILETIME values + // that crash the .NET marshaler) — useful as a presence signal + // even if we can't read the record. try { AlarmRecord rec = NewAlarmRecord(); int rc = client.GetHighPriAlarm(ref rec); string desc = rc == 0 ? 
DescribeAlarmRecord(rec) : ""; string summary = $"rc={rc} {desc}"; + getHighPriOk++; if (summary != lastHighPriSummary) { - Log($"GetHighPriAlarm #{seq}: {summary} (changed)"); + Log($"GetHighPriAlarm #{seq}: {summary} (changed; ok={getHighPriOk}, throw={getHighPriThrow})"); lastHighPriSummary = summary; } } catch (Exception ex) { - string es = $"{ex.GetType().Name}: {ex.Message}"; + string es = $"{ex.GetType().Name}"; + getHighPriThrow++; if (es != lastHighPriSummary) { - Log($"GetHighPriAlarm #{seq}: threw {es}"); + Log($"GetHighPriAlarm #{seq}: threw {es} (changed; ok={getHighPriOk}, throw={getHighPriThrow})"); lastHighPriSummary = es; } }