Code-review 2026-05-20 sweep: re-review at 1cd51bb, resolve 72 findings across all 11 modules
Re-reviewed every module/client against the 10-category checklist
(REVIEW-PROCESS.md) at commit 1cd51bb, filed 72 new findings, and
fixed them in three priority waves (3 High, 17 Medium, 52 Low).
Highs
- Server-017: enumerate AcknowledgeAlarm / QueryActiveAlarms in
GatewayGrpcScopeResolver so non-admin keys can use them; document
the mapping in docs/Authorization.md; add interceptor tests.
- Client.Java-013: add the five missing bulk-method stubs to the
CLI FakeSession so the test module compiles on a clean tree.
- Client.Rust-013: fix the clippy::doc_lazy_continuation regression
in generated tonic code by reformatting the ReadBulkCommand proto
comment and scoping a #![allow(...)] to the generated submodules.
Mediums (highlights)
- Server: unify GatewaySession state-lock discipline (-015) and
make DisposeAsync race-safe against in-flight CloseAsync (-016);
add constraint-enforcement test coverage for the bulk-plan path
(-021).
- Worker: introduce StaRuntimeShutdownException so RunAlarmPollLoop
can distinguish graceful shutdown from a real STA-affinity
violation (-016); have the watchdog skip StaHung while
CurrentCommandCorrelationId is non-empty so a legitimate slow
ReadBulk no longer self-faults (-017).
- Tests: add per-method round-trip + cancellation coverage for the
11 GatewaySession bulk methods (-013); replace the real TCP probe
in GalaxyHierarchyCacheTests with an IGalaxyRepository fake
(-016).
- IntegrationTests: drive the StreamEvents writer in the live Write
test and assert OnWriteComplete (-012); add live tests for
Unadvise/RemoveItem/Unregister ordering, WriteSecured, and
abnormal worker exit (-014).
- Worker.Tests: replace MxAccessSession reflection with an internal
CreateForTesting factory (-016); cover WorkerCancel and
unexpected-body envelope branches (-017).
- Client.Java: cancel MxEventStream when close() races
beforeStart() (-014); return a CancellingCompletableFuture that
actually forwards cancellation through .thenApply chains (-015).
- Client.Python: drop the silent localhost-plaintext downgrade in
the CLI; require explicit --plaintext (-013).
- Client.Rust: stop bench-read-bulk from polluting success-latency
histograms with failed-call durations (-015); add coverage for
the five MalformedReply paths, the bulk-write helpers, the
Error::Unavailable mapping, and the unary-fault path (-016).
- Contracts: extend docs/Contracts.md with the bulk read/write
command family (-009).
Lows (highlights)
- Server: cap GalaxyGlobMatcher.RegexCache; align
WorkerAlarmRpcDispatcher missing-session handling; drop the
duplicate dashboard @page routes; refresh IAlarmRpcDispatcher
XML doc.
- Worker: surface SetXmlAlarmQuery COM failures; remove dead
subscriptionExpression / ExecutingCommand arms; preserve
factory-supplied runtime sessions; split MxAlarmSnapshot.cs into
three files.
- Tests: dispose the WebApplication in seven test classes; rebuild
FakeWorkerProcess.WaitForExitAsync against a real
TaskCompletionSource; switch the heartbeat-expires test to ManualTimeProvider;
add InvariantCulture to the remaining DateTimeOffset.Parse sites;
document GalaxyFilterInputSafetyTests in GatewayTesting.md.
- IntegrationTests: comment fixes, RecordingServerStreamWriter
IDisposable, class-level [Trait], single-source ZB default
connection string.
- Worker.Tests: replace silent-return gating with LiveMxAccessFact
so absent env vars SKIP rather than silently pass; PascalCase rename of probe
[Fact]s; deterministic deadline test; new frame-protocol error
tests; ComputeTransitions diff-coverage; relocate dev-rig probes
to Probes/.
- Contracts: add round-trip coverage and per-field redaction /
Galaxy-identifier comments to the protos.
- Client.Dotnet: introduce clients/dotnet/Directory.Build.props so
TreatWarningsAsErrors / analysers apply; document
DiscoverHierarchyOptions and IMxGatewayCliClient; require typed
bulk-read handles in CLI; surface AcknowledgeAlarm transport
faults through Translate().
- Client.Go: kill dead code in alarms_test / fakeGalaxyServer /
runWriteBulkVariant; document the six new subcommands in
writeUsage; drain galaxy-watch events on limit; switch io.EOF
comparisons to errors.Is.
- Client.Java: shared shutdown helpers + new shutdownTimeout
option; regex-based credential redaction; Long.toUnsignedString
for uint64 sequence; doc fixes.
- Client.Python: combine duplicate imports; add coverage for
_percentile / bench-read-bulk / MAX_AGGREGATE_EVENTS /
_api_key_from_env; populate pyproject metadata and ship py.typed.
- Client.Rust: expose next_correlation_id() so CLI ping/close
stop hard-coding correlation IDs; resync RustClientDesign.md
with the current Session / Error surface and CLI subcommand set.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,270 @@
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics;
|
||||
using System.Threading;
|
||||
using MxGateway.Contracts.Proto;
|
||||
using MxGateway.Worker.MxAccess;
|
||||
using Xunit.Abstractions;
|
||||
|
||||
namespace MxGateway.Worker.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Live dev-rig smoke test for the alarms-over-gateway pipeline.
|
||||
/// Exercises <see cref="WnWrapAlarmConsumer"/> + <see cref="AlarmDispatcher"/> +
|
||||
/// <see cref="MxAccessAlarmEventSink"/> end-to-end against the actual
|
||||
/// AVEVA System Platform install: subscribes to
|
||||
/// <c>\\&lt;machine&gt;\Galaxy!DEV</c>, waits for at least one alarm
|
||||
/// transition (the dev rig's flip script writes
|
||||
/// <c>TestMachine_001.TestAlarm001</c> every 10s), drains the proto
|
||||
/// <c>OnAlarmTransitionEvent</c> from the queue, then acknowledges it
|
||||
/// by name and verifies that a follow-up transition is reported (normally
|
||||
/// <see cref="AlarmTransitionKind.Acknowledge"/>, or the flip script's next clear).
|
||||
///
|
||||
/// Skip-gated; flip <c>Skip=null</c> on the dev rig with the flip
|
||||
/// script running.
|
||||
/// </summary>
|
||||
public sealed class AlarmsLiveSmokeTests
|
||||
{
|
||||
// Alarm subscription expression built from the local machine name; passed to
// dispatcher.Subscribe in RunSmoke.
private static readonly string SubscriptionExpression =
|
||||
$@"\\{Environment.MachineName}\Galaxy!DEV";
|
||||
// How long the final pump loop in RunSmoke keeps polling for further transitions.
private static readonly TimeSpan PumpDuration = TimeSpan.FromSeconds(45);
|
||||
// Timeout handed to each WaitForTransition call.
private static readonly TimeSpan TransitionWaitTimeout = TimeSpan.FromSeconds(20);
|
||||
|
||||
// Session id given to the AlarmDispatcher; RunSmoke asserts received events carry it.
private const string SessionId = "alarms-live-smoke";
|
||||
|
||||
// xUnit output sink; the buffered log is written to it once the worker thread has finished.
private readonly ITestOutputHelper output;
|
||||
// Started at construction; supplies the elapsed-seconds prefix that Log puts on every line.
private readonly Stopwatch elapsed = Stopwatch.StartNew();
|
||||
// Lines buffered by Log and drained to the test output by the test method.
private readonly ConcurrentQueue<string> log = new ConcurrentQueue<string>();
|
||||
|
||||
/// <summary>
/// Stores the xUnit output helper so the log collected during the run can be
/// written to the test output once the run has finished.
/// </summary>
/// <param name="output">Sink for per-test diagnostic output.</param>
public AlarmsLiveSmokeTests(ITestOutputHelper output) => this.output = output;
|
||||
|
||||
/// <summary>
/// Runs <see cref="RunSmoke"/> on a dedicated foreground STA thread, waits for it
/// to finish, flushes the buffered log to the test output, and then surfaces any
/// exception the STA thread captured.
/// </summary>
[Fact(Skip = "Live dev-rig smoke test — flip Skip=null with AVEVA + the alarm flip script running. Verified working 2026-05-01.")]
public void Alarms_FullPipelineRoundTrip_RaisesAndAcknowledges()
{
    // Assigned by the STA thread before it exits and read only after Join(),
    // so the join itself is the only synchronization required.
    Exception? failure = null;
    var staThread = new Thread(() =>
    {
        try
        {
            RunSmoke();
        }
        catch (Exception ex)
        {
            failure = ex;
        }
    });
    staThread.IsBackground = false;
    staThread.SetApartmentState(ApartmentState.STA);
    staThread.Start();
    staThread.Join();

    // Flush the log before rethrowing so a failing run still shows its trace.
    output.WriteLine($"Captured {log.Count} log line(s):");
    while (log.TryDequeue(out string? line))
    {
        output.WriteLine(line);
    }

    if (failure is not null)
    {
        // Rethrow with the original stack trace intact; a plain `throw failure;`
        // would reset it to this line and hide where on the STA thread the
        // assertion or COM call actually failed.
        System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(failure).Throw();
    }
}
|
||||
|
||||
/// <summary>
/// Body of the smoke test, executed on the STA thread created by the test method.
/// Subscribes to <see cref="SubscriptionExpression"/>, waits for a Raise transition,
/// checks that the raised alarm appears in the active-alarm snapshot, acknowledges
/// it by name, waits for a follow-up transition, and finally keeps pumping for
/// <see cref="PumpDuration"/> while logging every further transition.
/// </summary>
private void RunSmoke()
{
    Log($"Subscription expression: {SubscriptionExpression}");
    Log($"Pump duration: {PumpDuration.TotalSeconds:F0}s; transition wait timeout: {TransitionWaitTimeout.TotalSeconds:F0}s");

    MxAccessEventQueue queue = new MxAccessEventQueue();

    // The wnwrap COM object is ThreadingModel=Apartment and the consumer owns
    // no internal timer, so PollOnce is driven manually from this STA (this
    // test doesn't run a Win32 message pump on its STA). Production hosting
    // routes polls through the worker's StaRuntime.
    WnWrapAlarmConsumer consumer = new WnWrapAlarmConsumer(
        new WNWRAPCONSUMERLib.wwAlarmConsumerClass(),
        maxAlarmsPerFetch: 1024);
    MxAccessAlarmEventSink sink = new MxAccessAlarmEventSink(queue, new MxAccessEventMapper());
    using AlarmDispatcher dispatcher = new AlarmDispatcher(consumer, sink, SessionId);

    Log("Constructed consumer + sink + dispatcher.");
    dispatcher.Subscribe(SubscriptionExpression);
    Log("Subscribe -> ok. Driving PollOnce manually from this STA...");

    // 1. Wait for the first transition (any kind), then keep waiting for one
    //    with kind=Raise so the alarm is currently Active when we try to ack.
    //    AVEVA rejects acks of cleared alarms with -55, so we have to time the
    //    ack against the flip script's 10s cadence.
    OnAlarmTransitionEvent? raiseBody = null;
    DateTime raiseDeadline = DateTime.UtcNow + TimeSpan.FromSeconds(30);
    while (DateTime.UtcNow < raiseDeadline && raiseBody is null)
    {
        WorkerEvent? evt = WaitForTransition(queue, TransitionWaitTimeout, "raise", consumer);
        if (evt is null)
        {
            break;
        }

        OnAlarmTransitionEvent body = evt.Event.OnAlarmTransition;
        Log("Transition: " + DescribeTransition(body));
        Assert.Equal(SessionId, evt.Event.SessionId);
        if (body.TransitionKind == AlarmTransitionKind.Raise)
        {
            raiseBody = body;
        }
    }

    Assert.NotNull(raiseBody);
    OnAlarmTransitionEvent raised = raiseBody!;
    Assert.False(string.IsNullOrEmpty(raised.AlarmFullReference));
    Assert.Contains("Galaxy", raised.AlarmFullReference);

    // 2. Snapshot the active set + verify the captured alarm is there.
    var snapshot = dispatcher.SnapshotActiveAlarms();
    Log($"SnapshotActiveAlarms count={snapshot.Count}");
    foreach (var s in snapshot)
    {
        Log(" active: " + DescribeSnapshot(s));
    }

    Assert.NotEmpty(snapshot);
    Assert.Contains(snapshot, s => s.AlarmFullReference == raised.AlarmFullReference);

    // 3. Ack-by-name using the captured reference. Parse the reference via the
    //    same convention the gateway dispatcher uses (Provider!Group.Tag where
    //    the tag may contain dots).
    Assert.True(
        TryParseReference(
            raised.AlarmFullReference,
            out string provider, out string group, out string alarmName),
        $"Captured reference '{raised.AlarmFullReference}' did not parse as Provider!Group.Tag.");
    Log($"Ack target: provider='{provider}' group='{group}' name='{alarmName}'");

    // Try the ack with real Windows identity. AVEVA's AlarmAckByName may
    // reject synthetic operator strings; using the current process identity
    // gives the alarm-history a recognizable principal.
    string realUser = Environment.UserName;
    string realNode = Environment.MachineName;
    string realDomain = Environment.UserDomainName ?? string.Empty;
    Log($"Ack identity: user='{realUser}' node='{realNode}' domain='{realDomain}'");

    int rc = dispatcher.AcknowledgeByName(
        alarmName: alarmName,
        providerName: provider,
        groupName: group,
        ackComment: "alarms-live-smoke ack",
        ackOperatorName: realUser,
        ackOperatorNode: realNode,
        ackOperatorDomain: realDomain,
        ackOperatorFullName: realUser);
    Log($"AcknowledgeByName(real identity) -> rc={rc}");

    Assert.Equal(0, rc);

    // 4. Wait for the post-ack transition. With the alarm flipping every 10s
    //    and the consumer polled every 500ms, the next state change should be
    //    either kind=Acknowledge (the ack we just sent registered as a state
    //    delta UnackAlm -> AckAlm) or the flip script's next Clear
    //    (UnackAlm -> UnackRtn). Either is accepted; only Unspecified fails.
    WorkerEvent? second = WaitForTransition(queue, TransitionWaitTimeout, "post-ack", consumer);
    Assert.NotNull(second);
    OnAlarmTransitionEvent secondBody = second!.Event.OnAlarmTransition;
    Log("Post-ack transition: " + DescribeTransition(secondBody));
    Assert.NotEqual(AlarmTransitionKind.Unspecified, secondBody.TransitionKind);

    // 5. Pump a little longer to confirm the consumer keeps reporting
    //    transitions on the 10s flip cadence.
    DateTime deadline = DateTime.UtcNow + PumpDuration;
    int additional = 0;
    while (DateTime.UtcNow < deadline)
    {
        consumer.PollOnce();

        // Drain everything the poll produced instead of one event per 500ms,
        // and apply the same family filter WaitForTransition uses so that a
        // non-alarm event is never described through its OnAlarmTransition body.
        while (queue.TryDequeue(out WorkerEvent? pumped) && pumped is not null)
        {
            if (pumped.Event.Family != MxEventFamily.OnAlarmTransition)
            {
                Log($"Skipped non-alarm event (family={pumped.Event.Family}) during pump.");
                continue;
            }

            additional++;
            Log($" +{additional}: " + DescribeTransition(pumped.Event.OnAlarmTransition));
        }

        Thread.Sleep(500);
    }

    Log($"Pump completed; additional transitions captured: {additional}.");
}
|
||||
|
||||
/// <summary>
/// Polls <paramref name="consumer"/> roughly every 500ms until an alarm-transition
/// event shows up in <paramref name="queue"/> or <paramref name="timeout"/> elapses.
/// Non-alarm events found in the queue are logged and discarded.
/// </summary>
/// <param name="queue">Queue the alarm sink publishes worker events into.</param>
/// <param name="timeout">Maximum time to keep polling before giving up.</param>
/// <param name="label">Short description of what is being waited for; used in log lines only.</param>
/// <param name="consumer">Consumer whose <c>PollOnce</c> is driven on the calling thread.</param>
/// <returns>The first alarm-transition event dequeued, or <c>null</c> on timeout.</returns>
/// <remarks>
/// Any exception thrown by <c>PollOnce</c> is logged (with the HRESULT for COM
/// failures) and rethrown unchanged.
/// </remarks>
private WorkerEvent? WaitForTransition(
    MxAccessEventQueue queue,
    TimeSpan timeout,
    string label,
    WnWrapAlarmConsumer consumer)
{
    DateTime deadline = DateTime.UtcNow + timeout;
    int pollCount = 0;
    while (DateTime.UtcNow < deadline)
    {
        try
        {
            consumer.PollOnce();
            pollCount++;
            if (pollCount == 1)
            {
                Log("First PollOnce returned without throw.");
            }
        }
        catch (Exception ex)
        {
            Log($"PollOnce threw on poll #{pollCount + 1}: {ex.GetType().Name}: {ex.Message}");
            if (ex is System.Runtime.InteropServices.COMException ce)
            {
                Log($" HResult=0x{(uint)ce.HResult:X8}");
            }

            throw;
        }

        // Drain the queue after each poll. Dequeuing a single event per 500ms
        // iteration would let an alarm transition queued behind non-alarm
        // events be delayed long enough to miss the timeout.
        while (queue.TryDequeue(out WorkerEvent? evt) && evt is not null)
        {
            if (evt.Event.Family == MxEventFamily.OnAlarmTransition)
            {
                return evt;
            }

            Log($"Skipped non-alarm event (family={evt.Event.Family}) while waiting for {label}.");
        }

        Thread.Sleep(500);
    }

    Log($"Timed out waiting for {label} transition after {timeout.TotalSeconds:F0}s (poll count={pollCount}).");
    return null;
}
|
||||
|
||||
/// <summary>
/// Splits an alarm reference of the form <c>Provider!Group.Tag</c> into its three
/// parts. The provider is everything before the first <c>!</c>, the group is the
/// text between that <c>!</c> and the first following <c>.</c>, and the alarm name
/// is the remainder (which may itself contain dots).
/// </summary>
/// <param name="reference">Full alarm reference to parse.</param>
/// <param name="provider">Provider part on success; empty on failure.</param>
/// <param name="group">Group part on success; empty on failure.</param>
/// <param name="alarmName">Alarm name part on success; empty on failure.</param>
/// <returns><c>true</c> when all three parts are present and non-empty.</returns>
private static bool TryParseReference(
    string reference,
    out string provider,
    out string group,
    out string alarmName)
{
    provider = string.Empty;
    group = string.Empty;
    alarmName = string.Empty;

    if (string.IsNullOrWhiteSpace(reference))
    {
        return false;
    }

    // Split only at the first '!' so any later '!' stays in the right-hand side.
    string[] providerAndRest = reference.Split(new[] { '!' }, 2);
    if (providerAndRest.Length != 2
        || providerAndRest[0].Length == 0
        || providerAndRest[1].Length == 0)
    {
        return false;
    }

    // Split only at the first '.' so dots inside the tag are preserved.
    string[] groupAndTag = providerAndRest[1].Split(new[] { '.' }, 2);
    if (groupAndTag.Length != 2
        || groupAndTag[0].Length == 0
        || groupAndTag[1].Length == 0)
    {
        return false;
    }

    provider = providerAndRest[0];
    group = groupAndTag[0];
    alarmName = groupAndTag[1];
    return true;
}
|
||||
|
||||
/// <summary>
/// Renders an alarm-transition event as a single log-friendly line of
/// <c>key=value</c> pairs. A missing timestamp is shown as
/// <see cref="DateTime.MinValue"/> in round-trip ("o") format.
/// </summary>
/// <param name="body">Transition event to describe.</param>
/// <returns>The formatted description.</returns>
private static string DescribeTransition(OnAlarmTransitionEvent body)
{
    DateTime stamp = body.TransitionTimestamp?.ToDateTime() ?? DateTime.MinValue;
    return $"kind={body.TransitionKind} ref='{body.AlarmFullReference}' source='{body.SourceObjectReference}' type='{body.AlarmTypeName}' severity={body.Severity} operator='{body.OperatorUser}' comment='{body.OperatorComment}' ts={stamp:o}";
}
|
||||
|
||||
/// <summary>
/// Renders one active-alarm snapshot entry as a single log-friendly line of
/// <c>key=value</c> pairs. A missing timestamp is shown as
/// <see cref="DateTime.MinValue"/> in round-trip ("o") format.
/// </summary>
/// <param name="s">Snapshot entry to describe.</param>
/// <returns>The formatted description.</returns>
private static string DescribeSnapshot(ActiveAlarmSnapshot s)
{
    DateTime stamp = s.LastTransitionTimestamp?.ToDateTime() ?? DateTime.MinValue;
    return $"ref='{s.AlarmFullReference}' state={s.CurrentState} severity={s.Severity} operator='{s.OperatorUser}' comment='{s.OperatorComment}' ts={stamp:o}";
}
|
||||
|
||||
/// <summary>
/// Buffers a log line, prefixed with the seconds elapsed since this test
/// instance was constructed. Lines are written to the test output later by
/// the test method.
/// </summary>
/// <param name="line">Message text to record.</param>
private void Log(string line) =>
    log.Enqueue($"[t={elapsed.Elapsed.TotalSeconds:F3}s] {line}");
|
||||
}
|
||||
Reference in New Issue
Block a user