rename: prefix gateway projects/namespaces with ZB.MOM.WW + sln→slnx

Apply the ZB.MOM.WW. prefix to all gateway-side projects, folders,
.csproj/.sln contents, C# namespaces, using directives, generated proto
C# (csharp_namespace + checked-in generated files), InternalsVisibleTo
attributes, project-name string literals (LoadProject, .sln lookups,
worker exe paths, staticwebassets manifest), and the install/script/doc
references that point at any of the above. Migrate the solution from
.sln to .slnx via `dotnet sln migrate` and delete the old file.

External-runtime identifiers are intentionally NOT prefixed so external
configuration keeps working:
- GatewayMetrics.cs MeterName ("MxGateway.Server")
- DashboardAuthenticationDefaults Scheme/Policy ("MxGateway.Dashboard")
- GatewayRequestLoggingMiddleware logger category ("MxGateway.Request")
- StaRuntime thread name ("MxGateway.Worker.STA")
- appsettings.json root section "MxGateway" + env-var prefix
  MxGateway__... and secret-name MxGateway:ApiKeyPepper
- C:\ProgramData\MxGateway\ data dir paths

Also fixes two tests that were not rename-related but became visible
while validating the rename:

- WorkerLiveMxAccessSmokeTests.ShutDownAsync: cancellation that the
  gateway service correctly maps to RpcException(Cancelled) per gRPC
  convention was being misclassified as a stream fault. Added a sibling
  catch on RpcException with StatusCode.Cancelled.

- IntegrationTestEnvironment.ResolveRepositoryRoot: extracted IsRepositoryRoot
  and made it accept either a .git marker OR a .sln/.slnx next to src/
  so the worker-exe walker works in non-git working copies.

clients/proto/proto-inputs.json's protoRoot updated to point at
src/ZB.MOM.WW.MxGateway.Contracts/Protos.

Verified by `dotnet build` and a full `dotnet test` of the .slnx with
MXGATEWAY_RUN_LIVE_{MXACCESS,LDAP,GALAXY}_TESTS=1:
  Tests: 472/472 pass
  Worker.Tests: 280/280 pass (4 dev-rig [Fact(Skip=...)] skipped)
  IntegrationTests: 18/18 pass

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 16:22:23 -04:00
parent 867bf18116
commit dc9c0c950c
491 changed files with 32854 additions and 8414 deletions
@@ -0,0 +1,459 @@
using System.Collections.Concurrent;
using System.Diagnostics.Metrics;
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
/// <summary>
/// Collects gateway runtime metrics. Each recording method updates an in-process total that
/// <see cref="GetSnapshot"/> exposes and, where an instrument exists, publishes the same
/// measurement through a <see cref="Meter"/> named <see cref="MeterName"/>.
/// </summary>
public sealed class GatewayMetrics : IDisposable
{
/// <summary>
/// Name of the meter that owns every instrument created by this class. The value is kept
/// without the project namespace prefix so external configuration that refers to it keeps working.
/// </summary>
public const string MeterName = "MxGateway.Server";
// Guards the plain int/long totals and the two Dictionary breakdowns below.
private readonly object _syncRoot = new();
private readonly Meter _meter;
// Counter instruments, created once in the constructor.
private readonly Counter<long> _sessionsOpenedCounter;
private readonly Counter<long> _sessionsClosedCounter;
private readonly Counter<long> _commandsStartedCounter;
private readonly Counter<long> _commandsSucceededCounter;
private readonly Counter<long> _commandsFailedCounter;
private readonly Counter<long> _eventsReceivedCounter;
private readonly Counter<long> _queueOverflowsCounter;
private readonly Counter<long> _faultsCounter;
private readonly Counter<long> _workerKillsCounter;
private readonly Counter<long> _workerExitsCounter;
private readonly Counter<long> _heartbeatFailuresCounter;
private readonly Counter<long> _streamDisconnectsCounter;
private readonly Counter<long> _retryAttemptsCounter;
// Histogram instruments; all of them are created with the unit "ms".
private readonly Histogram<double> _workerStartupLatencyHistogram;
private readonly Histogram<double> _commandLatencyHistogram;
private readonly Histogram<double> _eventStreamSendLatencyHistogram;
// Per-key breakdowns. The Dictionary fields are only mutated while holding _syncRoot;
// the ConcurrentDictionary fields are updated by EventReceived without taking that lock.
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
private readonly ConcurrentDictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
private readonly ConcurrentDictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
// Current values reported by the observable gauges (read and written under _syncRoot).
private int _openSessions;
private int _workersRunning;
private int _workerEventQueueDepth;
private int _grpcEventStreamQueueDepth;
// Running totals surfaced by GetSnapshot. All are updated under _syncRoot except
// _eventsReceived, which is updated and read through Interlocked.
private long _sessionsOpened;
private long _sessionsClosed;
private long _commandsStarted;
private long _commandsSucceeded;
private long _commandsFailed;
private long _eventsReceived;
private long _queueOverflows;
private long _faults;
private long _workerKills;
private long _workerExits;
private long _heartbeatFailures;
private long _streamDisconnects;
private long _retryAttempts;
// Set once Dispose has released the meter so that repeated calls return early.
private bool _disposed;
/// <summary>
/// Creates the <see cref="Meter"/> named <see cref="MeterName"/> and registers every counter,
/// histogram, and observable gauge that the gateway publishes.
/// </summary>
public GatewayMetrics()
{
    // The meter version is the assembly version when one is available; otherwise null is passed.
    string? meterVersion = typeof(GatewayMetrics).Assembly.GetName().Version?.ToString();
    Meter meter = new(MeterName, meterVersion);
    _meter = meter;

    // Counters.
    _sessionsOpenedCounter = meter.CreateCounter<long>("mxgateway.sessions.opened");
    _sessionsClosedCounter = meter.CreateCounter<long>("mxgateway.sessions.closed");
    _commandsStartedCounter = meter.CreateCounter<long>("mxgateway.commands.started");
    _commandsSucceededCounter = meter.CreateCounter<long>("mxgateway.commands.succeeded");
    _commandsFailedCounter = meter.CreateCounter<long>("mxgateway.commands.failed");
    _eventsReceivedCounter = meter.CreateCounter<long>("mxgateway.events.received");
    _queueOverflowsCounter = meter.CreateCounter<long>("mxgateway.queues.overflows");
    _faultsCounter = meter.CreateCounter<long>("mxgateway.faults");
    _workerKillsCounter = meter.CreateCounter<long>("mxgateway.workers.killed");
    _workerExitsCounter = meter.CreateCounter<long>("mxgateway.workers.exited");
    _heartbeatFailuresCounter = meter.CreateCounter<long>("mxgateway.heartbeats.failed");
    _streamDisconnectsCounter = meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
    _retryAttemptsCounter = meter.CreateCounter<long>("mxgateway.retries.attempted");

    // Latency histograms share one unit.
    const string Milliseconds = "ms";
    _workerStartupLatencyHistogram = meter.CreateHistogram<double>("mxgateway.workers.startup.duration", Milliseconds);
    _commandLatencyHistogram = meter.CreateHistogram<double>("mxgateway.commands.duration", Milliseconds);
    _eventStreamSendLatencyHistogram = meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", Milliseconds);

    // Observable gauges read the current values through the private getters on each collection.
    meter.CreateObservableGauge("mxgateway.sessions.open", GetOpenSessions);
    meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning);
    meter.CreateObservableGauge("mxgateway.events.worker_queue.depth", GetWorkerEventQueueDepth);
    meter.CreateObservableGauge("mxgateway.events.grpc_stream_queue.depth", GetGrpcEventStreamQueueDepth);
}
/// <summary>
/// Registers a newly opened session: raises both the open-session count and the
/// opened-session total, then adds one to the sessions-opened counter.
/// </summary>
public void SessionOpened()
{
    lock (_syncRoot)
    {
        _sessionsOpened += 1;
        _openSessions += 1;
    }

    _sessionsOpenedCounter.Add(1);
}
/// <summary>
/// Registers a closed session: lowers the open-session count (only while it is above zero),
/// raises the closed-session total, then adds one to the sessions-closed counter.
/// </summary>
public void SessionClosed()
{
    lock (_syncRoot)
    {
        // Subtract one only while the count is positive so it is never pushed below zero.
        _openSessions -= _openSessions > 0 ? 1 : 0;
        _sessionsClosed += 1;
    }

    _sessionsClosedCounter.Add(1);
}
/// <summary>
/// Lowers the open-session count (only while it is above zero) without touching the
/// closed-session total or any counter instrument.
/// </summary>
/// <remarks>
/// NOTE(review): <see cref="SessionClosed"/> also lowers the open-session count; confirm that
/// callers invoke only one of the two for a given session.
/// </remarks>
public void SessionRemoved()
{
    lock (_syncRoot)
    {
        _openSessions -= _openSessions > 0 ? 1 : 0;
    }
}
/// <summary>
/// Registers a started worker: raises the running-worker count and records the startup
/// time, in milliseconds, on the worker-startup histogram.
/// </summary>
/// <param name="startupDuration">Time it took to start the worker.</param>
public void WorkerStarted(TimeSpan startupDuration)
{
    lock (_syncRoot)
    {
        _workersRunning += 1;
    }

    double startupMilliseconds = startupDuration.TotalMilliseconds;
    _workerStartupLatencyHistogram.Record(startupMilliseconds);
}
/// <summary>
/// Registers a stopped worker: lowers the running-worker count (only while it is above zero),
/// raises the worker-exit total, and adds one to the worker-exits counter tagged with the reason.
/// </summary>
/// <param name="reason">Value attached to the measurement as the <c>reason</c> tag.</param>
public void WorkerStopped(string reason)
{
    KeyValuePair<string, object?> reasonTag = new("reason", reason);

    lock (_syncRoot)
    {
        _workersRunning -= _workersRunning > 0 ? 1 : 0;
        _workerExits += 1;
    }

    _workerExitsCounter.Add(1, reasonTag);
}
/// <summary>
/// Registers a killed worker: raises the worker-kill total and adds one to the
/// worker-kills counter tagged with the reason.
/// </summary>
/// <param name="reason">Value attached to the measurement as the <c>reason</c> tag.</param>
public void WorkerKilled(string reason)
{
    KeyValuePair<string, object?> reasonTag = new("reason", reason);

    lock (_syncRoot)
    {
        _workerKills += 1;
    }

    _workerKillsCounter.Add(1, reasonTag);
}
/// <summary>
/// Registers the start of a command: raises the commands-started total and adds one to the
/// commands-started counter tagged with the method.
/// </summary>
/// <param name="method">Value attached to the measurement as the <c>method</c> tag.</param>
public void CommandStarted(string method)
{
    KeyValuePair<string, object?> methodTag = new("method", method);

    lock (_syncRoot)
    {
        _commandsStarted += 1;
    }

    _commandsStartedCounter.Add(1, methodTag);
}
/// <summary>
/// Registers a successful command: raises the commands-succeeded total, adds one to the
/// commands-succeeded counter, and records the duration in milliseconds on the command
/// latency histogram. Both measurements carry the <c>method</c> tag.
/// </summary>
/// <param name="method">Value attached to the measurements as the <c>method</c> tag.</param>
/// <param name="duration">Time the command took to complete.</param>
public void CommandSucceeded(string method, TimeSpan duration)
{
    lock (_syncRoot)
    {
        _commandsSucceeded += 1;
    }

    var methodTag = new KeyValuePair<string, object?>("method", method);
    double elapsedMilliseconds = duration.TotalMilliseconds;

    _commandsSucceededCounter.Add(1, methodTag);
    _commandLatencyHistogram.Record(elapsedMilliseconds, methodTag);
}
/// <summary>
/// Registers a failed command: raises the commands-failed total and the per-method failure
/// count, adds one to the commands-failed counter, and records the duration in milliseconds
/// on the command latency histogram. Both measurements carry the <c>method</c> and
/// <c>category</c> tags.
/// </summary>
/// <param name="method">Method name; used as the per-method key and as the <c>method</c> tag.</param>
/// <param name="category">Value attached to the measurements as the <c>category</c> tag.</param>
/// <param name="duration">Time that elapsed before the command failed.</param>
public void CommandFailed(string method, string category, TimeSpan duration)
{
    lock (_syncRoot)
    {
        // Total first, then the per-method breakdown, both under the same lock.
        _commandsFailed += 1;
        Increment(_commandFailuresByMethod, method);
    }

    var methodTag = new KeyValuePair<string, object?>("method", method);
    var categoryTag = new KeyValuePair<string, object?>("category", category);
    double elapsedMilliseconds = duration.TotalMilliseconds;

    _commandsFailedCounter.Add(1, methodTag, categoryTag);
    _commandLatencyHistogram.Record(elapsedMilliseconds, methodTag, categoryTag);
}
/// <summary>
/// Registers a received event: raises the events-received total, the per-family count, and
/// the per-session count, then adds one to the events-received counter tagged with the family.
/// The session identifier is not attached to the counter measurement.
/// </summary>
/// <param name="sessionId">Key used for the per-session event count.</param>
/// <param name="family">Key used for the per-family event count and the <c>family</c> tag.</param>
public void EventReceived(string sessionId, string family)
{
    // These updates do not take _syncRoot: the total uses Interlocked and the breakdowns
    // are concurrent dictionaries.
    Interlocked.Increment(ref _eventsReceived);
    _eventsByFamily.AddOrUpdate(family, 1, static (_, count) => count + 1);
    _eventsBySession.AddOrUpdate(sessionId, 1, static (_, count) => count + 1);

    KeyValuePair<string, object?> familyTag = new("family", family);
    _eventsReceivedCounter.Add(1, familyTag);
}
/// <summary>
/// Records, in milliseconds, how long an event-stream send took on the stream-send
/// histogram, tagged with the event family.
/// </summary>
/// <param name="family">Value attached to the measurement as the <c>family</c> tag.</param>
/// <param name="duration">Time the send took.</param>
public void RecordEventStreamSend(string family, TimeSpan duration)
{
    double sendMilliseconds = duration.TotalMilliseconds;
    KeyValuePair<string, object?> familyTag = new("family", family);

    _eventStreamSendLatencyHistogram.Record(sendMilliseconds, familyTag);
}
/// <summary>
/// Forwards to <see cref="SetWorkerEventQueueDepth"/> with the same argument.
/// </summary>
/// <param name="depth">New worker event queue depth.</param>
public void SetEventQueueDepth(int depth) => SetWorkerEventQueueDepth(depth);
/// <summary>
/// Stores the given value as the current worker event queue depth.
/// </summary>
/// <param name="depth">New queue depth; must not be negative.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="depth"/> is negative.</exception>
public void SetWorkerEventQueueDepth(int depth)
{
    if (depth >= 0)
    {
        lock (_syncRoot)
        {
            _workerEventQueueDepth = depth;
        }

        return;
    }

    throw new ArgumentOutOfRangeException(nameof(depth), depth, "Queue depth cannot be negative.");
}
/// <summary>
/// Adds <paramref name="delta"/> to the gRPC event stream queue depth and clamps the
/// result so the stored value is never below zero.
/// </summary>
/// <param name="delta">Signed amount to add to the current depth.</param>
public void AdjustGrpcEventStreamQueueDepth(int delta)
{
    lock (_syncRoot)
    {
        int adjusted = _grpcEventStreamQueueDepth + delta;
        _grpcEventStreamQueueDepth = adjusted < 0 ? 0 : adjusted;
    }
}
/// <summary>
/// Drops the per-session event count stored for the given session, if one exists.
/// </summary>
/// <param name="sessionId">Key of the per-session entry to remove.</param>
public void RemoveSessionEvents(string sessionId)
{
    // The removal result and the removed value are deliberately discarded.
    _ = _eventsBySession.TryRemove(sessionId, out long _);
}
/// <summary>
/// Registers a queue overflow: raises the overflow total and adds one to the
/// queue-overflows counter tagged with the queue name.
/// </summary>
/// <param name="queueName">Value attached to the measurement as the <c>queue</c> tag.</param>
public void QueueOverflow(string queueName)
{
    KeyValuePair<string, object?> queueTag = new("queue", queueName);

    lock (_syncRoot)
    {
        _queueOverflows += 1;
    }

    _queueOverflowsCounter.Add(1, queueTag);
}
/// <summary>
/// Registers a fault: raises the fault total and adds one to the faults counter
/// tagged with the category.
/// </summary>
/// <param name="category">Value attached to the measurement as the <c>category</c> tag.</param>
public void Fault(string category)
{
    KeyValuePair<string, object?> categoryTag = new("category", category);

    lock (_syncRoot)
    {
        _faults += 1;
    }

    _faultsCounter.Add(1, categoryTag);
}
/// <summary>
/// Registers a failed heartbeat: raises the heartbeat-failure total and adds one to the
/// heartbeat-failures counter tagged with the session identifier.
/// </summary>
/// <remarks>
/// NOTE(review): the <c>session_id</c> tag can take a distinct value per session; confirm that
/// this tag cardinality is acceptable for the metrics backend in use.
/// </remarks>
/// <param name="sessionId">Value attached to the measurement as the <c>session_id</c> tag.</param>
public void HeartbeatFailed(string sessionId)
{
    KeyValuePair<string, object?> sessionTag = new("session_id", sessionId);

    lock (_syncRoot)
    {
        _heartbeatFailures += 1;
    }

    _heartbeatFailuresCounter.Add(1, sessionTag);
}
/// <summary>
/// Registers a stream disconnect: raises the disconnect total and adds one to the
/// stream-disconnects counter tagged with the reason.
/// </summary>
/// <param name="reason">Value attached to the measurement as the <c>reason</c> tag.</param>
public void StreamDisconnected(string reason)
{
    KeyValuePair<string, object?> reasonTag = new("reason", reason);

    lock (_syncRoot)
    {
        _streamDisconnects += 1;
    }

    _streamDisconnectsCounter.Add(1, reasonTag);
}
/// <summary>
/// Registers a retry attempt: raises the retry total and the per-area retry count, then
/// adds one to the retry-attempts counter tagged with the area.
/// </summary>
/// <param name="area">Area name; used as the per-area key and as the <c>area</c> tag.</param>
public void RetryAttempted(string area)
{
    lock (_syncRoot)
    {
        // Total first, then the per-area breakdown, both under the same lock.
        _retryAttempts += 1;
        Increment(_retryAttemptsByArea, area);
    }

    var areaTag = new KeyValuePair<string, object?>("area", area);
    _retryAttemptsCounter.Add(1, areaTag);
}
/// <summary>
/// Returns a copy of all current metric values. The returned dictionaries are independent
/// copies, so later recordings do not change a snapshot that was already handed out.
/// </summary>
/// <returns>A <see cref="GatewayMetricsSnapshot"/> holding the values read during this call.</returns>
public GatewayMetricsSnapshot GetSnapshot()
{
    // The event breakdowns are concurrent dictionaries that EventReceived updates without
    // taking _syncRoot. Enumerating them directly is not a moment-in-time view, so each one
    // is copied with ConcurrentDictionary.ToArray(), which copies under the dictionary's own
    // locks. This is done before entering _syncRoot because that lock does not guard them.
    KeyValuePair<string, long>[] eventsByFamily = _eventsByFamily.ToArray();
    KeyValuePair<string, long>[] eventsBySession = _eventsBySession.ToArray();

    lock (_syncRoot)
    {
        return new GatewayMetricsSnapshot(
            OpenSessions: _openSessions,
            WorkersRunning: _workersRunning,
            WorkerEventQueueDepth: _workerEventQueueDepth,
            GrpcEventStreamQueueDepth: _grpcEventStreamQueueDepth,
            SessionsOpened: _sessionsOpened,
            SessionsClosed: _sessionsClosed,
            CommandsStarted: _commandsStarted,
            CommandsSucceeded: _commandsSucceeded,
            CommandsFailed: _commandsFailed,
            // Updated through Interlocked outside the lock, so it is read the same way.
            EventsReceived: Interlocked.Read(ref _eventsReceived),
            QueueOverflows: _queueOverflows,
            Faults: _faults,
            WorkerKills: _workerKills,
            WorkerExits: _workerExits,
            HeartbeatFailures: _heartbeatFailures,
            StreamDisconnects: _streamDisconnects,
            RetryAttempts: _retryAttempts,
            CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
            EventsByFamily: new Dictionary<string, long>(eventsByFamily, StringComparer.OrdinalIgnoreCase),
            EventsBySession: new Dictionary<string, long>(eventsBySession, StringComparer.Ordinal),
            RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase));
    }
}
/// <summary>
/// Disposes the underlying <see cref="Meter"/>. Calls after the first one do nothing.
/// </summary>
public void Dispose()
{
    if (!_disposed)
    {
        _meter.Dispose();
        _disposed = true;
    }
}
/// <summary>
/// Reads the current open-session count under the lock; used as an observable gauge callback.
/// </summary>
private int GetOpenSessions()
{
    int current;
    lock (_syncRoot)
    {
        current = _openSessions;
    }

    return current;
}
/// <summary>
/// Reads the current running-worker count under the lock; used as an observable gauge callback.
/// </summary>
private int GetWorkersRunning()
{
    int current;
    lock (_syncRoot)
    {
        current = _workersRunning;
    }

    return current;
}
/// <summary>
/// Reads the current worker event queue depth under the lock; used as an observable gauge callback.
/// </summary>
private int GetWorkerEventQueueDepth()
{
    int current;
    lock (_syncRoot)
    {
        current = _workerEventQueueDepth;
    }

    return current;
}
/// <summary>
/// Reads the current gRPC event stream queue depth under the lock; used as an observable gauge callback.
/// </summary>
private int GetGrpcEventStreamQueueDepth()
{
    int current;
    lock (_syncRoot)
    {
        current = _grpcEventStreamQueueDepth;
    }

    return current;
}
/// <summary>
/// Adds one to the count stored under <paramref name="key"/>, starting from zero when the
/// key is not present yet. The dictionary is not synchronized here.
/// </summary>
/// <param name="values">Dictionary holding the per-key counts.</param>
/// <param name="key">Key whose count is raised.</param>
private static void Increment(Dictionary<string, long> values, string key)
{
    long next = values.TryGetValue(key, out long existing) ? existing + 1 : 1;
    values[key] = next;
}
/// <summary>
/// Adds one to the count stored under <paramref name="key"/>, inserting the value 1 when the
/// key is not present yet.
/// </summary>
/// <param name="values">Concurrent dictionary holding the per-key counts.</param>
/// <param name="key">Key whose count is raised.</param>
private static void Increment(ConcurrentDictionary<string, long> values, string key)
    => values.AddOrUpdate(key, 1L, static (_, previous) => previous + 1L);
}
@@ -0,0 +1,24 @@
namespace ZB.MOM.WW.MxGateway.Server.Metrics;
/// <summary>
/// Immutable set of gateway metric values, as produced by <c>GatewayMetrics.GetSnapshot</c>.
/// </summary>
/// <param name="OpenSessions">Sessions currently counted as open.</param>
/// <param name="WorkersRunning">Workers currently counted as running.</param>
/// <param name="WorkerEventQueueDepth">Most recently set worker event queue depth.</param>
/// <param name="GrpcEventStreamQueueDepth">Current gRPC event stream queue depth.</param>
/// <param name="SessionsOpened">Total number of sessions opened.</param>
/// <param name="SessionsClosed">Total number of sessions closed.</param>
/// <param name="CommandsStarted">Total number of commands started.</param>
/// <param name="CommandsSucceeded">Total number of commands that succeeded.</param>
/// <param name="CommandsFailed">Total number of commands that failed.</param>
/// <param name="EventsReceived">Total number of events received.</param>
/// <param name="QueueOverflows">Total number of queue overflows.</param>
/// <param name="Faults">Total number of faults.</param>
/// <param name="WorkerKills">Total number of workers killed.</param>
/// <param name="WorkerExits">Total number of workers stopped.</param>
/// <param name="HeartbeatFailures">Total number of failed heartbeats.</param>
/// <param name="StreamDisconnects">Total number of stream disconnects.</param>
/// <param name="RetryAttempts">Total number of retry attempts.</param>
/// <param name="CommandFailuresByMethod">Failed-command counts keyed by method name.</param>
/// <param name="EventsByFamily">Received-event counts keyed by event family.</param>
/// <param name="EventsBySession">Received-event counts keyed by session identifier.</param>
/// <param name="RetryAttemptsByArea">Retry-attempt counts keyed by area.</param>
public sealed record GatewayMetricsSnapshot(
int OpenSessions,
int WorkersRunning,
int WorkerEventQueueDepth,
int GrpcEventStreamQueueDepth,
long SessionsOpened,
long SessionsClosed,
long CommandsStarted,
long CommandsSucceeded,
long CommandsFailed,
long EventsReceived,
long QueueOverflows,
long Faults,
long WorkerKills,
long WorkerExits,
long HeartbeatFailures,
long StreamDisconnects,
long RetryAttempts,
IReadOnlyDictionary<string, long> CommandFailuresByMethod,
IReadOnlyDictionary<string, long> EventsByFamily,
IReadOnlyDictionary<string, long> EventsBySession,
IReadOnlyDictionary<string, long> RetryAttemptsByArea);