rename: prefix gateway projects/namespaces with ZB.MOM.WW + sln→slnx

Apply the ZB.MOM.WW. prefix to all gateway-side projects, folders,
.csproj/.sln contents, C# namespaces, using directives, generated proto
C# (csharp_namespace + checked-in generated files), InternalsVisibleTo
attributes, project-name string literals (LoadProject, .sln lookups,
worker exe paths, staticwebassets manifest), and the install/script/doc
references that point at any of the above. Migrate the solution from
.sln to .slnx via `dotnet sln migrate` and delete the old file.

External-runtime identifiers are intentionally NOT prefixed so external
configuration keeps working:
- GatewayMetrics.cs MeterName ("MxGateway.Server")
- DashboardAuthenticationDefaults Scheme/Policy ("MxGateway.Dashboard")
- GatewayRequestLoggingMiddleware logger category ("MxGateway.Request")
- StaRuntime thread name ("MxGateway.Worker.STA")
- appsettings.json root section "MxGateway" + env-var prefix
  MxGateway__... and secret-name MxGateway:ApiKeyPepper
- C:\ProgramData\MxGateway\ data dir paths

Also fixes two tests that were not rename-related but became visible
while validating the rename:

- WorkerLiveMxAccessSmokeTests.ShutDownAsync: cancellation that the
  gateway service correctly maps to RpcException(Cancelled) per gRPC
  convention was being misclassified as a stream fault. Added a sibling
  catch on RpcException with StatusCode.Cancelled.

- IntegrationTestEnvironment.ResolveRepositoryRoot: extracted IsRepositoryRoot
  and made it accept either a .git marker OR a .sln/.slnx next to src/
  so the worker-exe walker works in non-git working copies.

clients/proto/proto-inputs.json's protoRoot updated to point at
src/ZB.MOM.WW.MxGateway.Contracts/Protos.

Verified by `dotnet build` and a full `dotnet test` of the .slnx with
MXGATEWAY_RUN_LIVE_{MXACCESS,LDAP,GALAXY}_TESTS=1:
  Tests: 472/472 pass
  Worker.Tests: 280/280 pass (4 dev-rig [Fact(Skip=...)] skipped)
  IntegrationTests: 18/18 pass

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 16:22:23 -04:00
parent 867bf18116
commit dc9c0c950c
491 changed files with 32854 additions and 8414 deletions
@@ -0,0 +1,29 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Abstraction over OS process enumeration and termination. Exists so the
/// orphan-worker cleanup logic can be unit-tested without spawning real
/// processes.
/// </summary>
/// <remarks>
/// <see cref="SystemRunningProcessInspector"/> is the implementation backed by
/// <see cref="System.Diagnostics.Process"/>; <see cref="OrphanWorkerTerminator"/>
/// is the consumer.
/// </remarks>
public interface IRunningProcessInspector
{
/// <summary>
/// Enumerates currently running processes whose image name (without the
/// <c>.exe</c> extension) matches <paramref name="processName"/>.
/// </summary>
/// <param name="processName">Process image name to match, without extension.</param>
/// <returns>
/// The matching running processes. <see cref="RunningProcessInfo.ExecutablePath"/>
/// is <see langword="null"/> for entries whose main module could not be read.
/// </returns>
IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName);
/// <summary>Forcibly terminates the process with the given identifier.</summary>
/// <param name="processId">Identifier of the process to terminate.</param>
/// <remarks>
/// May throw when the process has already exited or is inaccessible;
/// <see cref="OrphanWorkerTerminator"/> catches and logs such failures.
/// </remarks>
void Kill(int processId);
}
/// <summary>
/// Identifying information for a running process candidate, as returned by
/// <see cref="IRunningProcessInspector.GetProcessesByName"/>.
/// </summary>
/// <param name="ProcessId">Operating-system process identifier.</param>
/// <param name="ExecutablePath">
/// Fully-qualified path to the process main module, or <see langword="null"/>
/// when it could not be read (e.g. access denied).
/// </param>
public sealed record RunningProcessInfo(int ProcessId, string? ExecutablePath);
@@ -0,0 +1,45 @@
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>Manages communication with a single worker process via a named pipe.</summary>
/// <remarks>
/// The current lifecycle position is exposed through <see cref="State"/>.
/// Implementations are asynchronously disposable.
/// </remarks>
public interface IWorkerClient : IAsyncDisposable
{
/// <summary>Unique session identifier for this worker.</summary>
string SessionId { get; }
/// <summary>Process ID of the worker, or null before handshake completes.</summary>
int? ProcessId { get; }
/// <summary>Current state of the worker connection.</summary>
WorkerClientState State { get; }
/// <summary>UTC timestamp of the most recent heartbeat from the worker.</summary>
DateTimeOffset LastHeartbeatAt { get; }
/// <summary>Initiates the handshake and enters ready state.</summary>
/// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
/// <returns>A task representing the asynchronous start operation.</returns>
Task StartAsync(CancellationToken cancellationToken);
/// <summary>Sends a command to the worker and waits for a reply.</summary>
/// <param name="command">Worker command to invoke.</param>
/// <param name="timeout">Timeout for waiting for the reply.</param>
/// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
/// <returns>The worker's reply to <paramref name="command"/>.</returns>
Task<WorkerCommandReply> InvokeAsync(
WorkerCommand command,
TimeSpan timeout,
CancellationToken cancellationToken);
/// <summary>Reads events from the worker as they arrive.</summary>
/// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
/// <returns>An asynchronous stream of worker events.</returns>
IAsyncEnumerable<WorkerEvent> ReadEventsAsync(CancellationToken cancellationToken);
/// <summary>Gracefully shuts down the worker by closing the connection.</summary>
/// <param name="timeout">Timeout for shutdown.</param>
/// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
/// <returns>A task representing the asynchronous shutdown operation.</returns>
Task ShutdownAsync(TimeSpan timeout, CancellationToken cancellationToken);
/// <summary>Terminates the worker process immediately with a diagnostic reason.</summary>
/// <param name="reason">Reason for terminating the worker.</param>
void Kill(string reason);
}
@@ -0,0 +1,34 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Abstraction over a worker process with lifecycle and exit-code operations.
/// </summary>
/// <remarks>
/// <see cref="SystemWorkerProcess"/> is the implementation that wraps a
/// <see cref="System.Diagnostics.Process"/>.
/// </remarks>
public interface IWorkerProcess : IDisposable
{
/// <summary>
/// The process ID.
/// </summary>
int Id { get; }
/// <summary>
/// Indicates whether the process has exited.
/// </summary>
bool HasExited { get; }
/// <summary>
/// The exit code if the process has exited; otherwise null.
/// </summary>
int? ExitCode { get; }
/// <summary>
/// Waits for the process to exit with the specified cancellation token.
/// </summary>
/// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
/// <returns>A task-like value that completes when the process has exited.</returns>
ValueTask WaitForExitAsync(CancellationToken cancellationToken);
/// <summary>
/// Kills the process, optionally terminating the entire process tree.
/// </summary>
/// <param name="entireProcessTree">If true, terminate all child processes; otherwise terminate only this process.</param>
void Kill(bool entireProcessTree);
}
@@ -0,0 +1,12 @@
using System.Diagnostics;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>Factory for creating and starting worker processes.</summary>
/// <remarks>
/// Abstracts process creation behind <see cref="IWorkerProcess"/>;
/// <see cref="SystemWorkerProcessFactory"/> is the implementation that starts
/// real operating-system processes.
/// </remarks>
public interface IWorkerProcessFactory
{
/// <summary>Starts a worker process with the specified start information.</summary>
/// <param name="startInfo">Process start configuration.</param>
/// <returns>The started worker process.</returns>
/// <remarks>
/// <see cref="SystemWorkerProcessFactory"/> throws
/// <see cref="InvalidOperationException"/> when the process fails to start.
/// </remarks>
IWorkerProcess Start(ProcessStartInfo startInfo);
}
@@ -0,0 +1,12 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Launches worker processes described by a <see cref="WorkerProcessLaunchRequest"/>
/// and returns a <see cref="WorkerProcessHandle"/> for each.
/// </summary>
public interface IWorkerProcessLauncher
{
/// <summary>Launches a new worker process with the specified configuration.</summary>
/// <param name="request">The launch request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The worker process handle.</returns>
Task<WorkerProcessHandle> LaunchAsync(
WorkerProcessLaunchRequest request,
CancellationToken cancellationToken = default);
}
@@ -0,0 +1,16 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Probe that waits for a launched worker process to become ready.
/// </summary>
public interface IWorkerStartupProbe
{
/// <summary>
/// Waits for the worker process to reach a ready state asynchronously.
/// </summary>
/// <param name="process">Worker process to probe.</param>
/// <param name="request">Worker launch request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>A task representing the asynchronous wait.</returns>
// NOTE(review): what counts as "ready" and how failure is signalled is
// decided by the implementation, which is not visible here.
Task WaitUntilReadyAsync(
IWorkerProcess process,
WorkerProcessLaunchRequest request,
CancellationToken cancellationToken);
}
@@ -0,0 +1,30 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Hosted service that performs a single, best-effort sweep for leftover
/// MXAccess worker processes when the gateway host starts it.
/// </summary>
public sealed class OrphanWorkerCleanupHostedService(
    OrphanWorkerTerminator terminator,
    ILogger<OrphanWorkerCleanupHostedService> logger) : IHostedService
{
    /// <inheritdoc />
    public Task StartAsync(CancellationToken cancellationToken)
    {
        SweepOrphansBestEffort();
        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task StopAsync(CancellationToken cancellationToken)
    {
        // Nothing to tear down: the sweep is a one-shot performed in StartAsync.
        return Task.CompletedTask;
    }

    /// <summary>
    /// Runs the terminator and downgrades any failure to a warning, because a
    /// failed cleanup must never stop the gateway from starting.
    /// </summary>
    private void SweepOrphansBestEffort()
    {
        try
        {
            terminator.TerminateOrphans();
        }
        catch (Exception cleanupFailure)
        {
            logger.LogWarning(cleanupFailure, "Orphan worker cleanup failed on startup.");
        }
    }
}
@@ -0,0 +1,138 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Metrics;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Terminates leftover MXAccess worker processes on gateway startup.
/// <para>
/// Per <c>gateway.md</c> ("first version should terminate orphaned workers
/// on startup") and CLAUDE.md, a gateway restart does not reattach old
/// workers. After an unclean gateway crash, x86 worker processes — each
/// holding an MXAccess COM instance on an STA — survive indefinitely. This
/// terminator finds those processes by executable name/path and kills them
/// before the restarted gateway accepts sessions.
/// </para>
/// </summary>
public sealed class OrphanWorkerTerminator
{
    private readonly IRunningProcessInspector _inspector;
    private readonly GatewayMetrics _metrics;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<OrphanWorkerTerminator> _logger;

    /// <summary>Initializes a new instance of the <see cref="OrphanWorkerTerminator"/> class.</summary>
    /// <param name="gatewayOptions">Gateway configuration options.</param>
    /// <param name="inspector">Running-process inspector.</param>
    /// <param name="metrics">Gateway metrics collector.</param>
    /// <param name="logger">Optional logger for diagnostic output.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="gatewayOptions"/>, <paramref name="inspector"/> or
    /// <paramref name="metrics"/> is <see langword="null"/>.
    /// </exception>
    public OrphanWorkerTerminator(
        IOptions<GatewayOptions> gatewayOptions,
        IRunningProcessInspector inspector,
        GatewayMetrics metrics,
        ILogger<OrphanWorkerTerminator>? logger = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        _inspector = inspector ?? throw new ArgumentNullException(nameof(inspector));
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _workerOptions = gatewayOptions.Value.Worker;
        _logger = logger ?? NullLogger<OrphanWorkerTerminator>.Instance;
    }

    /// <summary>
    /// Finds and kills every leftover worker process. Safe to call once at
    /// startup before any session-owned worker is launched.
    /// </summary>
    /// <returns>The number of orphan worker processes that were terminated.</returns>
    /// <remarks>
    /// Failures to kill an individual process are logged and skipped. When the
    /// configured executable path yields no image name, nothing is enumerated
    /// or killed and <c>0</c> is returned.
    /// </remarks>
    public int TerminateOrphans()
    {
        string? configuredPath = ResolveConfiguredExecutablePath();
        string processName = ResolveProcessName(configuredPath);

        // A path without a file-name component (for example a directory path
        // ending in a separator) produces an empty image name. Candidates with
        // unreadable paths are matched by image name alone, so an empty filter
        // cannot safely identify the worker; skip the sweep instead of risking
        // unrelated processes.
        if (string.IsNullOrWhiteSpace(processName))
        {
            _logger.LogWarning(
                "Configured worker executable path '{ExecutablePath}' has no file name; skipping orphan worker cleanup.",
                _workerOptions.ExecutablePath);
            return 0;
        }

        int currentProcessId = Environment.ProcessId;
        int terminated = 0;

        foreach (RunningProcessInfo candidate in _inspector.GetProcessesByName(processName))
        {
            // Never kill the gateway itself, even if it shares the image name.
            if (candidate.ProcessId == currentProcessId)
            {
                continue;
            }

            if (!IsOrphanWorker(candidate, configuredPath))
            {
                continue;
            }

            try
            {
                _inspector.Kill(candidate.ProcessId);
                _metrics.WorkerKilled("OrphanStartupCleanup");
                terminated++;
                _logger.LogWarning(
                    "Terminated orphan worker process {ProcessId} ({ExecutablePath}) left over from a previous gateway run.",
                    candidate.ProcessId,
                    candidate.ExecutablePath ?? processName);
            }
            catch (Exception exception)
            {
                // The process may have already exited, or be inaccessible.
                // A failure to kill one orphan must not block gateway startup.
                _logger.LogWarning(
                    exception,
                    "Failed to terminate orphan worker process {ProcessId}.",
                    candidate.ProcessId);
            }
        }

        if (terminated > 0)
        {
            _logger.LogInformation("Terminated {Count} orphan worker process(es) on startup.", terminated);
        }

        return terminated;
    }

    /// <summary>Decides whether a name-matched candidate should be treated as an orphan worker.</summary>
    /// <param name="candidate">Process that matched the worker image name.</param>
    /// <param name="configuredPath">Full configured worker path, or <see langword="null"/> when it could not be resolved.</param>
    /// <returns><see langword="true"/> when the candidate should be terminated.</returns>
    private static bool IsOrphanWorker(RunningProcessInfo candidate, string? configuredPath)
    {
        // When the executable path is readable, require an exact match against
        // the configured worker path so unrelated processes that merely share
        // the image name are never killed.
        if (candidate.ExecutablePath is { } path)
        {
            return configuredPath is not null
                && string.Equals(path, configuredPath, StringComparison.OrdinalIgnoreCase);
        }

        // A null path means the x64 gateway could not introspect the module —
        // the expected case for the x86 worker. Image-name match is the only
        // signal available; treat it as an orphan.
        return true;
    }

    /// <summary>
    /// Resolves the configured worker executable path to a full path.
    /// </summary>
    /// <returns>
    /// The full path, or <see langword="null"/> (after logging a warning) when
    /// the configured value is not a valid filesystem path.
    /// </returns>
    private string? ResolveConfiguredExecutablePath()
    {
        try
        {
            return Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException
            or NotSupportedException
            or PathTooLongException)
        {
            _logger.LogWarning(
                exception,
                "Configured worker executable path '{ExecutablePath}' is not a valid filesystem path; "
                + "orphan cleanup will match by image name only.",
                _workerOptions.ExecutablePath);
            return null;
        }
    }

    /// <summary>
    /// Derives the process image name (file name without extension) to search for.
    /// </summary>
    /// <param name="configuredPath">Resolved worker path, or <see langword="null"/> to fall back to the default worker executable name.</param>
    /// <returns>The image name; empty when <paramref name="configuredPath"/> has no file-name component.</returns>
    private static string ResolveProcessName(string? configuredPath)
    {
        string source = configuredPath ?? "ZB.MOM.WW.MxGateway.Worker.exe";
        return Path.GetFileNameWithoutExtension(source);
    }
}
@@ -0,0 +1,55 @@
using System.Diagnostics;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Production <see cref="IRunningProcessInspector"/> that delegates to
/// <see cref="Process"/>.
/// </summary>
public sealed class SystemRunningProcessInspector : IRunningProcessInspector
{
    /// <inheritdoc />
    public IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName)
    {
        List<RunningProcessInfo> snapshot = [];
        Process[] matches = Process.GetProcessesByName(processName);

        try
        {
            for (int index = 0; index < matches.Length; index++)
            {
                Process match = matches[index];
                string? executablePath = TryGetExecutablePath(match);
                snapshot.Add(new RunningProcessInfo(match.Id, executablePath));
            }
        }
        finally
        {
            // Only plain data leaves this method, so every Process wrapper is
            // released here regardless of how enumeration ended.
            for (int index = 0; index < matches.Length; index++)
            {
                matches[index].Dispose();
            }
        }

        return snapshot;
    }

    /// <inheritdoc />
    public void Kill(int processId)
    {
        using (Process target = Process.GetProcessById(processId))
        {
            target.Kill(entireProcessTree: true);
        }
    }

    /// <summary>
    /// Reads the main-module file name of <paramref name="process"/>, returning
    /// <see langword="null"/> when the module cannot be inspected.
    /// </summary>
    private static string? TryGetExecutablePath(Process process)
    {
        // Access to the main module can be denied (e.g. a 64-bit gateway
        // querying a 32-bit worker, or a process owned by another user).
        try
        {
            ProcessModule? mainModule = process.MainModule;
            return mainModule?.FileName;
        }
        catch (InvalidOperationException)
        {
            return null;
        }
        catch (System.ComponentModel.Win32Exception)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }
}
@@ -0,0 +1,36 @@
using System.Diagnostics;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// <see cref="IWorkerProcess"/> adapter over a <see cref="Process"/>; every
/// member forwards to the wrapped instance.
/// </summary>
internal sealed class SystemWorkerProcess(Process process) : IWorkerProcess
{
    /// <inheritdoc />
    public int Id
    {
        get { return process.Id; }
    }

    /// <inheritdoc />
    public bool HasExited
    {
        get { return process.HasExited; }
    }

    /// <inheritdoc />
    public int? ExitCode
    {
        get
        {
            // The exit code is only read once the process has exited.
            if (!process.HasExited)
            {
                return null;
            }

            return process.ExitCode;
        }
    }

    /// <inheritdoc />
    public async ValueTask WaitForExitAsync(CancellationToken cancellationToken)
    {
        Task exited = process.WaitForExitAsync(cancellationToken);
        await exited.ConfigureAwait(false);
    }

    /// <inheritdoc />
    public void Kill(bool entireProcessTree) => process.Kill(entireProcessTree);

    /// <inheritdoc />
    public void Dispose() => process.Dispose();
}
@@ -0,0 +1,26 @@
using System.Diagnostics;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Factory that creates system processes for workers.
/// </summary>
public sealed class SystemWorkerProcessFactory : IWorkerProcessFactory
{
    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="startInfo"/> is <see langword="null"/>.</exception>
    /// <exception cref="InvalidOperationException">
    /// <see cref="Process.Start()"/> reported that no new process was started.
    /// </exception>
    /// <remarks>
    /// Exceptions thrown by <see cref="Process.Start()"/> itself propagate
    /// unchanged. The underlying <see cref="Process"/> object is disposed on
    /// every failure path.
    /// </remarks>
    public IWorkerProcess Start(ProcessStartInfo startInfo)
    {
        ArgumentNullException.ThrowIfNull(startInfo);

        Process process = new()
        {
            StartInfo = startInfo,
        };

        try
        {
            if (!process.Start())
            {
                throw new InvalidOperationException("Worker process failed to start.");
            }

            // Ownership of the Process transfers to the returned wrapper.
            return new SystemWorkerProcess(process);
        }
        catch
        {
            // Process.Start can throw (missing executable, access denied, ...);
            // without this the Process object would never be disposed.
            process.Dispose();
            throw;
        }
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,49 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Immutable bundle of everything a worker client needs to talk to one worker:
/// session id, handshake nonce, IPC stream, frame options and, optionally, the
/// worker process handle.
/// </summary>
public sealed class WorkerClientConnection
{
    /// <summary>Creates a connection descriptor after validating the required arguments.</summary>
    /// <param name="sessionId">Identifier of the session.</param>
    /// <param name="nonce">Worker handshake nonce.</param>
    /// <param name="stream">Named pipe stream for IPC communication.</param>
    /// <param name="frameOptions">Frame protocol serialization options.</param>
    /// <param name="processHandle">Worker process handle, if available.</param>
    /// <exception cref="ArgumentException"><paramref name="sessionId"/> or <paramref name="nonce"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="frameOptions"/> is <see langword="null"/>.</exception>
    public WorkerClientConnection(
        string sessionId,
        string nonce,
        Stream stream,
        WorkerFrameProtocolOptions frameOptions,
        WorkerProcessHandle? processHandle = null)
    {
        // Checks run in parameter order so the first offending argument is the
        // one reported.
        bool sessionIdMissing = string.IsNullOrWhiteSpace(sessionId);
        if (sessionIdMissing)
        {
            throw new ArgumentException("Session id is required.", nameof(sessionId));
        }

        bool nonceMissing = string.IsNullOrWhiteSpace(nonce);
        if (nonceMissing)
        {
            throw new ArgumentException("Worker nonce is required.", nameof(nonce));
        }

        if (stream is null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        if (frameOptions is null)
        {
            throw new ArgumentNullException(nameof(frameOptions));
        }

        SessionId = sessionId;
        Nonce = nonce;
        Stream = stream;
        FrameOptions = frameOptions;
        ProcessHandle = processHandle;
    }

    /// <summary>Session identifier this connection belongs to.</summary>
    public string SessionId { get; }

    /// <summary>Nonce used during the worker handshake.</summary>
    public string Nonce { get; }

    /// <summary>Stream carrying the IPC traffic.</summary>
    public Stream Stream { get; }

    /// <summary>Frame protocol options used for serialization.</summary>
    public WorkerFrameProtocolOptions FrameOptions { get; }

    /// <summary>Handle of the worker process, or <see langword="null"/> when none was supplied.</summary>
    public WorkerProcessHandle? ProcessHandle { get; }
}
@@ -0,0 +1,15 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Classifies failures surfaced through <see cref="WorkerClientException.ErrorCode"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the per-member descriptions are inferred from the member names
/// and from the option/interface docs in this namespace; the code that raises
/// them is not visible here — confirm against the worker client implementation.
/// </remarks>
public enum WorkerClientErrorCode
{
/// <summary>Presumably: the operation is not allowed in the client's current <see cref="WorkerClientState"/>.</summary>
InvalidState,
/// <summary>Presumably: the worker sent something that violates the IPC protocol.</summary>
ProtocolViolation,
/// <summary>Presumably: the pipe to the worker was disconnected.</summary>
PipeDisconnected,
/// <summary>Presumably: a command did not receive its reply within the supplied timeout.</summary>
CommandTimeout,
/// <summary>Presumably: the worker reported a fault or the client is faulted.</summary>
WorkerFaulted,
/// <summary>
/// Heartbeat watchdog fault; see <see cref="WorkerClientOptions.HeartbeatGrace"/>
/// and <see cref="WorkerClientOptions.HeartbeatStuckCeiling"/>.
/// </summary>
HeartbeatExpired,
/// <summary>Presumably: shutdown did not finish within the supplied timeout.</summary>
ShutdownTimeout,
/// <summary>Presumably: the gateway itself is shutting down.</summary>
GatewayShutdown,
/// <summary>Presumably: writing to the worker failed.</summary>
WriteFailed,
/// <summary>Presumably: more commands were pending than <see cref="WorkerClientOptions.MaxPendingCommands"/> allows.</summary>
PendingCommandLimitExceeded,
}
@@ -0,0 +1,40 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Thrown when talking to a worker process fails; <see cref="ErrorCode"/>
/// says which kind of failure occurred.
/// </summary>
public sealed class WorkerClientException : Exception
{
    /// <summary>
    /// Creates the exception from an error code and a message.
    /// </summary>
    /// <param name="errorCode">Worker client error code classifying the failure.</param>
    /// <param name="message">Diagnostic message.</param>
    public WorkerClientException(WorkerClientErrorCode errorCode, string message)
        : base(message)
        => ErrorCode = errorCode;

    /// <summary>
    /// Creates the exception from an error code, a message and the exception
    /// that caused it.
    /// </summary>
    /// <param name="errorCode">Worker client error code classifying the failure.</param>
    /// <param name="message">Diagnostic message.</param>
    /// <param name="innerException">Underlying exception.</param>
    public WorkerClientException(WorkerClientErrorCode errorCode, string message, Exception innerException)
        : base(message, innerException)
        => ErrorCode = errorCode;

    /// <summary>
    /// Error code classifying the failure.
    /// </summary>
    public WorkerClientErrorCode ErrorCode { get; }
}
@@ -0,0 +1,68 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>Tunable settings that govern worker client behavior.</summary>
public sealed class WorkerClientOptions
{
    /// <summary>Default for <see cref="HeartbeatGrace"/>: 15 seconds.</summary>
    public static readonly TimeSpan DefaultHeartbeatGrace = TimeSpan.FromSeconds(15);

    /// <summary>Default for <see cref="HeartbeatCheckInterval"/>: 1 second.</summary>
    public static readonly TimeSpan DefaultHeartbeatCheckInterval = TimeSpan.FromSeconds(1);

    /// <summary>Default for <see cref="EventChannelFullModeTimeout"/>: 5 seconds.</summary>
    public static readonly TimeSpan DefaultEventChannelFullModeTimeout = TimeSpan.FromSeconds(5);

    /// <summary>
    /// Default for <see cref="HeartbeatStuckCeiling"/>: 75 seconds. Mirrors
    /// <see cref="ZB.MOM.WW.MxGateway.Worker.Ipc.WorkerPipeSessionOptions.DefaultHeartbeatStuckCeiling"/>
    /// on the worker side (Worker-023). Once a command has been in flight
    /// longer than this, the gateway-side heartbeat watchdog fires
    /// regardless of pending commands — a truly stuck COM call shouldn't
    /// hide the worker forever.
    /// </summary>
    public static readonly TimeSpan DefaultHeartbeatStuckCeiling = TimeSpan.FromSeconds(75);

    /// <summary>Creates an options instance in which every setting holds its default value.</summary>
    public WorkerClientOptions()
    {
        // Defaults are supplied by the property initializers below.
    }

    /// <summary>Maximum allowed age of the last heartbeat before faulting the client.</summary>
    public TimeSpan HeartbeatGrace { get; init; } = DefaultHeartbeatGrace;

    /// <summary>Interval at which to check for heartbeat expiration.</summary>
    public TimeSpan HeartbeatCheckInterval { get; init; } = DefaultHeartbeatCheckInterval;

    /// <summary>Maximum number of events buffered before backpressure is applied (default 1024).</summary>
    public int EventChannelCapacity { get; init; } = 1_024;

    /// <summary>
    /// Time to wait for the gateway-side event channel to drain before
    /// faulting the worker. Honored by <c>EnqueueWorkerEventAsync</c> via
    /// <c>WriteAsync</c>; with the channel configured for
    /// <c>BoundedChannelFullMode.Wait</c>, a transient backlog only faults
    /// after the configured timeout has elapsed (Server-032). Pre-Server-032
    /// the field was declared but unused — overflow faulted immediately.
    /// </summary>
    public TimeSpan EventChannelFullModeTimeout { get; init; } = DefaultEventChannelFullModeTimeout;

    /// <summary>Maximum number of concurrent pending commands (default 128).</summary>
    public int MaxPendingCommands { get; init; } = 128;

    /// <summary>
    /// Server-031: ceiling on the in-flight-command heartbeat-skip. When
    /// a command has been pending on the gateway↔worker pipe for longer
    /// than this, the gateway-side <c>HeartbeatLoopAsync</c> fires the
    /// <c>HeartbeatExpired</c> fault even if commands are still pending;
    /// a truly stuck COM call shouldn't keep the watchdog suppressed
    /// indefinitely. Mirrors Worker-023's <c>HeartbeatStuckCeiling</c> on
    /// the worker side.
    /// </summary>
    public TimeSpan HeartbeatStuckCeiling { get; init; } = DefaultHeartbeatStuckCeiling;
}
@@ -0,0 +1,11 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Lifecycle states reported by <see cref="IWorkerClient.State"/>.
/// </summary>
/// <remarks>
/// NOTE(review): member meanings are inferred from the names and from the
/// <see cref="IWorkerClient"/> docs; the transitions are implemented elsewhere —
/// confirm before relying on them.
/// </remarks>
public enum WorkerClientState
{
/// <summary>Presumably the initial state, before <see cref="IWorkerClient.StartAsync"/> is called.</summary>
Created,
/// <summary>Presumably set while the handshake started by <see cref="IWorkerClient.StartAsync"/> is in progress.</summary>
Handshaking,
/// <summary>Presumably set once the handshake has completed.</summary>
Ready,
/// <summary>Presumably set while a shutdown is in progress.</summary>
Closing,
/// <summary>Presumably set once the connection has been closed.</summary>
Closed,
/// <summary>Presumably set after an unrecoverable failure, such as an expired heartbeat.</summary>
Faulted,
}
@@ -0,0 +1,40 @@
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Checks incoming worker envelopes against what the owning session expects.
/// </summary>
internal static class WorkerEnvelopeValidator
{
    /// <summary>
    /// Verifies, in order, the protocol version, the session id and the
    /// presence of a typed body.
    /// </summary>
    /// <param name="envelope">The envelope to validate.</param>
    /// <param name="options">The frame protocol configuration.</param>
    /// <exception cref="WorkerFrameProtocolException">The first check that fails; its error code identifies which one.</exception>
    public static void Validate(
        WorkerEnvelope envelope,
        WorkerFrameProtocolOptions options)
    {
        bool versionMatches = envelope.ProtocolVersion == options.ProtocolVersion;
        if (!versionMatches)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.ProtocolVersionMismatch,
                $"Worker envelope protocol version {envelope.ProtocolVersion} does not match expected version {options.ProtocolVersion}.");
        }

        // Session ids are compared ordinally (case-sensitive).
        bool sessionMatches = string.Equals(envelope.SessionId, options.SessionId, StringComparison.Ordinal);
        if (!sessionMatches)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.SessionMismatch,
                "Worker envelope session id does not match the owning gateway session.");
        }

        bool hasBody = envelope.BodyCase != WorkerEnvelope.BodyOneofCase.None;
        if (!hasBody)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.InvalidEnvelope,
                "Worker envelope must include a typed body.");
        }
    }
}
@@ -0,0 +1,87 @@
using System.Buffers.Binary;
using ZB.MOM.WW.MxGateway.Server.Configuration;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Checks that a worker executable is a PE image built for the required CPU
/// architecture by reading the machine field of its PE header.
/// </summary>
internal static class WorkerExecutableValidator
{
    // IMAGE_FILE_MACHINE_* values from the PE file header.
    private const ushort ImageFileMachineI386 = 0x014c;
    private const ushort ImageFileMachineAmd64 = 0x8664;

    // Layout constants: "MZ" sits at offset 0 of the DOS header, the 32-bit
    // offset of the PE header is stored at 0x3c, and the machine field follows
    // the 4-byte "PE\0\0" signature.
    private const int DosHeaderSignatureOffset = 0;
    private const int PeHeaderOffsetPointer = 0x3c;
    private const int PeSignatureSize = 4;
    private const int MachineOffsetFromPeHeader = PeSignatureSize;
    private const int MinimumHeaderSize = 0x40;

    /// <summary>Validates that a worker executable file has the required architecture.</summary>
    /// <param name="executablePath">Full path to the worker executable file.</param>
    /// <param name="requiredArchitecture">Required CPU architecture (x86 or x64).</param>
    /// <exception cref="WorkerProcessLaunchException">
    /// Thrown if the file is not a valid PE image, if
    /// <paramref name="requiredArchitecture"/> is unsupported, or if the
    /// executable architecture does not match the required architecture.
    /// </exception>
    public static void Validate(
        string executablePath,
        WorkerArchitecture requiredArchitecture)
    {
        ushort machine = ReadMachineType(executablePath);
        ushort expectedMachine = requiredArchitecture switch
        {
            WorkerArchitecture.X86 => ImageFileMachineI386,
            WorkerArchitecture.X64 => ImageFileMachineAmd64,
            _ => throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidExecutable,
                "Worker executable required architecture is unsupported."),
        };

        if (machine != expectedMachine)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidExecutable,
                $"Worker executable architecture does not match required {requiredArchitecture} architecture.");
        }
    }

    /// <summary>Reads the PE machine type from the executable header.</summary>
    /// <param name="executablePath">Full path to the executable file.</param>
    /// <returns>Machine type constant from PE header.</returns>
    /// <exception cref="WorkerProcessLaunchException">The file is too small or lacks the MZ/PE signatures.</exception>
    private static ushort ReadMachineType(string executablePath)
    {
        byte[] header = new byte[MinimumHeaderSize];
        using FileStream stream = File.OpenRead(executablePath);

        // Stream.Read may return fewer bytes than requested without being at
        // end of file; ReadAtLeast keeps reading, so a short count here really
        // means the file is too small.
        if (stream.ReadAtLeast(header, header.Length, throwOnEndOfStream: false) < header.Length)
        {
            throw InvalidExecutable("Worker executable is too small to contain a valid PE header.");
        }

        if (header[DosHeaderSignatureOffset] != 'M' || header[DosHeaderSignatureOffset + 1] != 'Z')
        {
            throw InvalidExecutable("Worker executable does not contain an MZ header.");
        }

        // A PE header cannot start inside the DOS header; this also rejects
        // negative offsets.
        int peHeaderOffset = BinaryPrimitives.ReadInt32LittleEndian(header.AsSpan(PeHeaderOffsetPointer, sizeof(int)));
        if (peHeaderOffset < MinimumHeaderSize)
        {
            throw InvalidExecutable("Worker executable PE header offset is invalid.");
        }

        byte[] peHeaderBytes = new byte[PeSignatureSize + sizeof(ushort)];
        stream.Position = peHeaderOffset;
        if (stream.ReadAtLeast(peHeaderBytes, peHeaderBytes.Length, throwOnEndOfStream: false) < peHeaderBytes.Length)
        {
            throw InvalidExecutable("Worker executable PE header is missing.");
        }

        if (peHeaderBytes[0] != 'P' || peHeaderBytes[1] != 'E' || peHeaderBytes[2] != 0 || peHeaderBytes[3] != 0)
        {
            throw InvalidExecutable("Worker executable does not contain a PE header.");
        }

        return BinaryPrimitives.ReadUInt16LittleEndian(
            peHeaderBytes.AsSpan(MachineOffsetFromPeHeader, sizeof(ushort)));
    }

    /// <summary>Builds the launch exception used for every malformed-executable case.</summary>
    private static WorkerProcessLaunchException InvalidExecutable(string message)
    {
        return new WorkerProcessLaunchException(
            WorkerProcessLaunchErrorCode.InvalidExecutable,
            message);
    }
}
@@ -0,0 +1,13 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Classifies violations reported through
/// <see cref="WorkerFrameProtocolException.ErrorCode"/>.
/// </summary>
public enum WorkerFrameProtocolErrorCode
{
/// <summary>Zero value. NOTE(review): no use is visible in this namespace; presumably an unspecified error.</summary>
Unknown = 0,
/// <summary>
/// <see cref="WorkerFrameProtocolOptions"/> was constructed with an empty
/// session id, a zero protocol version or a non-positive maximum message size.
/// </summary>
InvalidConfiguration = 1,
/// <summary>The stream ended before the expected number of frame bytes were read.</summary>
EndOfStream = 2,
/// <summary>The frame length prefix was zero.</summary>
MalformedLength = 3,
/// <summary>The frame length prefix exceeded <see cref="WorkerFrameProtocolOptions.MaxMessageBytes"/>.</summary>
MessageTooLarge = 4,
/// <summary>The payload was not a valid <c>WorkerEnvelope</c> protobuf message, or the envelope had no typed body.</summary>
InvalidEnvelope = 5,
/// <summary>The envelope protocol version differed from <see cref="WorkerFrameProtocolOptions.ProtocolVersion"/>.</summary>
ProtocolVersionMismatch = 6,
/// <summary>The envelope session id differed from <see cref="WorkerFrameProtocolOptions.SessionId"/>.</summary>
SessionMismatch = 7,
}
@@ -0,0 +1,40 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Signals a violation of the worker frame protocol; <see cref="ErrorCode"/>
/// identifies the kind of violation.
/// </summary>
public sealed class WorkerFrameProtocolException : Exception
{
    /// <summary>
    /// Creates the exception from an error code and a message.
    /// </summary>
    /// <param name="errorCode">Protocol error code indicating the violation type.</param>
    /// <param name="message">Human-readable error message.</param>
    public WorkerFrameProtocolException(WorkerFrameProtocolErrorCode errorCode, string message)
        : base(message)
        => ErrorCode = errorCode;

    /// <summary>
    /// Creates the exception from an error code, a message and the exception
    /// that triggered it.
    /// </summary>
    /// <param name="errorCode">Protocol error code indicating the violation type.</param>
    /// <param name="message">Human-readable error message.</param>
    /// <param name="innerException">Underlying exception that caused this protocol violation.</param>
    public WorkerFrameProtocolException(
        WorkerFrameProtocolErrorCode errorCode,
        string message,
        Exception innerException)
        : base(message, innerException)
        => ErrorCode = errorCode;

    /// <summary>
    /// Error code describing the protocol violation.
    /// </summary>
    public WorkerFrameProtocolErrorCode ErrorCode { get; }
}
@@ -0,0 +1,76 @@
using ZB.MOM.WW.MxGateway.Contracts;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Validated, immutable settings for one worker frame protocol connection.
/// </summary>
public sealed class WorkerFrameProtocolOptions
{
    /// <summary>Default maximum message size in bytes (16 MB).</summary>
    public const int DefaultMaxMessageBytes = 16 * 1024 * 1024;

    /// <summary>
    /// Creates options for a session using the contract's worker protocol
    /// version and <see cref="DefaultMaxMessageBytes"/>.
    /// </summary>
    /// <param name="sessionId">Identifier of the session.</param>
    public WorkerFrameProtocolOptions(string sessionId)
        : this(sessionId, GatewayContractInfo.WorkerProtocolVersion, DefaultMaxMessageBytes)
    {
    }

    /// <summary>
    /// Creates options with every value supplied explicitly.
    /// </summary>
    /// <param name="sessionId">Identifier of the session.</param>
    /// <param name="protocolVersion">Protocol version number.</param>
    /// <param name="maxMessageBytes">Maximum message size in bytes.</param>
    /// <exception cref="WorkerFrameProtocolException">
    /// <paramref name="sessionId"/> is null, empty or whitespace,
    /// <paramref name="protocolVersion"/> is zero, or
    /// <paramref name="maxMessageBytes"/> is not positive.
    /// </exception>
    public WorkerFrameProtocolOptions(
        string sessionId,
        uint protocolVersion,
        int maxMessageBytes)
    {
        // Evaluated in parameter order; the first failing rule is reported.
        string? problem =
            string.IsNullOrWhiteSpace(sessionId) ? "Worker frame protocol requires a session id."
            : protocolVersion == 0 ? "Worker frame protocol requires a non-zero protocol version."
            : maxMessageBytes <= 0 ? "Worker frame protocol max message size must be greater than zero."
            : null;

        if (problem is not null)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.InvalidConfiguration,
                problem);
        }

        SessionId = sessionId;
        ProtocolVersion = protocolVersion;
        MaxMessageBytes = maxMessageBytes;
    }

    /// <summary>
    /// Session identifier that envelopes must carry.
    /// </summary>
    public string SessionId { get; }

    /// <summary>
    /// Worker protocol version that envelopes must carry.
    /// </summary>
    public uint ProtocolVersion { get; }

    /// <summary>
    /// Largest accepted message size in bytes.
    /// </summary>
    public int MaxMessageBytes { get; }
}
@@ -0,0 +1,87 @@
using System.Buffers.Binary;
using Google.Protobuf;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Reads length-prefixed <see cref="WorkerEnvelope"/> protobuf messages from a
/// stream and validates each one against the frame protocol options.
/// </summary>
public sealed class WorkerFrameReader
{
    private readonly WorkerFrameProtocolOptions _options;
    private readonly Stream _stream;

    /// <summary>
    /// Creates a reader over <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Stream to read frames from.</param>
    /// <param name="options">Frame protocol options.</param>
    /// <exception cref="ArgumentNullException">Either argument is <see langword="null"/>.</exception>
    public WorkerFrameReader(
        Stream stream,
        WorkerFrameProtocolOptions options)
    {
        if (stream is null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        if (options is null)
        {
            throw new ArgumentNullException(nameof(options));
        }

        _stream = stream;
        _options = options;
    }

    /// <summary>
    /// Reads one frame: a 4-byte little-endian length prefix followed by that
    /// many payload bytes, parsed as a <see cref="WorkerEnvelope"/>.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Parsed worker envelope.</returns>
    /// <exception cref="WorkerFrameProtocolException">
    /// The stream ended early, the length was zero or too large, the payload
    /// was not a valid envelope, or envelope validation failed.
    /// </exception>
    public async ValueTask<WorkerEnvelope> ReadAsync(CancellationToken cancellationToken = default)
    {
        byte[] header = new byte[sizeof(uint)];
        await FillOrThrowAsync(header, cancellationToken).ConfigureAwait(false);

        uint payloadLength = BinaryPrimitives.ReadUInt32LittleEndian(header);
        EnsurePayloadLengthAllowed(payloadLength);

        byte[] body = new byte[payloadLength];
        await FillOrThrowAsync(body, cancellationToken).ConfigureAwait(false);

        WorkerEnvelope envelope = Decode(body);
        WorkerEnvelopeValidator.Validate(envelope, _options);
        return envelope;
    }

    /// <summary>Rejects a zero length and any length above the configured maximum.</summary>
    private void EnsurePayloadLengthAllowed(uint payloadLength)
    {
        if (payloadLength == 0)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.MalformedLength,
                "Worker frame payload length must be greater than zero.");
        }

        if (payloadLength > _options.MaxMessageBytes)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.MessageTooLarge,
                $"Worker frame payload length {payloadLength} exceeds the configured maximum of {_options.MaxMessageBytes} bytes.");
        }
    }

    /// <summary>Parses the payload, translating protobuf parse failures into protocol exceptions.</summary>
    private static WorkerEnvelope Decode(byte[] body)
    {
        try
        {
            return WorkerEnvelope.Parser.ParseFrom(body);
        }
        catch (InvalidProtocolBufferException parseFailure)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.InvalidEnvelope,
                "Worker frame payload is not a valid WorkerEnvelope protobuf message.",
                parseFailure);
        }
    }

    /// <summary>Fills <paramref name="buffer"/> completely or reports a premature end of stream.</summary>
    private async ValueTask FillOrThrowAsync(
        Memory<byte> buffer,
        CancellationToken cancellationToken)
    {
        try
        {
            await _stream.ReadExactlyAsync(buffer, cancellationToken).ConfigureAwait(false);
        }
        catch (EndOfStreamException truncated)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.EndOfStream,
                "Worker frame ended before the expected number of bytes were read.",
                truncated);
        }
    }
}
@@ -0,0 +1,61 @@
using System.Buffers.Binary;
using Google.Protobuf;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Writes length-prefixed WorkerEnvelope protobuf messages to a stream.
/// </summary>
public sealed class WorkerFrameWriter
{
    private readonly WorkerFrameProtocolOptions _options;
    private readonly Stream _stream;

    /// <summary>
    /// Initializes the writer with a stream and frame protocol options.
    /// </summary>
    /// <param name="stream">Stream to write frames to.</param>
    /// <param name="options">Frame protocol configuration.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public WorkerFrameWriter(
        Stream stream,
        WorkerFrameProtocolOptions options)
    {
        _stream = stream ?? throw new ArgumentNullException(nameof(stream));
        _options = options ?? throw new ArgumentNullException(nameof(options));
    }

    /// <summary>
    /// Validates the envelope and writes it to the stream as a single frame: a 4-byte
    /// little-endian payload length followed by the serialized envelope.
    /// </summary>
    /// <param name="envelope">Worker envelope message to write.</param>
    /// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
    /// <exception cref="ArgumentNullException"><paramref name="envelope"/> is null.</exception>
    /// <exception cref="WorkerFrameProtocolException">
    /// The envelope serializes to an empty payload or exceeds the configured maximum size.
    /// Envelope validation is delegated to WorkerEnvelopeValidator, which may also throw.
    /// </exception>
    public async ValueTask WriteAsync(
        WorkerEnvelope envelope,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(envelope);
        WorkerEnvelopeValidator.Validate(envelope, _options);

        // Size checks run before serialization so an oversized envelope is rejected
        // without allocating its payload.
        int payloadLength = envelope.CalculateSize();
        if (payloadLength == 0)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.InvalidEnvelope,
                "Worker envelope cannot serialize to an empty payload.");
        }

        if (payloadLength > _options.MaxMessageBytes)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.MessageTooLarge,
                $"Worker envelope payload length {payloadLength} exceeds the configured maximum of {_options.MaxMessageBytes} bytes.");
        }

        // Build prefix + payload in one buffer and issue a single write. Writing the prefix
        // and the payload separately leaves a window in which cancellation (or a failure)
        // after the first write puts a length prefix on the stream with no payload behind
        // it, which desynchronizes the reader for every later frame.
        byte[] frame = new byte[sizeof(uint) + payloadLength];
        BinaryPrimitives.WriteUInt32LittleEndian(frame, (uint)payloadLength);
        envelope.ToByteArray().CopyTo(frame, sizeof(uint));

        await _stream.WriteAsync(frame, cancellationToken).ConfigureAwait(false);
    }
}
@@ -0,0 +1,45 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Represents a worker process command line: the executable path plus its arguments.
/// </summary>
public sealed class WorkerProcessCommandLine
{
    /// <summary>
    /// Initializes a command line with executable path and arguments.
    /// </summary>
    /// <param name="executablePath">Path to the worker executable.</param>
    /// <param name="arguments">Command-line arguments, in order.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public WorkerProcessCommandLine(
        string executablePath,
        IReadOnlyList<string> arguments)
    {
        // Fail here rather than later inside ToString, where a null would surface as a
        // NullReferenceException far from the caller that supplied it.
        ArgumentNullException.ThrowIfNull(executablePath);
        ArgumentNullException.ThrowIfNull(arguments);

        ExecutablePath = executablePath;
        Arguments = arguments;
    }

    /// <summary>
    /// Gets the path to the worker executable.
    /// </summary>
    public string ExecutablePath { get; }

    /// <summary>
    /// Gets the command-line arguments.
    /// </summary>
    public IReadOnlyList<string> Arguments { get; }

    /// <summary>
    /// Renders the executable path and arguments as one space-separated line, quoting
    /// each token that would otherwise be ambiguous.
    /// </summary>
    /// <returns>The rendered command line.</returns>
    public override string ToString()
    {
        return string.Join(
            " ",
            new[] { Quote(ExecutablePath) }.Concat(Arguments.Select(Quote)));
    }

    // Quotes a token following the Windows argument-parsing convention: tokens that are
    // empty or contain whitespace or a double quote are wrapped in quotes, embedded quotes
    // are backslash-escaped, and backslash runs that precede a quote (including the closing
    // one) are doubled. Tokens needing none of that are returned unchanged.
    private static string Quote(string value)
    {
        bool needsQuoting = value.Length == 0
            || value.Any(static character => char.IsWhiteSpace(character) || character == '"');
        if (!needsQuoting)
        {
            return value;
        }

        System.Text.StringBuilder quoted = new(value.Length + 2);
        quoted.Append('"');

        int index = 0;
        while (index < value.Length)
        {
            char current = value[index++];
            if (current == '\\')
            {
                int backslashes = 1;
                while (index < value.Length && value[index] == '\\')
                {
                    index++;
                    backslashes++;
                }

                if (index == value.Length)
                {
                    // Run ends the token: double it so the closing quote is not escaped.
                    quoted.Append('\\', backslashes * 2);
                }
                else if (value[index] == '"')
                {
                    // Run precedes a literal quote: double it, then escape the quote.
                    quoted.Append('\\', (backslashes * 2) + 1);
                    quoted.Append('"');
                    index++;
                }
                else
                {
                    // Backslashes not followed by a quote are literal.
                    quoted.Append('\\', backslashes);
                }

                continue;
            }

            if (current == '"')
            {
                quoted.Append('\\');
            }

            quoted.Append(current);
        }

        quoted.Append('"');
        return quoted.ToString();
    }
}
@@ -0,0 +1,38 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Bundles a launched worker process with the command line it was started with and
/// its launch timestamp. Disposing the handle disposes the underlying process.
/// </summary>
/// <param name="process">The underlying worker process.</param>
/// <param name="commandLine">The command line and arguments used to launch the process.</param>
/// <param name="launchedAt">The time when the process was launched.</param>
public sealed class WorkerProcessHandle(
    IWorkerProcess process,
    WorkerProcessCommandLine commandLine,
    DateTimeOffset launchedAt) : IDisposable
{
    /// <summary>Gets the underlying worker process.</summary>
    public IWorkerProcess Process { get; } = process;

    /// <summary>Gets the process ID, captured from the process when the handle is created.</summary>
    public int ProcessId { get; } = process.Id;

    /// <summary>Gets the command line and arguments used to launch the process.</summary>
    public WorkerProcessCommandLine CommandLine { get; } = commandLine;

    /// <summary>Gets the time when the process was launched.</summary>
    public DateTimeOffset LaunchedAt { get; } = launchedAt;

    /// <inheritdoc />
    public void Dispose() => Process.Dispose();
}
@@ -0,0 +1,13 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Classifies why a worker process launch failed. Carried by
/// <see cref="WorkerProcessLaunchException"/>.
/// </summary>
public enum WorkerProcessLaunchErrorCode
{
/// <summary>Unspecified failure; this is the enum's default (zero) value.</summary>
Unknown = 0,
/// <summary>The launch request is missing a session id, pipe name, or nonce, or has a zero protocol version.</summary>
InvalidRequest = 1,
/// <summary>The configured worker executable file does not exist.</summary>
ExecutableNotFound = 2,
/// <summary>The configured worker executable path is not a valid filesystem path or does not point to a .exe file.</summary>
InvalidExecutable = 3,
/// <summary>The configured working directory is not a valid filesystem path or does not exist.</summary>
InvalidWorkingDirectory = 4,
/// <summary>The process factory threw while starting the worker process.</summary>
StartFailed = 5,
/// <summary>The worker did not complete startup before the configured startup timeout.</summary>
StartupTimeout = 6,
/// <summary>The worker started but failed during startup, for example by exiting before startup completed.</summary>
StartupFailed = 7,
}
@@ -0,0 +1,31 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Exception describing a failed worker process launch, tagged with a
/// <see cref="WorkerProcessLaunchErrorCode"/> identifying the kind of failure.
/// </summary>
public sealed class WorkerProcessLaunchException : Exception
{
    /// <summary>Creates the exception from an error code and a diagnostic message.</summary>
    /// <param name="errorCode">Error code for the worker process launch failure.</param>
    /// <param name="message">Diagnostic message.</param>
    public WorkerProcessLaunchException(WorkerProcessLaunchErrorCode errorCode, string message)
        : base(message) => ErrorCode = errorCode;

    /// <summary>Creates the exception from an error code, a diagnostic message, and the underlying cause.</summary>
    /// <param name="errorCode">Error code for the worker process launch failure.</param>
    /// <param name="message">Diagnostic message.</param>
    /// <param name="innerException">Underlying exception.</param>
    public WorkerProcessLaunchException(
        WorkerProcessLaunchErrorCode errorCode,
        string message,
        Exception innerException)
        : base(message, innerException) => ErrorCode = errorCode;

    /// <summary>Gets the error code for the worker process launch failure.</summary>
    public WorkerProcessLaunchErrorCode ErrorCode { get; }
}
@@ -0,0 +1,8 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Describes one worker process launch: the values handed to the worker plus an
/// optional pipe reservation owned by the launch.
/// </summary>
/// <param name="SessionId">Session identifier; passed to the worker as the <c>--session-id</c> argument. Must not be blank.</param>
/// <param name="PipeName">Pipe name; passed to the worker as the <c>--pipe-name</c> argument. Must not be blank.</param>
/// <param name="ProtocolVersion">Worker protocol version; passed as the <c>--protocol-version</c> argument. Must be non-zero.</param>
/// <param name="Nonce">Nonce handed to the worker through an environment variable rather than the command line. Must not be blank.</param>
/// <param name="PipeReservation">
/// Optional disposable that the launcher disposes when the launch fails.
/// NOTE(review): presumably holds the reservation for <paramref name="PipeName"/> — confirm against the caller.
/// </param>
public sealed record WorkerProcessLaunchRequest(
string SessionId,
string PipeName,
uint ProtocolVersion,
string Nonce,
IDisposable? PipeReservation = null);
@@ -0,0 +1,336 @@
using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Metrics;
using Polly;
using Polly.Retry;
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Launches worker processes with startup probing and error handling.
/// </summary>
/// <remarks>
/// A launch validates the request, resolves the configured executable and working
/// directory, starts the process through the process factory, and then runs the startup
/// probe (with retries) under the configured startup timeout. Any failure after the
/// process has started kills and disposes it before the failure is reported.
/// </remarks>
public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
{
    /// <summary>Environment variable for worker nonce.</summary>
    public const string WorkerNonceEnvironmentVariableName = "MXGATEWAY_WORKER_NONCE";

    /// <summary>Environment variable for worker pipe connect attempt timeout.</summary>
    public const string WorkerPipeConnectAttemptTimeoutEnvironmentVariableName =
        "MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS";

    private readonly IWorkerProcessFactory _processFactory;
    private readonly IWorkerStartupProbe _startupProbe;
    private readonly GatewayMetrics _metrics;
    private readonly TimeProvider _timeProvider;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<WorkerProcessLauncher> _logger;

    /// <summary>
    /// Initializes the worker process launcher with gateway options and dependencies.
    /// </summary>
    /// <param name="gatewayOptions">Gateway configuration options; only the worker section is used.</param>
    /// <param name="processFactory">Factory for creating worker processes.</param>
    /// <param name="startupProbe">Probe for checking worker startup completion.</param>
    /// <param name="metrics">Gateway metrics collector.</param>
    /// <param name="logger">Optional logger for diagnostic output; a no-op logger is used when omitted.</param>
    /// <param name="timeProvider">Optional time provider for timestamps; the system provider is used when omitted.</param>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public WorkerProcessLauncher(
        IOptions<GatewayOptions> gatewayOptions,
        IWorkerProcessFactory processFactory,
        IWorkerStartupProbe startupProbe,
        GatewayMetrics metrics,
        ILogger<WorkerProcessLauncher>? logger = null,
        TimeProvider? timeProvider = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        ArgumentNullException.ThrowIfNull(processFactory);
        ArgumentNullException.ThrowIfNull(startupProbe);
        ArgumentNullException.ThrowIfNull(metrics);

        _workerOptions = gatewayOptions.Value.Worker;
        _processFactory = processFactory;
        _startupProbe = startupProbe;
        _metrics = metrics;
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? NullLogger<WorkerProcessLauncher>.Instance;
    }

    /// <summary>
    /// Launches a worker process and waits for startup.
    /// </summary>
    /// <param name="request">Request payload.</param>
    /// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
    /// <returns>Handle to the launched worker process.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="WorkerProcessLaunchException">
    /// The request or configuration is invalid, the process could not be started, or startup
    /// failed or timed out.
    /// </exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled during startup.</exception>
    public async Task<WorkerProcessHandle> LaunchAsync(
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken = default)
    {
        // Guard before the try: with a null request the cleanup handler below would itself
        // dereference null and hide the real problem.
        ArgumentNullException.ThrowIfNull(request);

        try
        {
            return await LaunchCoreAsync(request, cancellationToken).ConfigureAwait(false);
        }
        catch
        {
            // On any failure, dispose the request's pipe reservation (if one was supplied).
            request.PipeReservation?.Dispose();
            throw;
        }
    }

    // Performs the launch: validate, build start info, start, then probe under a timeout.
    private async Task<WorkerProcessHandle> LaunchCoreAsync(
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken)
    {
        ValidateRequest(request);

        DateTimeOffset startedAt = _timeProvider.GetUtcNow();
        ProcessStartInfo startInfo = CreateStartInfo(request, out WorkerProcessCommandLine commandLine);

        IWorkerProcess process;
        try
        {
            process = _processFactory.Start(startInfo);
        }
        catch (Exception exception) when (exception is not WorkerProcessLaunchException)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.StartFailed,
                "Worker process failed to start.",
                exception);
        }

        try
        {
            // The linked source fires on either caller cancellation or the startup timeout;
            // the catch filters below tell the two apart via the caller's token.
            using CancellationTokenSource startupTimeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
            startupTimeout.CancelAfter(TimeSpan.FromSeconds(_workerOptions.StartupTimeoutSeconds));

            await CreateStartupProbePipeline(process)
                .ExecuteAsync(
                    async token =>
                    {
                        await _startupProbe
                            .WaitUntilReadyAsync(process, request, token)
                            .ConfigureAwait(false);
                    },
                    startupTimeout.Token)
                .ConfigureAwait(false);

            return new WorkerProcessHandle(process, commandLine, startedAt);
        }
        catch (OperationCanceledException exception) when (!cancellationToken.IsCancellationRequested)
        {
            // Cancelled, but not by the caller: the startup timeout elapsed.
            KillAndDispose(process, "StartupTimeout");
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.StartupTimeout,
                "Worker process did not complete startup before the configured timeout.",
                exception);
        }
        catch (OperationCanceledException)
        {
            KillAndDispose(process, "LaunchCanceled");
            throw;
        }
        catch (Exception exception) when (exception is not WorkerProcessLaunchException)
        {
            KillAndDispose(process, "StartupFailed");
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.StartupFailed,
                "Worker process failed during startup.",
                exception);
        }
        catch (WorkerProcessLaunchException)
        {
            KillAndDispose(process, "StartupFailed");
            throw;
        }
    }

    // Builds the ProcessStartInfo for the worker and the matching command line record.
    // Session id, pipe name, and protocol version travel as arguments; the nonce and the
    // pipe connect timeout travel as environment variables.
    private ProcessStartInfo CreateStartInfo(
        WorkerProcessLaunchRequest request,
        out WorkerProcessCommandLine commandLine)
    {
        string executablePath = ResolveExecutablePath();
        string workingDirectory = ResolveWorkingDirectory(executablePath);
        string[] arguments =
        [
            "--session-id",
            request.SessionId,
            "--pipe-name",
            request.PipeName,
            "--protocol-version",
            request.ProtocolVersion.ToString(System.Globalization.CultureInfo.InvariantCulture),
        ];

        ProcessStartInfo startInfo = new()
        {
            FileName = executablePath,
            WorkingDirectory = workingDirectory,
            UseShellExecute = false,
            CreateNoWindow = true,
            ErrorDialog = false,
        };

        foreach (string argument in arguments)
        {
            startInfo.ArgumentList.Add(argument);
        }

        startInfo.Environment[WorkerNonceEnvironmentVariableName] = request.Nonce;
        startInfo.Environment[WorkerPipeConnectAttemptTimeoutEnvironmentVariableName] =
            _workerOptions.PipeConnectAttemptTimeoutMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture);

        commandLine = new WorkerProcessCommandLine(executablePath, arguments);
        return startInfo;
    }

    // Resolves the configured executable to a full path and checks that it is a .exe,
    // exists, and passes the executable validator.
    private string ResolveExecutablePath()
    {
        string executablePath;
        try
        {
            executablePath = Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException or NotSupportedException or PathTooLongException)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidExecutable,
                "Worker executable path is not a valid filesystem path.",
                exception);
        }

        if (!string.Equals(Path.GetExtension(executablePath), ".exe", StringComparison.OrdinalIgnoreCase))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidExecutable,
                "Worker executable path must point to a .exe file.");
        }

        if (!File.Exists(executablePath))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.ExecutableNotFound,
                "Worker executable does not exist.");
        }

        WorkerExecutableValidator.Validate(executablePath, _workerOptions.RequiredArchitecture);
        return executablePath;
    }

    // Resolves the working directory: the configured one when set (it must exist),
    // otherwise the executable's directory, falling back to the current directory.
    private string ResolveWorkingDirectory(string executablePath)
    {
        if (string.IsNullOrWhiteSpace(_workerOptions.WorkingDirectory))
        {
            return Path.GetDirectoryName(executablePath) ?? Environment.CurrentDirectory;
        }

        string workingDirectory;
        try
        {
            workingDirectory = Path.GetFullPath(_workerOptions.WorkingDirectory);
        }
        catch (Exception exception) when (exception is ArgumentException or NotSupportedException or PathTooLongException)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidWorkingDirectory,
                "Worker working directory is not a valid filesystem path.",
                exception);
        }

        if (!Directory.Exists(workingDirectory))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidWorkingDirectory,
                "Worker working directory does not exist.");
        }

        return workingDirectory;
    }

    // Kills the process (and its process tree) if it is still running, records the kill
    // metric, and always disposes the process.
    private void KillAndDispose(IWorkerProcess process, string reason)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
                _metrics.WorkerKilled(reason);
            }
        }
        catch (Exception killException)
        {
            // This runs inside catch handlers that are about to report the real launch
            // failure. Letting a kill failure escape would replace that failure with a
            // less useful one, so log it and let the original exception surface.
            _logger.LogWarning(
                killException,
                "Failed to kill worker process during launch cleanup. Reason {Reason}.",
                reason);
        }
        finally
        {
            process.Dispose();
        }
    }

    // Builds the retry pipeline for the startup probe. StartupProbeRetryAttempts counts
    // total attempts, so the number of retries is one less.
    private ResiliencePipeline CreateStartupProbePipeline(IWorkerProcess process)
    {
        int maxRetryAttempts = Math.Max(0, _workerOptions.StartupProbeRetryAttempts - 1);
        if (maxRetryAttempts == 0)
        {
            // A single attempt needs no retry strategy. Polly's retry options validation is
            // understood to reject a retry count below 1, which would turn a "try once"
            // configuration into a launch failure; the empty pipeline runs the probe once.
            return ResiliencePipeline.Empty;
        }

        RetryStrategyOptions retryOptions = new()
        {
            MaxRetryAttempts = maxRetryAttempts,
            BackoffType = DelayBackoffType.Exponential,
            UseJitter = true,
            Delay = TimeSpan.FromMilliseconds(_workerOptions.StartupProbeRetryDelayMilliseconds),
            MaxDelay = TimeSpan.FromSeconds(2),
            ShouldHandle = new PredicateBuilder().Handle<Exception>(exception =>
                ShouldRetryStartupProbe(exception, process)),
            OnRetry = args =>
            {
                _metrics.RetryAttempted("worker_startup");
                _logger.LogDebug(
                    args.Outcome.Exception,
                    "Retrying worker startup probe after transient failure. Attempt {Attempt}.",
                    args.AttemptNumber + 1);
                return default;
            },
        };

        return new ResiliencePipelineBuilder()
            .AddRetry(retryOptions)
            .Build();
    }

    // A probe failure is retried only while the process is still alive; cancellation and
    // explicit launch failures are never retried.
    private static bool ShouldRetryStartupProbe(Exception exception, IWorkerProcess process)
    {
        if (exception is OperationCanceledException or WorkerProcessLaunchException)
        {
            return false;
        }

        return !process.HasExited;
    }

    // Rejects requests that are missing any value the worker needs to start.
    private static void ValidateRequest(WorkerProcessLaunchRequest request)
    {
        if (string.IsNullOrWhiteSpace(request.SessionId))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a session id.");
        }

        if (string.IsNullOrWhiteSpace(request.PipeName))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a pipe name.");
        }

        if (request.ProtocolVersion == 0)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a non-zero protocol version.");
        }

        if (string.IsNullOrWhiteSpace(request.Nonce))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a nonce.");
        }
    }
}
@@ -0,0 +1,24 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>
/// Startup probe that treats a worker as ready as long as its process has not exited.
/// </summary>
public sealed class WorkerProcessStartedProbe : IWorkerStartupProbe
{
    /// <summary>Checks that the worker process is still running.</summary>
    /// <param name="process">Worker process to verify.</param>
    /// <param name="request">Process launch request; not used by this probe.</param>
    /// <param name="cancellationToken">Cancellation token; not observed by this probe.</param>
    /// <returns>An already-completed task when the process is running.</returns>
    /// <exception cref="WorkerProcessLaunchException">The process has already exited.</exception>
    public Task WaitUntilReadyAsync(
        IWorkerProcess process,
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken)
    {
        if (!process.HasExited)
        {
            return Task.CompletedTask;
        }

        // Thrown synchronously (not as a faulted task), matching the probe's existing contract.
        string diagnostic = $"Worker process exited before startup completed with exit code {process.ExitCode}.";
        throw new WorkerProcessLaunchException(
            WorkerProcessLaunchErrorCode.StartupFailed,
            diagnostic);
    }
}
@@ -0,0 +1,23 @@
namespace ZB.MOM.WW.MxGateway.Server.Workers;
/// <summary>Dependency-injection registration helpers for worker process management.</summary>
public static class WorkerServiceCollectionExtensions
{
    /// <summary>
    /// Registers the worker process factory, startup probe, and launcher, plus the
    /// services that clean up orphaned workers at startup.
    /// </summary>
    /// <param name="services">Service collection to register services.</param>
    /// <returns>The same <paramref name="services"/> instance, to allow chaining.</returns>
    public static IServiceCollection AddWorkerProcessLauncher(this IServiceCollection services)
    {
        // Launch pipeline: process factory, startup probe, launcher.
        services
            .AddSingleton<IWorkerProcessFactory, SystemWorkerProcessFactory>()
            .AddSingleton<IWorkerStartupProbe, WorkerProcessStartedProbe>()
            .AddSingleton<IWorkerProcessLauncher, WorkerProcessLauncher>();

        // Terminate workers leaked by a previous unclean gateway run before the
        // server accepts sessions. Registered ahead of AddGatewaySessions so the
        // cleanup hosted service starts before the session subsystem.
        services
            .AddSingleton<IRunningProcessInspector, SystemRunningProcessInspector>()
            .AddSingleton<OrphanWorkerTerminator>()
            .AddHostedService<OrphanWorkerCleanupHostedService>();

        return services;
    }
}