rename: prefix gateway projects/namespaces with ZB.MOM.WW + sln→slnx
Apply the ZB.MOM.WW. prefix to all gateway-side projects, folders,
.csproj/.sln contents, C# namespaces, using directives, generated proto
C# (csharp_namespace + checked-in generated files), InternalsVisibleTo
attributes, project-name string literals (LoadProject, .sln lookups,
worker exe paths, staticwebassets manifest), and the install/script/doc
references that point at any of the above. Migrate the solution from
.sln to .slnx via `dotnet sln migrate` and delete the old file.
External-runtime identifiers are intentionally NOT prefixed so external
configuration keeps working:
- GatewayMetrics.cs MeterName ("MxGateway.Server")
- DashboardAuthenticationDefaults Scheme/Policy ("MxGateway.Dashboard")
- GatewayRequestLoggingMiddleware logger category ("MxGateway.Request")
- StaRuntime thread name ("MxGateway.Worker.STA")
- appsettings.json root section "MxGateway" + env-var prefix
MxGateway__... and secret-name MxGateway:ApiKeyPepper
- C:\ProgramData\MxGateway\ data dir paths
Also fixes two tests that were not rename-related but became visible
while validating the rename:
- WorkerLiveMxAccessSmokeTests.ShutDownAsync: cancellation that the
gateway service correctly maps to RpcException(Cancelled) per gRPC
convention was being misclassified as a stream fault. Added a sibling
catch on RpcException with StatusCode.Cancelled.
- IntegrationTestEnvironment.ResolveRepositoryRoot: extracted IsRepositoryRoot
and made it accept either a .git marker OR a .sln/.slnx next to src/
so the worker-exe walker works in non-git working copies.
clients/proto/proto-inputs.json's protoRoot updated to point at
src/ZB.MOM.WW.MxGateway.Contracts/Protos.
Verified by `dotnet build` and a full `dotnet test` of the .slnx with
MXGATEWAY_RUN_LIVE_{MXACCESS,LDAP,GALAXY}_TESTS=1:
Tests: 472/472 pass
Worker.Tests: 280/280 pass (4 dev-rig [Fact(Skip=...)] skipped)
IntegrationTests: 18/18 pass
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,29 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Seam between the orphan-worker cleanup logic and operating-system process
/// enumeration/termination. It exists so that logic can be unit-tested
/// without spawning real processes.
/// </summary>
public interface IRunningProcessInspector
{
    /// <summary>
    /// Lists the processes that are running right now and whose image name
    /// equals <paramref name="processName"/>. The name is compared without
    /// the <c>.exe</c> extension.
    /// </summary>
    /// <param name="processName">Extension-less process image name to look for.</param>
    /// <returns>One entry for each matching running process.</returns>
    IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName);

    /// <summary>Force-terminates the process identified by <paramref name="processId"/>.</summary>
    /// <param name="processId">Identifier of the process that should be ended.</param>
    void Kill(int processId);
}

/// <summary>Identifies one running process that is a cleanup candidate.</summary>
/// <param name="ProcessId">Process identifier assigned by the operating system.</param>
/// <param name="ExecutablePath">
/// Fully-qualified path of the process main module. It is
/// <see langword="null"/> when the path could not be read, for example
/// because access was denied.
/// </param>
public sealed record RunningProcessInfo(int ProcessId, string? ExecutablePath);
|
||||
@@ -0,0 +1,45 @@
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Handles the communication with one worker process over a named pipe.
/// </summary>
public interface IWorkerClient : IAsyncDisposable
{
    /// <summary>Gets the unique session identifier for this worker.</summary>
    string SessionId { get; }

    /// <summary>
    /// Gets the worker's process ID; <see langword="null"/> until the
    /// handshake has completed.
    /// </summary>
    int? ProcessId { get; }

    /// <summary>Gets the current state of the worker connection.</summary>
    WorkerClientState State { get; }

    /// <summary>Gets the UTC time at which the latest heartbeat arrived from the worker.</summary>
    DateTimeOffset LastHeartbeatAt { get; }

    /// <summary>Starts the handshake and moves the client into the ready state.</summary>
    /// <param name="cancellationToken">Token that cancels the asynchronous operation.</param>
    /// <returns>A task representing the asynchronous start.</returns>
    Task StartAsync(CancellationToken cancellationToken);

    /// <summary>Sends <paramref name="command"/> to the worker and waits for its reply.</summary>
    /// <param name="command">The worker command to invoke.</param>
    /// <param name="timeout">How long to wait for the reply.</param>
    /// <param name="cancellationToken">Token that cancels the asynchronous operation.</param>
    /// <returns>A task that yields the worker's reply to the command.</returns>
    Task<WorkerCommandReply> InvokeAsync(WorkerCommand command, TimeSpan timeout, CancellationToken cancellationToken);

    /// <summary>Reads worker events as they arrive.</summary>
    /// <param name="cancellationToken">Token that cancels the asynchronous operation.</param>
    /// <returns>An asynchronous sequence of worker events.</returns>
    IAsyncEnumerable<WorkerEvent> ReadEventsAsync(CancellationToken cancellationToken);

    /// <summary>Shuts the worker down gracefully by closing the connection.</summary>
    /// <param name="timeout">Time allowed for the shutdown.</param>
    /// <param name="cancellationToken">Token that cancels the asynchronous operation.</param>
    /// <returns>A task representing the asynchronous shutdown.</returns>
    Task ShutdownAsync(TimeSpan timeout, CancellationToken cancellationToken);

    /// <summary>Terminates the worker process immediately, recording a diagnostic reason.</summary>
    /// <param name="reason">Why the worker is being terminated.</param>
    void Kill(string reason);
}
|
||||
@@ -0,0 +1,34 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>Abstraction over a worker process: its lifecycle and its exit code.</summary>
public interface IWorkerProcess : IDisposable
{
    /// <summary>Gets the process ID.</summary>
    int Id { get; }

    /// <summary>Gets a value indicating whether the process has exited.</summary>
    bool HasExited { get; }

    /// <summary>
    /// Gets the exit code when the process has exited; otherwise
    /// <see langword="null"/>.
    /// </summary>
    int? ExitCode { get; }

    /// <summary>Waits for the process to exit, observing the given cancellation token.</summary>
    /// <param name="cancellationToken">Token that cancels the asynchronous wait.</param>
    /// <returns>A <see cref="ValueTask"/> representing the asynchronous wait.</returns>
    ValueTask WaitForExitAsync(CancellationToken cancellationToken);

    /// <summary>Kills the process and, if requested, its whole process tree.</summary>
    /// <param name="entireProcessTree">
    /// <see langword="true"/> to terminate all child processes as well;
    /// <see langword="false"/> to terminate only this process.
    /// </param>
    void Kill(bool entireProcessTree);
}
|
||||
@@ -0,0 +1,12 @@
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>Creates and starts worker processes.</summary>
public interface IWorkerProcessFactory
{
    /// <summary>Starts a worker process described by <paramref name="startInfo"/>.</summary>
    /// <param name="startInfo">Start configuration for the process.</param>
    /// <returns>The worker process that was started.</returns>
    IWorkerProcess Start(ProcessStartInfo startInfo);
}
|
||||
@@ -0,0 +1,12 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>Launches worker processes from a launch request.</summary>
public interface IWorkerProcessLauncher
{
    /// <summary>Launches a new worker process configured by <paramref name="request"/>.</summary>
    /// <param name="request">The launch request describing the worker to start.</param>
    /// <param name="cancellationToken">Token that cancels the launch.</param>
    /// <returns>A task that yields the handle of the launched worker process.</returns>
    Task<WorkerProcessHandle> LaunchAsync(WorkerProcessLaunchRequest request, CancellationToken cancellationToken = default);
}
|
||||
@@ -0,0 +1,16 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>Probe that waits for a worker process to become ready.</summary>
public interface IWorkerStartupProbe
{
    /// <summary>Asynchronously waits until the worker process reaches a ready state.</summary>
    /// <param name="process">The worker process being probed.</param>
    /// <param name="request">The launch request the worker was started from.</param>
    /// <param name="cancellationToken">Token that cancels the wait.</param>
    /// <returns>A task representing the asynchronous wait.</returns>
    Task WaitUntilReadyAsync(IWorkerProcess process, WorkerProcessLaunchRequest request, CancellationToken cancellationToken);
}
|
||||
@@ -0,0 +1,30 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Hosted service that runs the orphan-worker cleanup a single time when the
/// gateway starts, so leftover MXAccess worker processes are terminated
/// before the server begins accepting sessions.
/// </summary>
public sealed class OrphanWorkerCleanupHostedService(
    OrphanWorkerTerminator terminator,
    ILogger<OrphanWorkerCleanupHostedService> logger) : IHostedService
{
    /// <inheritdoc />
    public Task StartAsync(CancellationToken cancellationToken)
    {
        RunCleanupBestEffort();
        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task StopAsync(CancellationToken cancellationToken)
    {
        return Task.CompletedTask;
    }

    /// <summary>
    /// Invokes the terminator and downgrades any failure to a warning:
    /// cleanup is best-effort and must never keep the gateway from starting.
    /// </summary>
    private void RunCleanupBestEffort()
    {
        try
        {
            terminator.TerminateOrphans();
        }
        catch (Exception cleanupFailure)
        {
            logger.LogWarning(cleanupFailure, "Orphan worker cleanup failed on startup.");
        }
    }
}
|
||||
@@ -0,0 +1,138 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.MxGateway.Server.Configuration;
|
||||
using ZB.MOM.WW.MxGateway.Server.Metrics;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Kills MXAccess worker processes that were left behind by an earlier
/// gateway run.
/// <para>
/// <c>gateway.md</c> ("first version should terminate orphaned workers on
/// startup") and CLAUDE.md state that a restarted gateway does not reattach
/// old workers. After an unclean gateway crash the x86 worker processes —
/// each holding an MXAccess COM instance on an STA — survive indefinitely,
/// so this type finds them by executable name/path and kills them before the
/// restarted gateway accepts sessions.
/// </para>
/// </summary>
public sealed class OrphanWorkerTerminator
{
    /// <summary>Image file name used when the configured worker path cannot be resolved.</summary>
    private const string DefaultWorkerImageFileName = "ZB.MOM.WW.MxGateway.Worker.exe";

    private readonly IRunningProcessInspector _inspector;
    private readonly GatewayMetrics _metrics;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<OrphanWorkerTerminator> _logger;

    /// <summary>Creates a terminator bound to the gateway's worker configuration.</summary>
    /// <param name="gatewayOptions">Gateway options; the worker section supplies the executable path.</param>
    /// <param name="inspector">Inspector used to enumerate and kill processes.</param>
    /// <param name="metrics">Metrics sink that records each killed worker.</param>
    /// <param name="logger">Logger for diagnostics; a no-op logger is used when omitted.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="gatewayOptions"/>, <paramref name="inspector"/> or
    /// <paramref name="metrics"/> is <see langword="null"/>.
    /// </exception>
    public OrphanWorkerTerminator(
        IOptions<GatewayOptions> gatewayOptions,
        IRunningProcessInspector inspector,
        GatewayMetrics metrics,
        ILogger<OrphanWorkerTerminator>? logger = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        ArgumentNullException.ThrowIfNull(inspector);
        ArgumentNullException.ThrowIfNull(metrics);

        _inspector = inspector;
        _metrics = metrics;
        _workerOptions = gatewayOptions.Value.Worker;
        _logger = logger is null ? NullLogger<OrphanWorkerTerminator>.Instance : logger;
    }

    /// <summary>
    /// Locates every leftover worker process and kills it. Intended to be
    /// called once at startup, before any session-owned worker is launched.
    /// </summary>
    /// <returns>How many orphan worker processes were terminated.</returns>
    public int TerminateOrphans()
    {
        string? workerPath = ResolveConfiguredExecutablePath();
        string imageName = ResolveProcessName(workerPath);
        int ownProcessId = Environment.ProcessId;
        int killCount = 0;

        foreach (RunningProcessInfo candidate in _inspector.GetProcessesByName(imageName))
        {
            // Never kill ourselves, and leave alone anything that does not
            // qualify as one of our workers.
            bool isSelf = candidate.ProcessId == ownProcessId;
            if (isSelf || !IsOrphanWorker(candidate, workerPath))
            {
                continue;
            }

            try
            {
                _inspector.Kill(candidate.ProcessId);
                _metrics.WorkerKilled("OrphanStartupCleanup");
                killCount++;
                _logger.LogWarning(
                    "Terminated orphan worker process {ProcessId} ({ExecutablePath}) left over from a previous gateway run.",
                    candidate.ProcessId,
                    candidate.ExecutablePath ?? imageName);
            }
            catch (Exception killFailure)
            {
                // The candidate may already be gone or may be inaccessible;
                // one failed kill must not stop gateway startup.
                _logger.LogWarning(
                    killFailure,
                    "Failed to terminate orphan worker process {ProcessId}.",
                    candidate.ProcessId);
            }
        }

        if (killCount > 0)
        {
            _logger.LogInformation("Terminated {Count} orphan worker process(es) on startup.", killCount);
        }

        return killCount;
    }

    /// <summary>Decides whether a name-matched process should be treated as an orphan worker.</summary>
    /// <param name="candidate">Process whose image name already matched.</param>
    /// <param name="configuredPath">Resolved worker path, or <see langword="null"/> when unavailable.</param>
    /// <returns><see langword="true"/> when the candidate should be killed.</returns>
    private static bool IsOrphanWorker(RunningProcessInfo candidate, string? configuredPath)
    {
        // No readable path: the x64 gateway could not introspect the module,
        // which is the expected case for the x86 worker. The image-name match
        // is then the only available signal, so the candidate counts as an
        // orphan.
        if (candidate.ExecutablePath is null)
        {
            return true;
        }

        // Readable path: demand an exact (case-insensitive) match with the
        // configured worker path so unrelated processes that merely share the
        // image name are never killed.
        return configuredPath is not null
            && string.Equals(candidate.ExecutablePath, configuredPath, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Resolves the configured worker executable to a full path.</summary>
    /// <returns>
    /// The full path, or <see langword="null"/> (after logging a warning)
    /// when the configured value is not a valid filesystem path.
    /// </returns>
    private string? ResolveConfiguredExecutablePath()
    {
        try
        {
            return Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception invalidPath) when (
            invalidPath is ArgumentException or NotSupportedException or PathTooLongException)
        {
            _logger.LogWarning(
                invalidPath,
                "Configured worker executable path '{ExecutablePath}' is not a valid filesystem path; "
                + "orphan cleanup will match by image name only.",
                _workerOptions.ExecutablePath);
            return null;
        }
    }

    /// <summary>
    /// Derives the extension-less process image name from the configured
    /// path, falling back to the default worker file name.
    /// </summary>
    /// <param name="configuredPath">Resolved worker path, or <see langword="null"/>.</param>
    /// <returns>The image name to enumerate processes by.</returns>
    private static string ResolveProcessName(string? configuredPath) =>
        Path.GetFileNameWithoutExtension(configuredPath ?? DefaultWorkerImageFileName);
}
|
||||
@@ -0,0 +1,55 @@
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Implementation of <see cref="IRunningProcessInspector"/> that delegates to
/// <see cref="Process"/>.
/// </summary>
public sealed class SystemRunningProcessInspector : IRunningProcessInspector
{
    /// <inheritdoc />
    public IReadOnlyList<RunningProcessInfo> GetProcessesByName(string processName)
    {
        List<RunningProcessInfo> snapshot = [];
        Process[] matches = Process.GetProcessesByName(processName);

        try
        {
            for (int index = 0; index < matches.Length; index++)
            {
                Process match = matches[index];
                int id = match.Id;
                string? modulePath = TryGetExecutablePath(match);
                snapshot.Add(new RunningProcessInfo(id, modulePath));
            }
        }
        finally
        {
            // Release every Process object, even when building the snapshot
            // was interrupted by an exception.
            for (int index = 0; index < matches.Length; index++)
            {
                matches[index].Dispose();
            }
        }

        return snapshot;
    }

    /// <inheritdoc />
    public void Kill(int processId)
    {
        Process target = Process.GetProcessById(processId);
        try
        {
            target.Kill(entireProcessTree: true);
        }
        finally
        {
            target.Dispose();
        }
    }

    /// <summary>Reads the main-module file name of a process, tolerating denied access.</summary>
    /// <param name="process">Process to inspect.</param>
    /// <returns>The main-module path, or <see langword="null"/> when it cannot be read.</returns>
    private static string? TryGetExecutablePath(Process process)
    {
        // Main-module access can be denied, e.g. a 64-bit gateway querying a
        // 32-bit worker, or a process owned by another user. Those cases all
        // collapse to "path unknown".
        try
        {
            ProcessModule? mainModule = process.MainModule;
            return mainModule is null ? null : mainModule.FileName;
        }
        catch (InvalidOperationException)
        {
            return null;
        }
        catch (System.ComponentModel.Win32Exception)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Adapter that exposes a <see cref="Process"/> through the
/// <see cref="IWorkerProcess"/> abstraction.
/// </summary>
internal sealed class SystemWorkerProcess(Process process) : IWorkerProcess
{
    /// <inheritdoc />
    public int Id
    {
        get { return process.Id; }
    }

    /// <inheritdoc />
    public bool HasExited
    {
        get { return process.HasExited; }
    }

    /// <inheritdoc />
    public int? ExitCode
    {
        get
        {
            // The exit code is only read once the process has exited.
            if (!process.HasExited)
            {
                return null;
            }

            return process.ExitCode;
        }
    }

    /// <inheritdoc />
    public async ValueTask WaitForExitAsync(CancellationToken cancellationToken) =>
        await process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);

    /// <inheritdoc />
    public void Kill(bool entireProcessTree) => process.Kill(entireProcessTree);

    /// <inheritdoc />
    public void Dispose() => process.Dispose();
}
|
||||
@@ -0,0 +1,26 @@
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Factory that starts real operating-system processes for workers and wraps
/// them as <see cref="IWorkerProcess"/> instances.
/// </summary>
public sealed class SystemWorkerProcessFactory : IWorkerProcessFactory
{
    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="startInfo"/> is <see langword="null"/>.</exception>
    /// <exception cref="InvalidOperationException">The process reported that it did not start.</exception>
    public IWorkerProcess Start(ProcessStartInfo startInfo)
    {
        ArgumentNullException.ThrowIfNull(startInfo);

        Process process = new()
        {
            StartInfo = startInfo,
        };

        try
        {
            if (!process.Start())
            {
                throw new InvalidOperationException("Worker process failed to start.");
            }

            return new SystemWorkerProcess(process);
        }
        catch
        {
            // Process.Start can itself throw (for example when the executable
            // cannot be found). Dispose on every failure path so the Process
            // object is not leaked, then let the original exception surface.
            process.Dispose();
            throw;
        }
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,49 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Immutable bundle of what a worker client connection consists of: the
/// session id, the handshake nonce, the IPC stream, the frame protocol
/// options and, when available, the worker process handle.
/// </summary>
public sealed class WorkerClientConnection
{
    /// <summary>Creates a worker client connection.</summary>
    /// <param name="sessionId">Identifier of the session; must not be null, empty or whitespace.</param>
    /// <param name="nonce">Worker handshake nonce; must not be null, empty or whitespace.</param>
    /// <param name="stream">Named pipe stream used for IPC communication.</param>
    /// <param name="frameOptions">Serialization options of the frame protocol.</param>
    /// <param name="processHandle">Handle of the worker process, if one is available.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="sessionId"/> or <paramref name="nonce"/> is null, empty or whitespace.
    /// </exception>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="stream"/> or <paramref name="frameOptions"/> is <see langword="null"/>.
    /// </exception>
    public WorkerClientConnection(
        string sessionId,
        string nonce,
        Stream stream,
        WorkerFrameProtocolOptions frameOptions,
        WorkerProcessHandle? processHandle = null)
    {
        // Validation runs in parameter order: session id, nonce, stream,
        // frame options.
        SessionId = !string.IsNullOrWhiteSpace(sessionId)
            ? sessionId
            : throw new ArgumentException("Session id is required.", nameof(sessionId));

        Nonce = !string.IsNullOrWhiteSpace(nonce)
            ? nonce
            : throw new ArgumentException("Worker nonce is required.", nameof(nonce));

        if (stream is null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        if (frameOptions is null)
        {
            throw new ArgumentNullException(nameof(frameOptions));
        }

        Stream = stream;
        FrameOptions = frameOptions;
        ProcessHandle = processHandle;
    }

    /// <summary>Gets the session ID this connection belongs to.</summary>
    public string SessionId { get; }

    /// <summary>Gets the nonce used when handshaking with the worker.</summary>
    public string Nonce { get; }

    /// <summary>Gets the named pipe stream used for IPC communication.</summary>
    public Stream Stream { get; }

    /// <summary>Gets the frame protocol options used for serialization.</summary>
    public WorkerFrameProtocolOptions FrameOptions { get; }

    /// <summary>Gets the worker process handle, or <see langword="null"/> when none is available.</summary>
    public WorkerProcessHandle? ProcessHandle { get; }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Error codes that classify a worker client failure. The numeric values are
/// the ones the compiler assigns implicitly, spelled out so they stay fixed
/// if members are reordered.
/// </summary>
public enum WorkerClientErrorCode
{
    InvalidState = 0,
    ProtocolViolation = 1,
    PipeDisconnected = 2,
    CommandTimeout = 3,
    WorkerFaulted = 4,
    HeartbeatExpired = 5,
    ShutdownTimeout = 6,
    GatewayShutdown = 7,
    WriteFailed = 8,
    PendingCommandLimitExceeded = 9,
}
|
||||
@@ -0,0 +1,40 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>Exception thrown when communicating with a worker process fails.</summary>
public sealed class WorkerClientException : Exception
{
    /// <summary>Creates the exception from an error code and a message.</summary>
    /// <param name="errorCode">Code that classifies the failure.</param>
    /// <param name="message">Diagnostic message describing the failure.</param>
    public WorkerClientException(WorkerClientErrorCode errorCode, string message)
        : base(message) => ErrorCode = errorCode;

    /// <summary>Creates the exception from an error code, a message and the causing exception.</summary>
    /// <param name="errorCode">Code that classifies the failure.</param>
    /// <param name="message">Diagnostic message describing the failure.</param>
    /// <param name="innerException">The underlying exception.</param>
    public WorkerClientException(WorkerClientErrorCode errorCode, string message, Exception innerException)
        : base(message, innerException) => ErrorCode = errorCode;

    /// <summary>Gets the error code that classifies the failure.</summary>
    public WorkerClientErrorCode ErrorCode { get; }
}
|
||||
@@ -0,0 +1,68 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>Tunable settings that control worker client behavior.</summary>
public sealed class WorkerClientOptions
{
    /// <summary>Default for <see cref="HeartbeatGrace"/>: the maximum heartbeat age before the client faults.</summary>
    public static readonly TimeSpan DefaultHeartbeatGrace = TimeSpan.FromSeconds(15);

    /// <summary>Default for <see cref="HeartbeatCheckInterval"/>: how often heartbeat staleness is checked.</summary>
    public static readonly TimeSpan DefaultHeartbeatCheckInterval = TimeSpan.FromSeconds(1);

    /// <summary>Default for <see cref="EventChannelFullModeTimeout"/>: the wait applied when the event queue is full.</summary>
    public static readonly TimeSpan DefaultEventChannelFullModeTimeout = TimeSpan.FromSeconds(5);

    /// <summary>
    /// Default for <see cref="HeartbeatStuckCeiling"/>, the ceiling on the
    /// in-flight-command heartbeat skip. Mirrors
    /// <see cref="ZB.MOM.WW.MxGateway.Worker.Ipc.WorkerPipeSessionOptions.DefaultHeartbeatStuckCeiling"/>
    /// on the worker side (Worker-023). Once a command has been in flight for
    /// longer than this, the gateway-side heartbeat watchdog fires regardless
    /// of pending commands — a truly stuck COM call shouldn't hide the worker
    /// forever.
    /// </summary>
    public static readonly TimeSpan DefaultHeartbeatStuckCeiling = TimeSpan.FromSeconds(75);

    /// <summary>Gets the maximum allowed age of the last heartbeat before the client is faulted.</summary>
    public TimeSpan HeartbeatGrace { get; init; } = DefaultHeartbeatGrace;

    /// <summary>Gets the interval at which heartbeat expiration is checked.</summary>
    public TimeSpan HeartbeatCheckInterval { get; init; } = DefaultHeartbeatCheckInterval;

    /// <summary>Gets the maximum number of events buffered before backpressure is applied.</summary>
    public int EventChannelCapacity { get; init; } = 1_024;

    /// <summary>
    /// Gets how long to wait for the gateway-side event channel to drain
    /// before the worker is faulted. <c>EnqueueWorkerEventAsync</c> honors it
    /// through <c>WriteAsync</c>; because the channel is configured for
    /// <c>BoundedChannelFullMode.Wait</c>, a transient backlog only faults
    /// once the configured timeout has elapsed (Server-032). Before
    /// Server-032 the field was declared but unused and overflow faulted
    /// immediately.
    /// </summary>
    public TimeSpan EventChannelFullModeTimeout { get; init; } = DefaultEventChannelFullModeTimeout;

    /// <summary>Gets the maximum number of concurrent pending commands.</summary>
    public int MaxPendingCommands { get; init; } = 128;

    /// <summary>
    /// Server-031: gets the ceiling on the in-flight-command heartbeat-skip.
    /// When a command has been pending on the gateway↔worker pipe for longer
    /// than this, the gateway-side <c>HeartbeatLoopAsync</c> raises the
    /// <c>HeartbeatExpired</c> fault even though commands are still pending;
    /// a truly stuck COM call shouldn't keep the watchdog suppressed
    /// indefinitely. Mirrors Worker-023's <c>HeartbeatStuckCeiling</c> on the
    /// worker side.
    /// </summary>
    public TimeSpan HeartbeatStuckCeiling { get; init; } = DefaultHeartbeatStuckCeiling;
}
|
||||
@@ -0,0 +1,11 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// States of a worker client connection. The numeric values are the ones the
/// compiler assigns implicitly, written out explicitly.
/// </summary>
public enum WorkerClientState
{
    Created = 0,
    Handshaking = 1,
    Ready = 2,
    Closing = 3,
    Closed = 4,
    Faulted = 5,
}
|
||||
@@ -0,0 +1,40 @@
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Checks worker envelope messages against what the frame protocol expects.
/// </summary>
internal static class WorkerEnvelopeValidator
{
    /// <summary>
    /// Verifies, in this order, that the envelope's protocol version matches
    /// the options, that its session id matches the options (ordinal
    /// comparison), and that it carries a typed body.
    /// </summary>
    /// <param name="envelope">Envelope to check.</param>
    /// <param name="options">Frame protocol configuration holding the expected values.</param>
    /// <exception cref="WorkerFrameProtocolException">
    /// The first check that fails, reported as <c>ProtocolVersionMismatch</c>,
    /// <c>SessionMismatch</c> or <c>InvalidEnvelope</c>.
    /// </exception>
    public static void Validate(WorkerEnvelope envelope, WorkerFrameProtocolOptions options)
    {
        var actualVersion = envelope.ProtocolVersion;
        var expectedVersion = options.ProtocolVersion;
        if (actualVersion != expectedVersion)
        {
            throw Violation(
                WorkerFrameProtocolErrorCode.ProtocolVersionMismatch,
                $"Worker envelope protocol version {actualVersion} does not match expected version {expectedVersion}.");
        }

        bool sameSession = string.Equals(envelope.SessionId, options.SessionId, StringComparison.Ordinal);
        if (!sameSession)
        {
            throw Violation(
                WorkerFrameProtocolErrorCode.SessionMismatch,
                "Worker envelope session id does not match the owning gateway session.");
        }

        if (envelope.BodyCase is WorkerEnvelope.BodyOneofCase.None)
        {
            throw Violation(
                WorkerFrameProtocolErrorCode.InvalidEnvelope,
                "Worker envelope must include a typed body.");
        }
    }

    /// <summary>Builds the protocol exception for a failed check.</summary>
    private static WorkerFrameProtocolException Violation(WorkerFrameProtocolErrorCode code, string message) =>
        new(code, message);
}
|
||||
@@ -0,0 +1,87 @@
|
||||
using System.Buffers.Binary;
|
||||
using ZB.MOM.WW.MxGateway.Server.Configuration;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
internal static class WorkerExecutableValidator
|
||||
{
|
||||
private const ushort ImageFileMachineI386 = 0x014c;
|
||||
private const ushort ImageFileMachineAmd64 = 0x8664;
|
||||
private const int DosHeaderSignatureOffset = 0;
|
||||
private const int PeHeaderOffsetPointer = 0x3c;
|
||||
private const int PeSignatureSize = 4;
|
||||
private const int MachineOffsetFromPeHeader = PeSignatureSize;
|
||||
private const int MinimumHeaderSize = 0x40;
|
||||
|
||||
/// <summary>Validates that a worker executable file has the required architecture.</summary>
|
||||
/// <param name="executablePath">Full path to the worker executable file.</param>
|
||||
/// <param name="requiredArchitecture">Required CPU architecture (x86 or x64).</param>
|
||||
/// <exception cref="WorkerProcessLaunchException">Thrown if the executable architecture does not match the required architecture.</exception>
|
||||
public static void Validate(
|
||||
string executablePath,
|
||||
WorkerArchitecture requiredArchitecture)
|
||||
{
|
||||
ushort machine = ReadMachineType(executablePath);
|
||||
ushort expectedMachine = requiredArchitecture switch
|
||||
{
|
||||
WorkerArchitecture.X86 => ImageFileMachineI386,
|
||||
WorkerArchitecture.X64 => ImageFileMachineAmd64,
|
||||
_ => throw new WorkerProcessLaunchException(
|
||||
WorkerProcessLaunchErrorCode.InvalidExecutable,
|
||||
"Worker executable required architecture is unsupported."),
|
||||
};
|
||||
|
||||
if (machine != expectedMachine)
|
||||
{
|
||||
throw new WorkerProcessLaunchException(
|
||||
WorkerProcessLaunchErrorCode.InvalidExecutable,
|
||||
$"Worker executable architecture does not match required {requiredArchitecture} architecture.");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Reads the PE machine type from the executable header.</summary>
|
||||
/// <param name="executablePath">Full path to the executable file.</param>
|
||||
/// <returns>Machine type constant from PE header.</returns>
|
||||
private static ushort ReadMachineType(string executablePath)
|
||||
{
|
||||
byte[] header = new byte[MinimumHeaderSize];
|
||||
using FileStream stream = File.OpenRead(executablePath);
|
||||
if (stream.Read(header) < header.Length)
|
||||
{
|
||||
throw InvalidExecutable("Worker executable is too small to contain a valid PE header.");
|
||||
}
|
||||
|
||||
if (header[DosHeaderSignatureOffset] != 'M' || header[DosHeaderSignatureOffset + 1] != 'Z')
|
||||
{
|
||||
throw InvalidExecutable("Worker executable does not contain an MZ header.");
|
||||
}
|
||||
|
||||
int peHeaderOffset = BinaryPrimitives.ReadInt32LittleEndian(header.AsSpan(PeHeaderOffsetPointer, sizeof(int)));
|
||||
if (peHeaderOffset < MinimumHeaderSize)
|
||||
{
|
||||
throw InvalidExecutable("Worker executable PE header offset is invalid.");
|
||||
}
|
||||
|
||||
byte[] peHeaderBytes = new byte[PeSignatureSize + sizeof(ushort)];
|
||||
stream.Position = peHeaderOffset;
|
||||
if (stream.Read(peHeaderBytes) < peHeaderBytes.Length)
|
||||
{
|
||||
throw InvalidExecutable("Worker executable PE header is missing.");
|
||||
}
|
||||
|
||||
if (peHeaderBytes[0] != 'P' || peHeaderBytes[1] != 'E' || peHeaderBytes[2] != 0 || peHeaderBytes[3] != 0)
|
||||
{
|
||||
throw InvalidExecutable("Worker executable does not contain a PE header.");
|
||||
}
|
||||
|
||||
return BinaryPrimitives.ReadUInt16LittleEndian(
|
||||
peHeaderBytes.AsSpan(MachineOffsetFromPeHeader, sizeof(ushort)));
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds a launch exception tagged with <see cref="WorkerProcessLaunchErrorCode.InvalidExecutable"/>.
/// </summary>
/// <param name="message">Diagnostic text explaining why the executable was rejected.</param>
/// <returns>The exception instance; the caller is responsible for throwing it.</returns>
private static WorkerProcessLaunchException InvalidExecutable(string message) =>
    new(WorkerProcessLaunchErrorCode.InvalidExecutable, message);
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Classifies the failures reported through <see cref="WorkerFrameProtocolException"/>.
/// </summary>
public enum WorkerFrameProtocolErrorCode
{
    /// <summary>Unspecified failure; this is the default (zero) value.</summary>
    Unknown = 0,

    /// <summary>The frame protocol options were constructed with invalid values.</summary>
    InvalidConfiguration = 1,

    /// <summary>The stream ended before the expected number of bytes were read.</summary>
    EndOfStream = 2,

    /// <summary>The frame length prefix is not acceptable, for example a zero length.</summary>
    MalformedLength = 3,

    /// <summary>The payload length exceeds the configured maximum message size.</summary>
    MessageTooLarge = 4,

    /// <summary>The payload is not a usable worker envelope.</summary>
    InvalidEnvelope = 5,

    /// <summary>
    /// The envelope protocol version is not the expected one.
    /// NOTE(review): presumably raised by envelope validation - confirm against the validator.
    /// </summary>
    ProtocolVersionMismatch = 6,

    /// <summary>
    /// The envelope session does not match the expected session.
    /// NOTE(review): presumably raised by envelope validation - confirm against the validator.
    /// </summary>
    SessionMismatch = 7
}
|
||||
@@ -0,0 +1,40 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Signals a worker frame protocol violation and carries the
/// <see cref="WorkerFrameProtocolErrorCode"/> that classifies it.
/// </summary>
public sealed class WorkerFrameProtocolException : Exception
{
    /// <summary>
    /// Creates the exception from an error code and a message.
    /// </summary>
    /// <param name="errorCode">Code identifying the kind of protocol violation.</param>
    /// <param name="message">Human-readable description of the violation.</param>
    public WorkerFrameProtocolException(WorkerFrameProtocolErrorCode errorCode, string message)
        : base(message) =>
        ErrorCode = errorCode;

    /// <summary>
    /// Creates the exception from an error code, a message, and the exception that caused it.
    /// </summary>
    /// <param name="errorCode">Code identifying the kind of protocol violation.</param>
    /// <param name="message">Human-readable description of the violation.</param>
    /// <param name="innerException">The underlying exception being wrapped.</param>
    public WorkerFrameProtocolException(
        WorkerFrameProtocolErrorCode errorCode,
        string message,
        Exception innerException)
        : base(message, innerException) =>
        ErrorCode = errorCode;

    /// <summary>
    /// Gets the code that classifies this protocol violation.
    /// </summary>
    public WorkerFrameProtocolErrorCode ErrorCode { get; }
}
|
||||
@@ -0,0 +1,76 @@
|
||||
using ZB.MOM.WW.MxGateway.Contracts;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Immutable settings shared by the worker frame reader and writer: the session id, the
/// protocol version, and the largest payload that may be framed.
/// </summary>
public sealed class WorkerFrameProtocolOptions
{
    /// <summary>Default maximum message size in bytes (16 MB).</summary>
    public const int DefaultMaxMessageBytes = 16 * 1024 * 1024;

    /// <summary>
    /// Creates options for a session using <see cref="GatewayContractInfo.WorkerProtocolVersion"/>
    /// and <see cref="DefaultMaxMessageBytes"/>.
    /// </summary>
    /// <param name="sessionId">Identifier of the session.</param>
    /// <exception cref="WorkerFrameProtocolException">A value fails validation.</exception>
    public WorkerFrameProtocolOptions(string sessionId)
        : this(sessionId, GatewayContractInfo.WorkerProtocolVersion, DefaultMaxMessageBytes)
    {
    }

    /// <summary>
    /// Creates options from explicit values, validating each one in declaration order.
    /// </summary>
    /// <param name="sessionId">Identifier of the session; must not be null, empty, or whitespace.</param>
    /// <param name="protocolVersion">Protocol version number; must be non-zero.</param>
    /// <param name="maxMessageBytes">Maximum message size in bytes; must be greater than zero.</param>
    /// <exception cref="WorkerFrameProtocolException">
    /// A value fails validation; the error code is
    /// <see cref="WorkerFrameProtocolErrorCode.InvalidConfiguration"/>.
    /// </exception>
    public WorkerFrameProtocolOptions(string sessionId, uint protocolVersion, int maxMessageBytes)
    {
        if (string.IsNullOrWhiteSpace(sessionId))
        {
            throw InvalidConfiguration("Worker frame protocol requires a session id.");
        }

        if (protocolVersion == 0)
        {
            throw InvalidConfiguration("Worker frame protocol requires a non-zero protocol version.");
        }

        if (maxMessageBytes <= 0)
        {
            throw InvalidConfiguration("Worker frame protocol max message size must be greater than zero.");
        }

        (SessionId, ProtocolVersion, MaxMessageBytes) = (sessionId, protocolVersion, maxMessageBytes);
    }

    /// <summary>
    /// Gets the session identifier.
    /// </summary>
    public string SessionId { get; }

    /// <summary>
    /// Gets the worker protocol version.
    /// </summary>
    public uint ProtocolVersion { get; }

    /// <summary>
    /// Gets the maximum message size in bytes.
    /// </summary>
    public int MaxMessageBytes { get; }

    /// <summary>Builds the exception used for every rejected constructor argument.</summary>
    private static WorkerFrameProtocolException InvalidConfiguration(string message) =>
        new(WorkerFrameProtocolErrorCode.InvalidConfiguration, message);
}
|
||||
@@ -0,0 +1,87 @@
|
||||
using System.Buffers.Binary;
|
||||
using Google.Protobuf;
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Reads length-prefixed <see cref="WorkerEnvelope"/> protobuf messages from a stream.
/// </summary>
/// <remarks>
/// Each frame is a 4-byte little-endian unsigned payload length followed by that many payload bytes.
/// </remarks>
public sealed class WorkerFrameReader
{
    private readonly Stream _stream;
    private readonly WorkerFrameProtocolOptions _options;

    /// <summary>
    /// Creates a reader over the given stream.
    /// </summary>
    /// <param name="stream">Stream the frames are read from.</param>
    /// <param name="options">Frame protocol options used for size limits and envelope validation.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public WorkerFrameReader(Stream stream, WorkerFrameProtocolOptions options)
    {
        ArgumentNullException.ThrowIfNull(stream);
        ArgumentNullException.ThrowIfNull(options);

        _stream = stream;
        _options = options;
    }

    /// <summary>
    /// Reads one frame from the stream, parses it, and validates the resulting envelope.
    /// </summary>
    /// <param name="cancellationToken">Token passed to the underlying stream reads.</param>
    /// <returns>The parsed and validated worker envelope.</returns>
    /// <exception cref="WorkerFrameProtocolException">
    /// The stream ended early, the length prefix is zero or exceeds the configured maximum,
    /// or the payload is not a valid envelope.
    /// </exception>
    public async ValueTask<WorkerEnvelope> ReadAsync(CancellationToken cancellationToken = default)
    {
        byte[] prefix = new byte[sizeof(uint)];
        await ReadExactlyOrThrowAsync(prefix, cancellationToken).ConfigureAwait(false);

        uint payloadLength = BinaryPrimitives.ReadUInt32LittleEndian(prefix);
        EnsurePayloadLengthIsAcceptable(payloadLength);

        byte[] payload = new byte[payloadLength];
        await ReadExactlyOrThrowAsync(payload, cancellationToken).ConfigureAwait(false);

        WorkerEnvelope envelope = ParseEnvelope(payload);
        WorkerEnvelopeValidator.Validate(envelope, _options);
        return envelope;
    }

    /// <summary>Decodes the payload, translating protobuf parse failures into protocol errors.</summary>
    private static WorkerEnvelope ParseEnvelope(byte[] payload)
    {
        try
        {
            return WorkerEnvelope.Parser.ParseFrom(payload);
        }
        catch (InvalidProtocolBufferException exception)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.InvalidEnvelope,
                "Worker frame payload is not a valid WorkerEnvelope protobuf message.",
                exception);
        }
    }

    /// <summary>Rejects a zero length first, then a length above the configured maximum.</summary>
    private void EnsurePayloadLengthIsAcceptable(uint payloadLength)
    {
        if (payloadLength == 0)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.MalformedLength,
                "Worker frame payload length must be greater than zero.");
        }

        if (payloadLength > _options.MaxMessageBytes)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.MessageTooLarge,
                $"Worker frame payload length {payloadLength} exceeds the configured maximum of {_options.MaxMessageBytes} bytes.");
        }
    }

    /// <summary>Fills the buffer completely, reporting a premature end of stream as a protocol error.</summary>
    private async ValueTask ReadExactlyOrThrowAsync(
        Memory<byte> buffer,
        CancellationToken cancellationToken)
    {
        try
        {
            await _stream.ReadExactlyAsync(buffer, cancellationToken).ConfigureAwait(false);
        }
        catch (EndOfStreamException exception)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.EndOfStream,
                "Worker frame ended before the expected number of bytes were read.",
                exception);
        }
    }
}
|
||||
@@ -0,0 +1,61 @@
|
||||
using System.Buffers.Binary;
|
||||
using Google.Protobuf;
|
||||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Writes length-prefixed WorkerEnvelope protobuf messages to a stream.
/// </summary>
/// <remarks>
/// Each frame is a 4-byte little-endian unsigned payload length followed by the serialized envelope.
/// </remarks>
public sealed class WorkerFrameWriter
{
    private readonly WorkerFrameProtocolOptions _options;
    private readonly Stream _stream;

    /// <summary>
    /// Initializes the writer with a stream and frame protocol options.
    /// </summary>
    /// <param name="stream">Stream to write frames to.</param>
    /// <param name="options">Frame protocol configuration.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public WorkerFrameWriter(
        Stream stream,
        WorkerFrameProtocolOptions options)
    {
        _stream = stream ?? throw new ArgumentNullException(nameof(stream));
        _options = options ?? throw new ArgumentNullException(nameof(options));
    }

    /// <summary>
    /// Writes a WorkerEnvelope as a length-prefixed message to the stream.
    /// </summary>
    /// <param name="envelope">Worker envelope message to write.</param>
    /// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
    /// <exception cref="ArgumentNullException"><paramref name="envelope"/> is null.</exception>
    /// <exception cref="WorkerFrameProtocolException">
    /// The envelope fails validation, serializes to an empty payload, or exceeds the configured maximum size.
    /// </exception>
    public async ValueTask WriteAsync(
        WorkerEnvelope envelope,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(envelope);
        WorkerEnvelopeValidator.Validate(envelope, _options);

        int payloadLength = envelope.CalculateSize();
        if (payloadLength == 0)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.InvalidEnvelope,
                "Worker envelope cannot serialize to an empty payload.");
        }

        if (payloadLength > _options.MaxMessageBytes)
        {
            throw new WorkerFrameProtocolException(
                WorkerFrameProtocolErrorCode.MessageTooLarge,
                $"Worker envelope payload length {payloadLength} exceeds the configured maximum of {_options.MaxMessageBytes} bytes.");
        }

        // Build the whole frame in one buffer and hand it to the stream in a single write.
        // Writing the prefix and the payload separately allowed a cancellation observed between
        // the two writes to leave a length prefix with no payload behind it, which would make
        // the reader misinterpret every following frame.
        byte[] frame = new byte[sizeof(uint) + payloadLength];
        BinaryPrimitives.WriteUInt32LittleEndian(frame, (uint)payloadLength);

        // Serializes directly into the frame; the destination slice is sized from CalculateSize().
        envelope.WriteTo(frame.AsSpan(sizeof(uint), payloadLength));

        await _stream.WriteAsync(frame, cancellationToken).ConfigureAwait(false);
    }
}
|
||||
@@ -0,0 +1,45 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Pairs a worker executable path with the arguments passed to it and renders them as one line.
/// </summary>
public sealed class WorkerProcessCommandLine
{
    /// <summary>
    /// Creates a command line from an executable path and its arguments.
    /// </summary>
    /// <param name="executablePath">Path to the worker executable.</param>
    /// <param name="arguments">Command-line arguments, in order.</param>
    public WorkerProcessCommandLine(string executablePath, IReadOnlyList<string> arguments)
    {
        (ExecutablePath, Arguments) = (executablePath, arguments);
    }

    /// <summary>
    /// Gets the path to the worker executable.
    /// </summary>
    public string ExecutablePath { get; }

    /// <summary>
    /// Gets the command-line arguments.
    /// </summary>
    public IReadOnlyList<string> Arguments { get; }

    /// <summary>
    /// Renders the executable followed by the arguments, separated by single spaces.
    /// Values containing a space are wrapped in double quotes; no other escaping is applied.
    /// </summary>
    /// <returns>The space-separated command line text.</returns>
    public override string ToString()
    {
        string quotedExecutable = Quote(ExecutablePath);
        IEnumerable<string> parts = Arguments.Select(Quote).Prepend(quotedExecutable);
        return string.Join(" ", parts);
    }

    /// <summary>Wraps the value in double quotes when it contains a space character.</summary>
    private static string Quote(string value)
    {
        if (value.IndexOf(' ') < 0)
        {
            return value;
        }

        return "\"" + value + "\"";
    }
}
|
||||
@@ -0,0 +1,38 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Bundles a launched worker process with its process id, command line, and launch time.
/// Disposing the handle disposes the underlying process.
/// </summary>
public sealed class WorkerProcessHandle : IDisposable
{
    /// <summary>Creates a handle for a launched worker process.</summary>
    /// <param name="process">The underlying worker process; its id is captured here.</param>
    /// <param name="commandLine">The command line and arguments used to launch the process.</param>
    /// <param name="launchedAt">The time when the process was launched.</param>
    public WorkerProcessHandle(
        IWorkerProcess process,
        WorkerProcessCommandLine commandLine,
        DateTimeOffset launchedAt)
    {
        (Process, ProcessId, CommandLine, LaunchedAt) = (process, process.Id, commandLine, launchedAt);
    }

    /// <summary>Gets the underlying worker process.</summary>
    public IWorkerProcess Process { get; }

    /// <summary>Gets the process ID read from the process when the handle was created.</summary>
    public int ProcessId { get; }

    /// <summary>Gets the command line and arguments used to launch the process.</summary>
    public WorkerProcessCommandLine CommandLine { get; }

    /// <summary>Gets the time when the process was launched.</summary>
    public DateTimeOffset LaunchedAt { get; }

    /// <inheritdoc />
    public void Dispose() => Process.Dispose();
}
|
||||
@@ -0,0 +1,13 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Classifies the failures reported through <see cref="WorkerProcessLaunchException"/>.
/// </summary>
public enum WorkerProcessLaunchErrorCode
{
    /// <summary>Unspecified failure; this is the default (zero) value.</summary>
    Unknown = 0,

    /// <summary>The launch request is missing a required value.</summary>
    InvalidRequest = 1,

    /// <summary>The worker executable does not exist.</summary>
    ExecutableNotFound = 2,

    /// <summary>The worker executable path or the executable file itself was rejected.</summary>
    InvalidExecutable = 3,

    /// <summary>The worker working directory is not a valid path or does not exist.</summary>
    InvalidWorkingDirectory = 4,

    /// <summary>The worker process failed to start.</summary>
    StartFailed = 5,

    /// <summary>The worker process did not complete startup before the configured timeout.</summary>
    StartupTimeout = 6,

    /// <summary>The worker process failed during startup.</summary>
    StartupFailed = 7
}
|
||||
@@ -0,0 +1,31 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Signals that a worker process could not be launched and carries the
/// <see cref="WorkerProcessLaunchErrorCode"/> that classifies the failure.
/// </summary>
public sealed class WorkerProcessLaunchException : Exception
{
    /// <summary>Creates the exception from an error code and a message.</summary>
    /// <param name="errorCode">Code classifying the launch failure.</param>
    /// <param name="message">Diagnostic message.</param>
    public WorkerProcessLaunchException(WorkerProcessLaunchErrorCode errorCode, string message)
        : base(message) =>
        ErrorCode = errorCode;

    /// <summary>Creates the exception from an error code, a message, and the exception that caused it.</summary>
    /// <param name="errorCode">Code classifying the launch failure.</param>
    /// <param name="message">Diagnostic message.</param>
    /// <param name="innerException">Underlying exception being wrapped.</param>
    public WorkerProcessLaunchException(
        WorkerProcessLaunchErrorCode errorCode,
        string message,
        Exception innerException)
        : base(message, innerException) =>
        ErrorCode = errorCode;

    /// <summary>Gets the code classifying the launch failure.</summary>
    public WorkerProcessLaunchErrorCode ErrorCode { get; }
}
|
||||
@@ -0,0 +1,8 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Values needed to launch one worker process.
/// </summary>
/// <remarks>
/// NOTE(review): the compiler-synthesized <c>ToString()</c> of a positional record prints every
/// member, including <see cref="Nonce"/>; avoid logging instances directly.
/// </remarks>
/// <param name="SessionId">Identifier of the session the worker serves.</param>
/// <param name="PipeName">Name of the pipe passed to the worker.</param>
/// <param name="ProtocolVersion">Worker protocol version passed to the worker.</param>
/// <param name="Nonce">Nonce handed to the worker.</param>
/// <param name="PipeReservation">Optional disposable associated with the pipe; defaults to null.</param>
public sealed record WorkerProcessLaunchRequest(
    string SessionId,
    string PipeName,
    uint ProtocolVersion,
    string Nonce,
    IDisposable? PipeReservation = null);
|
||||
@@ -0,0 +1,336 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.MxGateway.Server.Configuration;
|
||||
using ZB.MOM.WW.MxGateway.Server.Metrics;
|
||||
using Polly;
|
||||
using Polly.Retry;
|
||||
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Launches worker processes with startup probing and error handling.
/// </summary>
/// <remarks>
/// A launch validates the request, resolves and validates the configured executable and working
/// directory, starts the process through <see cref="IWorkerProcessFactory"/>, and then runs the
/// <see cref="IWorkerStartupProbe"/> under the configured startup timeout. A process that was
/// started but did not become ready is killed and disposed before the failure propagates.
/// </remarks>
public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
{
    /// <summary>Environment variable for worker nonce.</summary>
    public const string WorkerNonceEnvironmentVariableName = "MXGATEWAY_WORKER_NONCE";

    /// <summary>Environment variable for worker pipe connect attempt timeout.</summary>
    public const string WorkerPipeConnectAttemptTimeoutEnvironmentVariableName =
        "MXGATEWAY_WORKER_PIPE_CONNECT_ATTEMPT_TIMEOUT_MS";

    private readonly IWorkerProcessFactory _processFactory;
    private readonly IWorkerStartupProbe _startupProbe;
    private readonly GatewayMetrics _metrics;
    private readonly TimeProvider _timeProvider;
    private readonly WorkerOptions _workerOptions;
    private readonly ILogger<WorkerProcessLauncher> _logger;

    /// <summary>
    /// Initializes the worker process launcher with gateway options and dependencies.
    /// </summary>
    /// <param name="gatewayOptions">Gateway configuration options.</param>
    /// <param name="processFactory">Factory for creating worker processes.</param>
    /// <param name="startupProbe">Probe for checking worker startup completion.</param>
    /// <param name="metrics">Gateway metrics collector.</param>
    /// <param name="logger">Optional logger for diagnostic output; a no-op logger is used when null.</param>
    /// <param name="timeProvider">Optional time provider for timestamps; the system provider is used when null.</param>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public WorkerProcessLauncher(
        IOptions<GatewayOptions> gatewayOptions,
        IWorkerProcessFactory processFactory,
        IWorkerStartupProbe startupProbe,
        GatewayMetrics metrics,
        ILogger<WorkerProcessLauncher>? logger = null,
        TimeProvider? timeProvider = null)
    {
        ArgumentNullException.ThrowIfNull(gatewayOptions);
        ArgumentNullException.ThrowIfNull(processFactory);
        ArgumentNullException.ThrowIfNull(startupProbe);
        ArgumentNullException.ThrowIfNull(metrics);

        _workerOptions = gatewayOptions.Value.Worker;
        _processFactory = processFactory;
        _startupProbe = startupProbe;
        _metrics = metrics;
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? NullLogger<WorkerProcessLauncher>.Instance;
    }

    /// <summary>
    /// Launches a worker process and waits for startup.
    /// </summary>
    /// <param name="request">Request payload.</param>
    /// <param name="cancellationToken">Token to cancel the asynchronous operation.</param>
    /// <returns>Handle to the launched worker process.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="WorkerProcessLaunchException">The request, executable, start, or startup probe failed.</exception>
    /// <exception cref="OperationCanceledException">The caller's token was canceled during startup.</exception>
    public async Task<WorkerProcessHandle> LaunchAsync(
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken = default)
    {
        // Guard before the try block: the cleanup handler below dereferences the request, so a
        // null request would otherwise surface as a NullReferenceException thrown from the handler.
        ArgumentNullException.ThrowIfNull(request);

        try
        {
            return await LaunchCoreAsync(request, cancellationToken).ConfigureAwait(false);
        }
        catch
        {
            // Any failed launch releases the pipe reservation attached to the request.
            request.PipeReservation?.Dispose();
            throw;
        }
    }

    /// <summary>Validates, starts the process, and runs the startup probe under the startup timeout.</summary>
    private async Task<WorkerProcessHandle> LaunchCoreAsync(
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken)
    {
        ValidateRequest(request);

        DateTimeOffset startedAt = _timeProvider.GetUtcNow();
        ProcessStartInfo startInfo = CreateStartInfo(request, out WorkerProcessCommandLine commandLine);

        IWorkerProcess process;
        try
        {
            process = _processFactory.Start(startInfo);
        }
        catch (Exception exception) when (exception is not WorkerProcessLaunchException)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.StartFailed,
                "Worker process failed to start.",
                exception);
        }

        try
        {
            using CancellationTokenSource startupTimeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
            startupTimeout.CancelAfter(TimeSpan.FromSeconds(_workerOptions.StartupTimeoutSeconds));

            await CreateStartupProbePipeline(process)
                .ExecuteAsync(
                    async token =>
                    {
                        await _startupProbe
                            .WaitUntilReadyAsync(process, request, token)
                            .ConfigureAwait(false);
                    },
                    startupTimeout.Token)
                .ConfigureAwait(false);

            return new WorkerProcessHandle(process, commandLine, startedAt);
        }
        catch (OperationCanceledException exception) when (!cancellationToken.IsCancellationRequested)
        {
            // The caller's token is not canceled, so the cancellation is attributed to the timeout.
            KillAndDispose(process, "StartupTimeout");
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.StartupTimeout,
                "Worker process did not complete startup before the configured timeout.",
                exception);
        }
        catch (OperationCanceledException)
        {
            KillAndDispose(process, "LaunchCanceled");
            throw;
        }
        catch (Exception exception) when (exception is not WorkerProcessLaunchException)
        {
            KillAndDispose(process, "StartupFailed");
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.StartupFailed,
                "Worker process failed during startup.",
                exception);
        }
        catch (WorkerProcessLaunchException)
        {
            KillAndDispose(process, "StartupFailed");
            throw;
        }
    }

    /// <summary>
    /// Builds the start info for the worker. The session id, pipe name, and protocol version are
    /// passed as arguments; the nonce and pipe connect timeout are passed as environment variables.
    /// </summary>
    private ProcessStartInfo CreateStartInfo(
        WorkerProcessLaunchRequest request,
        out WorkerProcessCommandLine commandLine)
    {
        string executablePath = ResolveExecutablePath();
        string workingDirectory = ResolveWorkingDirectory(executablePath);
        string[] arguments =
        [
            "--session-id",
            request.SessionId,
            "--pipe-name",
            request.PipeName,
            "--protocol-version",
            request.ProtocolVersion.ToString(System.Globalization.CultureInfo.InvariantCulture),
        ];

        ProcessStartInfo startInfo = new()
        {
            FileName = executablePath,
            WorkingDirectory = workingDirectory,
            UseShellExecute = false,
            CreateNoWindow = true,
            ErrorDialog = false,
        };

        foreach (string argument in arguments)
        {
            startInfo.ArgumentList.Add(argument);
        }

        startInfo.Environment[WorkerNonceEnvironmentVariableName] = request.Nonce;
        startInfo.Environment[WorkerPipeConnectAttemptTimeoutEnvironmentVariableName] =
            _workerOptions.PipeConnectAttemptTimeoutMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture);

        commandLine = new WorkerProcessCommandLine(executablePath, arguments);

        return startInfo;
    }

    /// <summary>
    /// Resolves the configured executable to a full path and checks that it has a .exe extension,
    /// exists, and passes <see cref="WorkerExecutableValidator"/> for the required architecture.
    /// </summary>
    private string ResolveExecutablePath()
    {
        string executablePath;
        try
        {
            executablePath = Path.GetFullPath(_workerOptions.ExecutablePath);
        }
        catch (Exception exception) when (exception is ArgumentException or NotSupportedException or PathTooLongException)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidExecutable,
                "Worker executable path is not a valid filesystem path.",
                exception);
        }

        if (!string.Equals(Path.GetExtension(executablePath), ".exe", StringComparison.OrdinalIgnoreCase))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidExecutable,
                "Worker executable path must point to a .exe file.");
        }

        if (!File.Exists(executablePath))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.ExecutableNotFound,
                "Worker executable does not exist.");
        }

        WorkerExecutableValidator.Validate(executablePath, _workerOptions.RequiredArchitecture);

        return executablePath;
    }

    /// <summary>
    /// Returns the configured working directory as a full path, or the executable's directory
    /// when none is configured.
    /// </summary>
    private string ResolveWorkingDirectory(string executablePath)
    {
        if (string.IsNullOrWhiteSpace(_workerOptions.WorkingDirectory))
        {
            return Path.GetDirectoryName(executablePath) ?? Environment.CurrentDirectory;
        }

        string workingDirectory;
        try
        {
            workingDirectory = Path.GetFullPath(_workerOptions.WorkingDirectory);
        }
        catch (Exception exception) when (exception is ArgumentException or NotSupportedException or PathTooLongException)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidWorkingDirectory,
                "Worker working directory is not a valid filesystem path.",
                exception);
        }

        if (!Directory.Exists(workingDirectory))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidWorkingDirectory,
                "Worker working directory does not exist.");
        }

        return workingDirectory;
    }

    /// <summary>
    /// Kills the process tree if the process is still running, then disposes the process.
    /// </summary>
    /// <param name="process">Process to clean up.</param>
    /// <param name="reason">Reason recorded with the kill metric.</param>
    private void KillAndDispose(IWorkerProcess process, string reason)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
                _metrics.WorkerKilled(reason);
            }
        }
        catch (Exception exception)
        {
            // This runs inside the launch failure handlers. Letting a kill failure escape (for
            // example when the process exits between the HasExited check and Kill) would replace
            // the classified launch failure or the caller's cancellation, so it is logged instead.
            _logger.LogWarning(
                exception,
                "Failed to kill worker process during launch cleanup. Reason {Reason}.",
                reason);
        }
        finally
        {
            process.Dispose();
        }
    }

    /// <summary>
    /// Builds the pipeline that runs the startup probe, retrying with jittered exponential
    /// backoff while <see cref="ShouldRetryStartupProbe"/> allows it.
    /// </summary>
    private ResiliencePipeline CreateStartupProbePipeline(IWorkerProcess process)
    {
        // The configured value counts attempts; the retry count excludes the first attempt.
        int maxRetryAttempts = _workerOptions.StartupProbeRetryAttempts - 1;
        if (maxRetryAttempts <= 0)
        {
            // Polly's options validation rejects a retry count below 1, so a configuration asking
            // for a single attempt runs the probe once through a pipeline with no retry strategy.
            return ResiliencePipeline.Empty;
        }

        RetryStrategyOptions retryOptions = new()
        {
            MaxRetryAttempts = maxRetryAttempts,
            BackoffType = DelayBackoffType.Exponential,
            UseJitter = true,
            Delay = TimeSpan.FromMilliseconds(_workerOptions.StartupProbeRetryDelayMilliseconds),
            MaxDelay = TimeSpan.FromSeconds(2),
            ShouldHandle = new PredicateBuilder().Handle<Exception>(exception =>
                ShouldRetryStartupProbe(exception, process)),
            OnRetry = args =>
            {
                _metrics.RetryAttempted("worker_startup");
                _logger.LogDebug(
                    args.Outcome.Exception,
                    "Retrying worker startup probe after transient failure. Attempt {Attempt}.",
                    args.AttemptNumber + 1);
                return default;
            },
        };

        return new ResiliencePipelineBuilder()
            .AddRetry(retryOptions)
            .Build();
    }

    /// <summary>
    /// Decides whether a probe failure is retried: never for cancellation or launch exceptions,
    /// otherwise only while the process is still running.
    /// </summary>
    private static bool ShouldRetryStartupProbe(Exception exception, IWorkerProcess process)
    {
        if (exception is OperationCanceledException or WorkerProcessLaunchException)
        {
            return false;
        }

        return !process.HasExited;
    }

    /// <summary>Rejects a request with a blank session id, pipe name, or nonce, or a zero protocol version.</summary>
    private static void ValidateRequest(WorkerProcessLaunchRequest request)
    {
        if (string.IsNullOrWhiteSpace(request.SessionId))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a session id.");
        }

        if (string.IsNullOrWhiteSpace(request.PipeName))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a pipe name.");
        }

        if (request.ProtocolVersion == 0)
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a non-zero protocol version.");
        }

        if (string.IsNullOrWhiteSpace(request.Nonce))
        {
            throw new WorkerProcessLaunchException(
                WorkerProcessLaunchErrorCode.InvalidRequest,
                "Worker launch requires a nonce.");
        }
    }
}
|
||||
@@ -0,0 +1,24 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>
/// Startup probe that treats a worker as ready as long as its process has not exited.
/// </summary>
public sealed class WorkerProcessStartedProbe : IWorkerStartupProbe
{
    /// <summary>Checks once whether the worker process is still running.</summary>
    /// <param name="process">Worker process to check.</param>
    /// <param name="request">Process launch request; not read by this probe.</param>
    /// <param name="cancellationToken">Cancellation token; not observed by this probe.</param>
    /// <returns>An already completed task when the process has not exited.</returns>
    /// <exception cref="WorkerProcessLaunchException">
    /// The process has already exited; thrown synchronously with the process exit code in the message.
    /// </exception>
    public Task WaitUntilReadyAsync(
        IWorkerProcess process,
        WorkerProcessLaunchRequest request,
        CancellationToken cancellationToken)
    {
        if (!process.HasExited)
        {
            return Task.CompletedTask;
        }

        throw new WorkerProcessLaunchException(
            WorkerProcessLaunchErrorCode.StartupFailed,
            $"Worker process exited before startup completed with exit code {process.ExitCode}.");
    }
}
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace ZB.MOM.WW.MxGateway.Server.Workers;
|
||||
|
||||
/// <summary>Dependency-injection registrations for worker process management.</summary>
public static class WorkerServiceCollectionExtensions
{
    /// <summary>
    /// Registers the worker process factory, startup probe, and launcher as singletons, together
    /// with the orphan-worker cleanup services.
    /// </summary>
    /// <param name="services">Service collection that receives the registrations.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddWorkerProcessLauncher(this IServiceCollection services)
    {
        services
            .AddSingleton<IWorkerProcessFactory, SystemWorkerProcessFactory>()
            .AddSingleton<IWorkerStartupProbe, WorkerProcessStartedProbe>()
            .AddSingleton<IWorkerProcessLauncher, WorkerProcessLauncher>();

        // Workers leaked by a previous unclean gateway run are terminated before the server
        // accepts sessions. These registrations are made ahead of AddGatewaySessions so that
        // the cleanup hosted service starts before the session subsystem.
        services
            .AddSingleton<IRunningProcessInspector, SystemRunningProcessInspector>()
            .AddSingleton<OrphanWorkerTerminator>()
            .AddHostedService<OrphanWorkerCleanupHostedService>();

        return services;
    }
}
|
||||
Reference in New Issue
Block a user