Improve gateway reliability and dashboard docs

This commit is contained in:
Joseph Doherty
2026-04-28 00:13:22 -04:00
parent bd4a09a35e
commit 4fc355b357
61 changed files with 1722 additions and 150 deletions
@@ -8,6 +8,8 @@ namespace MxGateway.Client.Cli;
public static class MxGatewayClientCli public static class MxGatewayClientCli
{ {
private const uint MaxAggregateEvents = 10_000;
private static readonly JsonFormatter ProtobufJsonFormatter = JsonFormatter.Default; private static readonly JsonFormatter ProtobufJsonFormatter = JsonFormatter.Default;
private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web); private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web);
@@ -342,8 +344,22 @@ public static class MxGatewayClientCli
TextWriter output, TextWriter output,
CancellationToken cancellationToken) CancellationToken cancellationToken)
{ {
var events = new List<MxEvent>();
uint maxEvents = arguments.GetUInt32("max-events", 0); uint maxEvents = arguments.GetUInt32("max-events", 0);
bool json = arguments.HasFlag("json");
bool jsonLines = arguments.HasFlag("jsonl");
if (json && !jsonLines && maxEvents is 0)
{
throw new ArgumentException("--json stream-events requires --max-events to bound aggregate output.");
}
if (maxEvents > MaxAggregateEvents)
{
throw new ArgumentException($"--max-events cannot exceed {MaxAggregateEvents}.");
}
var events = json && !jsonLines
? new List<MxEvent>(checked((int)maxEvents))
: [];
uint eventCount = 0; uint eventCount = 0;
var request = new StreamEventsRequest var request = new StreamEventsRequest
{ {
@@ -355,7 +371,11 @@ public static class MxGatewayClientCli
.WithCancellation(cancellationToken) .WithCancellation(cancellationToken)
.ConfigureAwait(false)) .ConfigureAwait(false))
{ {
if (arguments.HasFlag("json")) if (jsonLines)
{
output.WriteLine(ProtobufJsonFormatter.Format(gatewayEvent));
}
else if (json)
{ {
events.Add(gatewayEvent); events.Add(gatewayEvent);
} }
@@ -371,7 +391,7 @@ public static class MxGatewayClientCli
} }
} }
if (arguments.HasFlag("json")) if (json && !jsonLines)
{ {
output.WriteLine(JsonSerializer.Serialize( output.WriteLine(JsonSerializer.Serialize(
new { events = events.Select(EventToJsonElement).ToArray() }, new { events = events.Select(EventToJsonElement).ToArray() },
@@ -25,7 +25,7 @@ internal sealed class GrpcMxGatewayClientTransport(
} }
catch (RpcException exception) catch (RpcException exception)
{ {
throw MapRpcException(exception); throw MapRpcException(exception, callOptions.CancellationToken);
} }
} }
@@ -41,7 +41,7 @@ internal sealed class GrpcMxGatewayClientTransport(
} }
catch (RpcException exception) catch (RpcException exception)
{ {
throw MapRpcException(exception); throw MapRpcException(exception, callOptions.CancellationToken);
} }
} }
@@ -57,7 +57,7 @@ internal sealed class GrpcMxGatewayClientTransport(
} }
catch (RpcException exception) catch (RpcException exception)
{ {
throw MapRpcException(exception); throw MapRpcException(exception, callOptions.CancellationToken);
} }
} }
@@ -87,7 +87,7 @@ internal sealed class GrpcMxGatewayClientTransport(
} }
catch (RpcException exception) catch (RpcException exception)
{ {
throw MapRpcException(exception); throw MapRpcException(exception, effectiveCancellationToken);
} }
yield return gatewayEvent; yield return gatewayEvent;
@@ -101,8 +101,18 @@ internal sealed class GrpcMxGatewayClientTransport(
return StreamEventsAsync(request, callOptions); return StreamEventsAsync(request, callOptions);
} }
private static MxGatewayException MapRpcException(RpcException exception) private static Exception MapRpcException(
RpcException exception,
CancellationToken cancellationToken)
{ {
if (cancellationToken.IsCancellationRequested || exception.StatusCode == StatusCode.Cancelled)
{
return new OperationCanceledException(
exception.Status.Detail,
exception,
cancellationToken);
}
return exception.StatusCode switch return exception.StatusCode switch
{ {
StatusCode.Unauthenticated => new MxGatewayAuthenticationException( StatusCode.Unauthenticated => new MxGatewayAuthenticationException(
@@ -3,6 +3,9 @@ using Grpc.Net.Client;
using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging;
using MxGateway.Contracts.Proto; using MxGateway.Contracts.Proto;
using Polly; using Polly;
using System.Net.Http;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
namespace MxGateway.Client; namespace MxGateway.Client;
@@ -54,10 +57,12 @@ public sealed class MxGatewayClient : IAsyncDisposable
ArgumentNullException.ThrowIfNull(options); ArgumentNullException.ThrowIfNull(options);
options.Validate(); options.Validate();
HttpMessageHandler handler = CreateHttpHandler(options);
var channel = GrpcChannel.ForAddress( var channel = GrpcChannel.ForAddress(
options.Endpoint, options.Endpoint,
new GrpcChannelOptions new GrpcChannelOptions
{ {
HttpHandler = handler,
LoggerFactory = options.LoggerFactory, LoggerFactory = options.LoggerFactory,
}); });
@@ -126,7 +131,7 @@ public sealed class MxGatewayClient : IAsyncDisposable
ArgumentNullException.ThrowIfNull(request); ArgumentNullException.ThrowIfNull(request);
ThrowIfDisposed(); ThrowIfDisposed();
return _transport.StreamEventsAsync(request, CreateCallOptions(cancellationToken)); return _transport.StreamEventsAsync(request, CreateStreamCallOptions(cancellationToken));
} }
public ValueTask DisposeAsync() public ValueTask DisposeAsync()
@@ -142,6 +147,18 @@ public sealed class MxGatewayClient : IAsyncDisposable
} }
internal CallOptions CreateCallOptions(CancellationToken cancellationToken) internal CallOptions CreateCallOptions(CancellationToken cancellationToken)
{
return CreateCallOptions(cancellationToken, Options.DefaultCallTimeout);
}
/// <summary>
/// Builds the call options for a streaming call, using the configured
/// <c>Options.StreamTimeout</c> as the timeout instead of the default call timeout.
/// </summary>
/// <param name="cancellationToken">Token forwarded into the resulting call options.</param>
/// <returns>The call options produced by the timeout-aware overload.</returns>
internal CallOptions CreateStreamCallOptions(CancellationToken cancellationToken)
    => CreateCallOptions(cancellationToken, Options.StreamTimeout);
internal CallOptions CreateCallOptions(
CancellationToken cancellationToken,
TimeSpan? timeout)
{ {
Metadata headers = new() Metadata headers = new()
{ {
@@ -150,18 +167,61 @@ public sealed class MxGatewayClient : IAsyncDisposable
return new CallOptions( return new CallOptions(
headers, headers,
DateTime.UtcNow.Add(Options.DefaultCallTimeout), timeout is null ? null : DateTime.UtcNow.Add(timeout.Value),
cancellationToken); cancellationToken);
} }
private Task<T> ExecuteSafeUnaryAsync<T>( private async Task<T> ExecuteSafeUnaryAsync<T>(
Func<CancellationToken, Task<T>> call, Func<CancellationToken, Task<T>> call,
CancellationToken cancellationToken) CancellationToken cancellationToken)
{ {
return _safeUnaryRetryPipeline.ExecuteAsync( using CancellationTokenSource timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
timeout.CancelAfter(Options.DefaultCallTimeout);
return await _safeUnaryRetryPipeline.ExecuteAsync(
async token => await call(token).ConfigureAwait(false), async token => await call(token).ConfigureAwait(false),
cancellationToken) timeout.Token)
.AsTask(); .ConfigureAwait(false);
}
/// <summary>
/// Builds the HTTP handler for the gRPC channel: applies the configured connect timeout and,
/// when TLS is enabled, the optional server-name override and custom CA trust.
/// </summary>
/// <param name="options">Client options supplying the connect timeout and TLS settings.</param>
/// <returns>A configured <see cref="SocketsHttpHandler"/>.</returns>
private static HttpMessageHandler CreateHttpHandler(MxGatewayClientOptions options)
{
    SocketsHttpHandler handler = new()
    {
        ConnectTimeout = options.ConnectTimeout,
    };

    if (!options.UseTls)
    {
        return handler;
    }

    handler.SslOptions = new SslClientAuthenticationOptions();

    string? serverNameOverride = string.IsNullOrWhiteSpace(options.ServerNameOverride)
        ? null
        : options.ServerNameOverride;
    if (serverNameOverride is not null)
    {
        handler.SslOptions.TargetHost = serverNameOverride;
    }

    if (string.IsNullOrWhiteSpace(options.CaCertificatePath))
    {
        // No custom CA configured: leave certificate validation to the platform defaults.
        return handler;
    }

    X509Certificate2 trustedRoot = X509CertificateLoader.LoadCertificateFromFile(options.CaCertificatePath);
    handler.SslOptions.RemoteCertificateValidationCallback = (_, certificate, chain, errors) =>
    {
        if (certificate is null)
        {
            return false;
        }

        // Only a copy created here is owned by this callback and must be disposed;
        // a certificate handed in by the TLS layer is left alone.
        X509Certificate2? ownedCopy = certificate is X509Certificate2
            ? null
            : X509CertificateLoader.LoadCertificate(certificate.Export(X509ContentType.Cert));
        try
        {
            X509Certificate2 certificateToValidate = ownedCopy ?? (X509Certificate2)certificate;

            // Rebuilding the chain below proves only that the certificate chains to the
            // custom root; it says nothing about the host name. Without this check any
            // certificate issued by that root would be accepted for any host.
            // When an override is configured the name is matched explicitly, so the result
            // does not depend on which host name the TLS layer compared against.
            bool nameMatches = serverNameOverride is not null
                ? certificateToValidate.MatchesHostname(serverNameOverride)
                : (errors & SslPolicyErrors.RemoteCertificateNameMismatch) == 0;
            if (!nameMatches)
            {
                return false;
            }

            using X509Chain customChain = new();
            customChain.ChainPolicy.TrustMode = X509ChainTrustMode.CustomRootTrust;
            customChain.ChainPolicy.CustomTrustStore.Add(trustedRoot);
            customChain.ChainPolicy.RevocationMode = X509RevocationMode.NoCheck;
            customChain.ChainPolicy.VerificationFlags = X509VerificationFlags.NoFlag;

            if (chain is not null)
            {
                // Carry over the extra certificates attached to the platform-built chain so
                // a leaf issued by an intermediate CA can still be linked to the custom root.
                // These are only chain-building candidates; trust still comes from CustomTrustStore.
                customChain.ChainPolicy.ExtraStore.AddRange(chain.ChainPolicy.ExtraStore);
            }

            return customChain.Build(certificateToValidate);
        }
        finally
        {
            ownedCopy?.Dispose();
        }
    };

    return handler;
}
private void ThrowIfDisposed() private void ThrowIfDisposed()
@@ -21,6 +21,8 @@ public sealed class MxGatewayClientOptions
public TimeSpan DefaultCallTimeout { get; init; } = TimeSpan.FromSeconds(30); public TimeSpan DefaultCallTimeout { get; init; } = TimeSpan.FromSeconds(30);
public TimeSpan? StreamTimeout { get; init; }
public MxGatewayClientRetryOptions Retry { get; init; } = new(); public MxGatewayClientRetryOptions Retry { get; init; } = new();
public ILoggerFactory? LoggerFactory { get; init; } public ILoggerFactory? LoggerFactory { get; init; }
@@ -57,6 +59,27 @@ public sealed class MxGatewayClientOptions
"The default call timeout must be greater than zero."); "The default call timeout must be greater than zero.");
} }
if (StreamTimeout is not null && StreamTimeout <= TimeSpan.Zero)
{
throw new ArgumentOutOfRangeException(
nameof(StreamTimeout),
"The stream timeout must be greater than zero when configured.");
}
if (UseTls && Endpoint.Scheme != Uri.UriSchemeHttps)
{
throw new ArgumentException(
"UseTls requires an https gateway endpoint.",
nameof(Endpoint));
}
if (!UseTls && Endpoint.Scheme == Uri.UriSchemeHttps)
{
throw new ArgumentException(
"An https gateway endpoint requires UseTls.",
nameof(Endpoint));
}
Retry.Validate(); Retry.Validate();
} }
} }
+23 -3
View File
@@ -377,17 +377,19 @@ func runSmoke(ctx context.Context, args []string, stdout, stderr io.Writer) erro
if err != nil { if err != nil {
return err return err
} }
defer session.Close(context.Background())
serverHandle, err := session.Register(ctx, *clientName) serverHandle, err := session.Register(ctx, *clientName)
if err != nil { if err != nil {
return err return closeSmokeSession(ctx, session, err)
} }
itemHandle, err := session.AddItem(ctx, serverHandle, *item) itemHandle, err := session.AddItem(ctx, serverHandle, *item)
if err != nil { if err != nil {
return err return closeSmokeSession(ctx, session, err)
} }
if err := session.Advise(ctx, serverHandle, itemHandle); err != nil { if err := session.Advise(ctx, serverHandle, itemHandle); err != nil {
return closeSmokeSession(ctx, session, err)
}
if err := closeSmokeSession(ctx, session, nil); err != nil {
return err return err
} }
@@ -406,6 +408,24 @@ func runSmoke(ctx context.Context, args []string, stdout, stderr io.Writer) erro
return nil return nil
} }
// closeSmokeSession closes session with a fresh context derived from
// context.Background(), so the close is not tied to ctx's cancellation.
// The close is bounded by five seconds, or by the time remaining before ctx's
// deadline when that is positive and shorter. A non-nil primaryErr is returned
// in preference to any error from the close; otherwise the close error is returned.
func closeSmokeSession(ctx context.Context, session *mxgateway.Session, primaryErr error) error {
	budget := 5 * time.Second
	if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
		if remaining := time.Until(deadline); remaining > 0 && remaining < budget {
			budget = remaining
		}
	}

	closeCtx, cancel := context.WithTimeout(context.Background(), budget)
	defer cancel()

	_, closeErr := session.Close(closeCtx)
	if primaryErr == nil {
		return closeErr
	}
	return primaryErr
}
func bindCommonFlags(flags *flag.FlagSet) *commonOptions { func bindCommonFlags(flags *flag.FlagSet) *commonOptions {
common := &commonOptions{} common := &commonOptions{}
flags.StringVar(&common.Endpoint, "endpoint", "localhost:5000", "gateway endpoint") flags.StringVar(&common.Endpoint, "endpoint", "localhost:5000", "gateway endpoint")
+5 -2
View File
@@ -184,8 +184,11 @@ func (c *Client) callContext(ctx context.Context) (context.Context, context.Canc
if timeout < 0 { if timeout < 0 {
return ctx, func() {} return ctx, func() {}
} }
if _, ok := ctx.Deadline(); ok { if deadline, ok := ctx.Deadline(); ok {
return ctx, func() {} timeoutDeadline := time.Now().Add(timeout)
if deadline.Before(timeoutDeadline) {
return ctx, func() {}
}
} }
return context.WithTimeout(ctx, timeout) return context.WithTimeout(ctx, timeout)
} }
+41 -2
View File
@@ -5,6 +5,7 @@ import (
"crypto/rand" "crypto/rand"
"encoding/hex" "encoding/hex"
"errors" "errors"
"fmt"
"io" "io"
"sync" "sync"
@@ -13,6 +14,8 @@ import (
"google.golang.org/grpc/status" "google.golang.org/grpc/status"
) )
const maxBulkItems = 1000
// EventResult carries either the next ordered event or a terminal stream error. // EventResult carries either the next ordered event or a terminal stream error.
type EventResult struct { type EventResult struct {
Event *MxEvent Event *MxEvent
@@ -225,6 +228,9 @@ func (s *Session) AddItemBulk(ctx context.Context, serverHandle int32, tagAddres
if tagAddresses == nil { if tagAddresses == nil {
return nil, errors.New("mxgateway: tag addresses are required") return nil, errors.New("mxgateway: tag addresses are required")
} }
if err := ensureBulkSize("tag addresses", len(tagAddresses)); err != nil {
return nil, err
}
reply, err := s.invokeCommand(ctx, &pb.MxCommand{ reply, err := s.invokeCommand(ctx, &pb.MxCommand{
Kind: pb.MxCommandKind_MX_COMMAND_KIND_ADD_ITEM_BULK, Kind: pb.MxCommandKind_MX_COMMAND_KIND_ADD_ITEM_BULK,
Payload: &pb.MxCommand_AddItemBulk{ Payload: &pb.MxCommand_AddItemBulk{
@@ -245,6 +251,9 @@ func (s *Session) AdviseItemBulk(ctx context.Context, serverHandle int32, itemHa
if itemHandles == nil { if itemHandles == nil {
return nil, errors.New("mxgateway: item handles are required") return nil, errors.New("mxgateway: item handles are required")
} }
if err := ensureBulkSize("item handles", len(itemHandles)); err != nil {
return nil, err
}
reply, err := s.invokeCommand(ctx, &pb.MxCommand{ reply, err := s.invokeCommand(ctx, &pb.MxCommand{
Kind: pb.MxCommandKind_MX_COMMAND_KIND_ADVISE_ITEM_BULK, Kind: pb.MxCommandKind_MX_COMMAND_KIND_ADVISE_ITEM_BULK,
Payload: &pb.MxCommand_AdviseItemBulk{ Payload: &pb.MxCommand_AdviseItemBulk{
@@ -265,6 +274,9 @@ func (s *Session) RemoveItemBulk(ctx context.Context, serverHandle int32, itemHa
if itemHandles == nil { if itemHandles == nil {
return nil, errors.New("mxgateway: item handles are required") return nil, errors.New("mxgateway: item handles are required")
} }
if err := ensureBulkSize("item handles", len(itemHandles)); err != nil {
return nil, err
}
reply, err := s.invokeCommand(ctx, &pb.MxCommand{ reply, err := s.invokeCommand(ctx, &pb.MxCommand{
Kind: pb.MxCommandKind_MX_COMMAND_KIND_REMOVE_ITEM_BULK, Kind: pb.MxCommandKind_MX_COMMAND_KIND_REMOVE_ITEM_BULK,
Payload: &pb.MxCommand_RemoveItemBulk{ Payload: &pb.MxCommand_RemoveItemBulk{
@@ -285,6 +297,9 @@ func (s *Session) UnAdviseItemBulk(ctx context.Context, serverHandle int32, item
if itemHandles == nil { if itemHandles == nil {
return nil, errors.New("mxgateway: item handles are required") return nil, errors.New("mxgateway: item handles are required")
} }
if err := ensureBulkSize("item handles", len(itemHandles)); err != nil {
return nil, err
}
reply, err := s.invokeCommand(ctx, &pb.MxCommand{ reply, err := s.invokeCommand(ctx, &pb.MxCommand{
Kind: pb.MxCommandKind_MX_COMMAND_KIND_UN_ADVISE_ITEM_BULK, Kind: pb.MxCommandKind_MX_COMMAND_KIND_UN_ADVISE_ITEM_BULK,
Payload: &pb.MxCommand_UnAdviseItemBulk{ Payload: &pb.MxCommand_UnAdviseItemBulk{
@@ -305,6 +320,9 @@ func (s *Session) SubscribeBulk(ctx context.Context, serverHandle int32, tagAddr
if tagAddresses == nil { if tagAddresses == nil {
return nil, errors.New("mxgateway: tag addresses are required") return nil, errors.New("mxgateway: tag addresses are required")
} }
if err := ensureBulkSize("tag addresses", len(tagAddresses)); err != nil {
return nil, err
}
reply, err := s.invokeCommand(ctx, &pb.MxCommand{ reply, err := s.invokeCommand(ctx, &pb.MxCommand{
Kind: pb.MxCommandKind_MX_COMMAND_KIND_SUBSCRIBE_BULK, Kind: pb.MxCommandKind_MX_COMMAND_KIND_SUBSCRIBE_BULK,
Payload: &pb.MxCommand_SubscribeBulk{ Payload: &pb.MxCommand_SubscribeBulk{
@@ -325,6 +343,9 @@ func (s *Session) UnsubscribeBulk(ctx context.Context, serverHandle int32, itemH
if itemHandles == nil { if itemHandles == nil {
return nil, errors.New("mxgateway: item handles are required") return nil, errors.New("mxgateway: item handles are required")
} }
if err := ensureBulkSize("item handles", len(itemHandles)); err != nil {
return nil, err
}
reply, err := s.invokeCommand(ctx, &pb.MxCommand{ reply, err := s.invokeCommand(ctx, &pb.MxCommand{
Kind: pb.MxCommandKind_MX_COMMAND_KIND_UNSUBSCRIBE_BULK, Kind: pb.MxCommandKind_MX_COMMAND_KIND_UNSUBSCRIBE_BULK,
Payload: &pb.MxCommand_UnsubscribeBulk{ Payload: &pb.MxCommand_UnsubscribeBulk{
@@ -387,13 +408,15 @@ func (s *Session) EventsAfter(ctx context.Context, afterWorkerSequence uint64) (
for { for {
event, err := stream.Recv() event, err := stream.Recv()
if err == nil { if err == nil {
results <- EventResult{Event: event} if !sendEventResult(ctx, results, EventResult{Event: event}) {
return
}
continue continue
} }
if err == io.EOF || status.Code(err) == codes.Canceled || ctx.Err() != nil { if err == io.EOF || status.Code(err) == codes.Canceled || ctx.Err() != nil {
return return
} }
results <- EventResult{Err: &GatewayError{Op: "stream events", Err: err}} sendEventResult(ctx, results, EventResult{Err: &GatewayError{Op: "stream events", Err: err}})
return return
} }
}() }()
@@ -401,6 +424,22 @@ func (s *Session) EventsAfter(ctx context.Context, afterWorkerSequence uint64) (
return results, nil return results, nil
} }
// ensureBulkSize returns an error when length exceeds maxBulkItems and nil
// otherwise. name identifies the offending argument in the error message.
func ensureBulkSize(name string, length int) error {
	if length <= maxBulkItems {
		return nil
	}
	return fmt.Errorf("mxgateway: %s bulk commands are limited to %d item(s)", name, maxBulkItems)
}
// sendEventResult tries to deliver result on results, giving up once ctx is
// done. It reports true when the value was sent and false when ctx finished
// first, so a sender is not left blocked on a channel nobody is reading.
func sendEventResult(ctx context.Context, results chan<- EventResult, result EventResult) bool {
	delivered := false
	select {
	case <-ctx.Done():
	case results <- result:
		delivered = true
	}
	return delivered
}
func (s *Session) invokeCommand(ctx context.Context, command *MxCommand) (*MxCommandReply, error) { func (s *Session) invokeCommand(ctx context.Context, command *MxCommand) (*MxCommandReply, error) {
return s.client.Invoke(ctx, &pb.MxCommandRequest{ return s.client.Invoke(ctx, &pb.MxCommandRequest{
SessionId: s.ID(), SessionId: s.ID(),
@@ -334,25 +334,28 @@ public final class MxGatewayCli implements Callable<Integer> {
var session = client.openSession(OpenSessionRequest.newBuilder() var session = client.openSession(OpenSessionRequest.newBuilder()
.setClientSessionName(clientName) .setClientSessionName(clientName)
.build()); .build());
MxGatewayCliSession cliSession = client.session(session.getSessionId()); try {
int serverHandle = cliSession.register(clientName); MxGatewayCliSession cliSession = client.session(session.getSessionId());
int itemHandle = cliSession.addItem(serverHandle, item); int serverHandle = cliSession.register(clientName);
cliSession.advise(serverHandle, itemHandle); int itemHandle = cliSession.addItem(serverHandle, item);
if (json) { cliSession.advise(serverHandle, itemHandle);
Map<String, Object> output = new LinkedHashMap<>(); if (json) {
output.put("command", "smoke"); Map<String, Object> output = new LinkedHashMap<>();
output.put("options", common.redactedJsonMap()); output.put("command", "smoke");
output.put("sessionId", session.getSessionId()); output.put("options", common.redactedJsonMap());
output.put("serverHandle", serverHandle); output.put("sessionId", session.getSessionId());
output.put("itemHandle", itemHandle); output.put("serverHandle", serverHandle);
client.out().println(jsonObject(output)); output.put("itemHandle", itemHandle);
} else { client.out().println(jsonObject(output));
client.out().printf( } else {
"session=%s server=%d item=%d%n", session.getSessionId(), serverHandle, itemHandle); client.out().printf(
"session=%s server=%d item=%d%n", session.getSessionId(), serverHandle, itemHandle);
}
} finally {
client.closeSession(CloseSessionRequest.newBuilder()
.setSessionId(session.getSessionId())
.build());
} }
client.closeSession(CloseSessionRequest.newBuilder()
.setSessionId(session.getSessionId())
.build());
} }
return 0; return 0;
} }
@@ -105,13 +105,20 @@ public final class MxEventStream implements Iterator<MxEvent>, AutoCloseable {
private void offer(Object value) { private void offer(Object value) {
Objects.requireNonNull(value, "value"); Objects.requireNonNull(value, "value");
if (value == END) { if (value == END) {
queue.offer(value); if (!queue.offer(value)) {
queue.clear();
queue.offer(value);
}
return; return;
} }
try { if (!queue.offer(value)) {
queue.put(value); ClientCallStreamObserver<StreamEventsRequest> stream = requestStream;
} catch (InterruptedException error) { if (stream != null) {
Thread.currentThread().interrupt(); stream.cancel("client event stream queue overflowed", null);
}
queue.clear();
queue.offer(new MxGatewayException("gateway stream events queue overflowed"));
queue.offer(END);
} }
} }
} }
@@ -63,7 +63,7 @@ public final class MxGatewayClient implements AutoCloseable {
} }
public MxAccessGatewayGrpc.MxAccessGatewayStub rawAsyncStub() { public MxAccessGatewayGrpc.MxAccessGatewayStub rawAsyncStub() {
return withDeadline(asyncStub); return asyncStub;
} }
public MxGatewaySession openSession(OpenSessionRequest request) { public MxGatewaySession openSession(OpenSessionRequest request) {
@@ -140,14 +140,14 @@ public final class MxGatewayClient implements AutoCloseable {
public MxEventStream streamEvents(StreamEventsRequest request) { public MxEventStream streamEvents(StreamEventsRequest request) {
MxEventStream stream = new MxEventStream(16); MxEventStream stream = new MxEventStream(16);
rawAsyncStub().streamEvents(request, stream.observer()); withStreamDeadline(rawAsyncStub()).streamEvents(request, stream.observer());
return stream; return stream;
} }
public MxGatewayEventSubscription streamEventsAsync( public MxGatewayEventSubscription streamEventsAsync(
StreamEventsRequest request, StreamObserver<MxEvent> observer) { StreamEventsRequest request, StreamObserver<MxEvent> observer) {
MxGatewayEventSubscription subscription = new MxGatewayEventSubscription(); MxGatewayEventSubscription subscription = new MxGatewayEventSubscription();
rawAsyncStub().streamEvents(request, subscription.wrap(observer)); withStreamDeadline(rawAsyncStub()).streamEvents(request, subscription.wrap(observer));
return subscription; return subscription;
} }
@@ -161,7 +161,9 @@ public final class MxGatewayClient implements AutoCloseable {
public void closeAndAwaitTermination() throws InterruptedException { public void closeAndAwaitTermination() throws InterruptedException {
if (ownedChannel != null) { if (ownedChannel != null) {
ownedChannel.shutdown(); ownedChannel.shutdown();
ownedChannel.awaitTermination(options.connectTimeout().toMillis(), TimeUnit.MILLISECONDS); if (!ownedChannel.awaitTermination(options.connectTimeout().toMillis(), TimeUnit.MILLISECONDS)) {
ownedChannel.shutdownNow();
}
} }
} }
@@ -199,6 +201,13 @@ public final class MxGatewayClient implements AutoCloseable {
return stub.withDeadlineAfter(options.callTimeout().toNanos(), TimeUnit.NANOSECONDS); return stub.withDeadlineAfter(options.callTimeout().toNanos(), TimeUnit.NANOSECONDS);
} }
/**
 * Applies the configured stream timeout to {@code stub} as a deadline.
 *
 * <p>The stub is returned unchanged when no stream timeout is configured or when the
 * configured value is negative.
 *
 * @param stub the stub to decorate
 * @return {@code stub} itself, or a copy carrying a deadline of the stream timeout from now
 */
private <T extends io.grpc.stub.AbstractStub<T>> T withStreamDeadline(T stub) {
    java.time.Duration streamLimit = options.streamTimeout();
    boolean unbounded = streamLimit == null || streamLimit.isNegative();
    return unbounded
            ? stub
            : stub.withDeadlineAfter(streamLimit.toNanos(), TimeUnit.NANOSECONDS);
}
private static <T> CompletableFuture<T> toCompletable(com.google.common.util.concurrent.ListenableFuture<T> source) { private static <T> CompletableFuture<T> toCompletable(com.google.common.util.concurrent.ListenableFuture<T> source) {
CompletableFuture<T> target = new CompletableFuture<>(); CompletableFuture<T> target = new CompletableFuture<>();
Futures.addCallback( Futures.addCallback(
@@ -219,6 +228,11 @@ public final class MxGatewayClient implements AutoCloseable {
} }
}, },
MoreExecutors.directExecutor()); MoreExecutors.directExecutor());
target.whenComplete((ignoredResult, ignoredError) -> {
if (target.isCancelled()) {
source.cancel(true);
}
});
return target; return target;
} }
@@ -15,6 +15,7 @@ public final class MxGatewayClientOptions {
private final String serverNameOverride; private final String serverNameOverride;
private final Duration connectTimeout; private final Duration connectTimeout;
private final Duration callTimeout; private final Duration callTimeout;
private final Duration streamTimeout;
private MxGatewayClientOptions(Builder builder) { private MxGatewayClientOptions(Builder builder) {
endpoint = requireText(builder.endpoint, "endpoint"); endpoint = requireText(builder.endpoint, "endpoint");
@@ -24,6 +25,7 @@ public final class MxGatewayClientOptions {
serverNameOverride = builder.serverNameOverride == null ? "" : builder.serverNameOverride; serverNameOverride = builder.serverNameOverride == null ? "" : builder.serverNameOverride;
connectTimeout = builder.connectTimeout == null ? DEFAULT_CONNECT_TIMEOUT : builder.connectTimeout; connectTimeout = builder.connectTimeout == null ? DEFAULT_CONNECT_TIMEOUT : builder.connectTimeout;
callTimeout = builder.callTimeout == null ? DEFAULT_CALL_TIMEOUT : builder.callTimeout; callTimeout = builder.callTimeout == null ? DEFAULT_CALL_TIMEOUT : builder.callTimeout;
streamTimeout = builder.streamTimeout;
} }
public static Builder builder() { public static Builder builder() {
@@ -62,6 +64,10 @@ public final class MxGatewayClientOptions {
return callTimeout; return callTimeout;
} }
/**
 * Returns the configured stream timeout.
 *
 * @return the stream timeout, or {@code null} when none was set on the builder (the
 *     constructor copies the builder value without substituting a default)
 */
public Duration streamTimeout() {
return streamTimeout;
}
@Override @Override
public String toString() { public String toString() {
return "MxGatewayClientOptions{" return "MxGatewayClientOptions{"
@@ -82,6 +88,8 @@ public final class MxGatewayClientOptions {
+ connectTimeout + connectTimeout
+ ", callTimeout=" + ", callTimeout="
+ callTimeout + callTimeout
+ ", streamTimeout="
+ streamTimeout
+ '}'; + '}';
} }
@@ -100,6 +108,7 @@ public final class MxGatewayClientOptions {
private String serverNameOverride; private String serverNameOverride;
private Duration connectTimeout; private Duration connectTimeout;
private Duration callTimeout; private Duration callTimeout;
private Duration streamTimeout;
private Builder() { private Builder() {
} }
@@ -139,6 +148,11 @@ public final class MxGatewayClientOptions {
return this; return this;
} }
/**
 * Sets the stream timeout.
 *
 * @param value the timeout to use; must not be {@code null}
 * @return this builder, for chaining
 * @throws NullPointerException if {@code value} is {@code null}
 */
public Builder streamTimeout(Duration value) {
streamTimeout = Objects.requireNonNull(value, "streamTimeout");
return this;
}
public MxGatewayClientOptions build() { public MxGatewayClientOptions build() {
return new MxGatewayClientOptions(this); return new MxGatewayClientOptions(this);
} }
@@ -4,17 +4,22 @@ import io.grpc.stub.ClientCallStreamObserver;
import io.grpc.stub.ClientResponseObserver; import io.grpc.stub.ClientResponseObserver;
import io.grpc.stub.StreamObserver; import io.grpc.stub.StreamObserver;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.atomic.AtomicBoolean;
import mxaccess_gateway.v1.MxaccessGateway.MxEvent; import mxaccess_gateway.v1.MxaccessGateway.MxEvent;
import mxaccess_gateway.v1.MxaccessGateway.StreamEventsRequest; import mxaccess_gateway.v1.MxaccessGateway.StreamEventsRequest;
public final class MxGatewayEventSubscription implements AutoCloseable { public final class MxGatewayEventSubscription implements AutoCloseable {
private final AtomicReference<ClientCallStreamObserver<StreamEventsRequest>> requestStream = new AtomicReference<>(); private final AtomicReference<ClientCallStreamObserver<StreamEventsRequest>> requestStream = new AtomicReference<>();
private final AtomicBoolean cancelled = new AtomicBoolean();
ClientResponseObserver<StreamEventsRequest, MxEvent> wrap(StreamObserver<MxEvent> observer) { ClientResponseObserver<StreamEventsRequest, MxEvent> wrap(StreamObserver<MxEvent> observer) {
return new ClientResponseObserver<>() { return new ClientResponseObserver<>() {
@Override @Override
public void beforeStart(ClientCallStreamObserver<StreamEventsRequest> stream) { public void beforeStart(ClientCallStreamObserver<StreamEventsRequest> stream) {
requestStream.set(stream); requestStream.set(stream);
if (cancelled.get()) {
stream.cancel("client cancelled event stream", null);
}
} }
@Override @Override
@@ -35,6 +40,7 @@ public final class MxGatewayEventSubscription implements AutoCloseable {
} }
public void cancel() { public void cancel() {
cancelled.set(true);
ClientCallStreamObserver<StreamEventsRequest> stream = requestStream.get(); ClientCallStreamObserver<StreamEventsRequest> stream = requestStream.get();
if (stream != null) { if (stream != null) {
stream.cancel("client cancelled event stream", null); stream.cancel("client cancelled event stream", null);
+15 -9
View File
@@ -74,9 +74,9 @@ class GatewayClient:
if self._closed: if self._closed:
return return
self._closed = True
if self._channel is not None: if self._channel is not None:
await self._channel.close() await self._channel.close()
self._closed = True
async def open_session( async def open_session(
self, self,
@@ -124,10 +124,10 @@ class GatewayClient:
) -> AsyncIterator[pb.MxEvent]: ) -> AsyncIterator[pb.MxEvent]:
"""Return an async event iterator and cancel the stream when iteration stops.""" """Return an async event iterator and cancel the stream when iteration stops."""
call = self.raw_stub.StreamEvents( kwargs: dict[str, Any] = {"metadata": merge_metadata(self.options.api_key, metadata)}
request, if self.options.stream_timeout is not None:
metadata=merge_metadata(self.options.api_key, metadata), kwargs["timeout"] = self.options.stream_timeout
) call = self.raw_stub.StreamEvents(request, **kwargs)
return _canceling_iterator(call) return _canceling_iterator(call)
async def _unary( async def _unary(
@@ -138,10 +138,16 @@ class GatewayClient:
*, *,
metadata: Sequence[tuple[str, str]] | None = None, metadata: Sequence[tuple[str, str]] | None = None,
) -> Any: ) -> Any:
call = method( kwargs: dict[str, Any] = {"metadata": merge_metadata(self.options.api_key, metadata)}
request, if self.options.call_timeout is not None:
metadata=merge_metadata(self.options.api_key, metadata), kwargs["timeout"] = self.options.call_timeout
) try:
call = method(request, **kwargs)
except TypeError as error:
if "timeout" not in kwargs or "unexpected keyword argument 'timeout'" not in str(error):
raise
kwargs.pop("timeout")
call = method(request, **kwargs)
try: try:
return await call return await call
except asyncio.CancelledError: except asyncio.CancelledError:
+9 -1
View File
@@ -19,6 +19,8 @@ class ClientOptions:
plaintext: bool = False plaintext: bool = False
ca_file: str | None = None ca_file: str | None = None
server_name_override: str | None = None server_name_override: str | None = None
call_timeout: float | None = 30.0
stream_timeout: float | None = None
def __post_init__(self) -> None: def __post_init__(self) -> None:
if not self.endpoint: if not self.endpoint:
@@ -26,6 +28,10 @@ class ClientOptions:
if self.plaintext and self.ca_file: if self.plaintext and self.ca_file:
raise ValueError("ca_file cannot be used with plaintext connections") raise ValueError("ca_file cannot be used with plaintext connections")
if self.call_timeout is not None and self.call_timeout <= 0:
raise ValueError("call_timeout must be greater than zero")
if self.stream_timeout is not None and self.stream_timeout <= 0:
raise ValueError("stream_timeout must be greater than zero")
def __repr__(self) -> str: def __repr__(self) -> str:
api_key = REDACTED if self.api_key else None api_key = REDACTED if self.api_key else None
@@ -33,7 +39,9 @@ class ClientOptions:
f"{type(self).__name__}(endpoint={self.endpoint!r}, " f"{type(self).__name__}(endpoint={self.endpoint!r}, "
f"api_key={api_key!r}, plaintext={self.plaintext!r}, " f"api_key={api_key!r}, plaintext={self.plaintext!r}, "
f"ca_file={self.ca_file!r}, " f"ca_file={self.ca_file!r}, "
f"server_name_override={self.server_name_override!r})" f"server_name_override={self.server_name_override!r}, "
f"call_timeout={self.call_timeout!r}, "
f"stream_timeout={self.stream_timeout!r})"
) )
+16 -2
View File
@@ -8,6 +8,8 @@ from .errors import ensure_mxaccess_success
from .generated import mxaccess_gateway_pb2 as pb from .generated import mxaccess_gateway_pb2 as pb
from .values import MxValueInput, to_mx_value from .values import MxValueInput, to_mx_value
MAX_BULK_ITEMS = 1000
class Session: class Session:
"""A single gateway-backed MXAccess session.""" """A single gateway-backed MXAccess session."""
@@ -40,13 +42,14 @@ class Session:
protocol_status=pb.ProtocolStatus(code=pb.PROTOCOL_STATUS_CODE_OK), protocol_status=pb.ProtocolStatus(code=pb.PROTOCOL_STATUS_CODE_OK),
) )
self._closed = True reply = await self.client.close_session_raw(
return await self.client.close_session_raw(
pb.CloseSessionRequest( pb.CloseSessionRequest(
session_id=self.session_id, session_id=self.session_id,
client_correlation_id=client_correlation_id, client_correlation_id=client_correlation_id,
), ),
) )
self._closed = True
return reply
async def invoke(self, command: pb.MxCommand, *, correlation_id: str = "") -> pb.MxCommandReply: async def invoke(self, command: pb.MxCommand, *, correlation_id: str = "") -> pb.MxCommandReply:
"""Invoke a raw command and enforce gateway and MXAccess success.""" """Invoke a raw command and enforce gateway and MXAccess success."""
@@ -192,6 +195,7 @@ class Session:
) -> list[pb.SubscribeResult]: ) -> list[pb.SubscribeResult]:
if tag_addresses is None: if tag_addresses is None:
raise TypeError("tag_addresses is required") raise TypeError("tag_addresses is required")
_ensure_bulk_size("tag_addresses", len(tag_addresses))
reply = await self.invoke( reply = await self.invoke(
pb.MxCommand( pb.MxCommand(
kind=pb.MX_COMMAND_KIND_ADD_ITEM_BULK, kind=pb.MX_COMMAND_KIND_ADD_ITEM_BULK,
@@ -213,6 +217,7 @@ class Session:
) -> list[pb.SubscribeResult]: ) -> list[pb.SubscribeResult]:
if item_handles is None: if item_handles is None:
raise TypeError("item_handles is required") raise TypeError("item_handles is required")
_ensure_bulk_size("item_handles", len(item_handles))
reply = await self.invoke( reply = await self.invoke(
pb.MxCommand( pb.MxCommand(
kind=pb.MX_COMMAND_KIND_ADVISE_ITEM_BULK, kind=pb.MX_COMMAND_KIND_ADVISE_ITEM_BULK,
@@ -234,6 +239,7 @@ class Session:
) -> list[pb.SubscribeResult]: ) -> list[pb.SubscribeResult]:
if item_handles is None: if item_handles is None:
raise TypeError("item_handles is required") raise TypeError("item_handles is required")
_ensure_bulk_size("item_handles", len(item_handles))
reply = await self.invoke( reply = await self.invoke(
pb.MxCommand( pb.MxCommand(
kind=pb.MX_COMMAND_KIND_REMOVE_ITEM_BULK, kind=pb.MX_COMMAND_KIND_REMOVE_ITEM_BULK,
@@ -255,6 +261,7 @@ class Session:
) -> list[pb.SubscribeResult]: ) -> list[pb.SubscribeResult]:
if item_handles is None: if item_handles is None:
raise TypeError("item_handles is required") raise TypeError("item_handles is required")
_ensure_bulk_size("item_handles", len(item_handles))
reply = await self.invoke( reply = await self.invoke(
pb.MxCommand( pb.MxCommand(
kind=pb.MX_COMMAND_KIND_UN_ADVISE_ITEM_BULK, kind=pb.MX_COMMAND_KIND_UN_ADVISE_ITEM_BULK,
@@ -276,6 +283,7 @@ class Session:
) -> list[pb.SubscribeResult]: ) -> list[pb.SubscribeResult]:
if tag_addresses is None: if tag_addresses is None:
raise TypeError("tag_addresses is required") raise TypeError("tag_addresses is required")
_ensure_bulk_size("tag_addresses", len(tag_addresses))
reply = await self.invoke( reply = await self.invoke(
pb.MxCommand( pb.MxCommand(
kind=pb.MX_COMMAND_KIND_SUBSCRIBE_BULK, kind=pb.MX_COMMAND_KIND_SUBSCRIBE_BULK,
@@ -297,6 +305,7 @@ class Session:
) -> list[pb.SubscribeResult]: ) -> list[pb.SubscribeResult]:
if item_handles is None: if item_handles is None:
raise TypeError("item_handles is required") raise TypeError("item_handles is required")
_ensure_bulk_size("item_handles", len(item_handles))
reply = await self.invoke( reply = await self.invoke(
pb.MxCommand( pb.MxCommand(
kind=pb.MX_COMMAND_KIND_UNSUBSCRIBE_BULK, kind=pb.MX_COMMAND_KIND_UNSUBSCRIBE_BULK,
@@ -368,4 +377,9 @@ class Session:
) )
def _ensure_bulk_size(name: str, count: int) -> None:
    """Reject a bulk request that carries more than ``MAX_BULK_ITEMS`` entries.

    Args:
        name: Name of the offending argument; used as the prefix of the error text.
        count: Number of items the caller supplied.

    Raises:
        ValueError: If ``count`` is greater than ``MAX_BULK_ITEMS``.
    """
    # Counts at or below the limit are accepted without further checks.
    if count <= MAX_BULK_ITEMS:
        return
    raise ValueError(f"{name} bulk commands are limited to {MAX_BULK_ITEMS} item(s)")
from .client import GatewayClient # noqa: E402 from .client import GatewayClient # noqa: E402
@@ -20,6 +20,8 @@ from mxgateway.generated import mxaccess_gateway_pb2 as pb
from mxgateway.options import ClientOptions from mxgateway.options import ClientOptions
from mxgateway.values import MxValueInput from mxgateway.values import MxValueInput
MAX_AGGREGATE_EVENTS = 10_000
@click.group() @click.group()
def main() -> None: def main() -> None:
@@ -55,6 +57,8 @@ def gateway_options(command: Callable[..., Any]) -> Callable[..., Any]:
default=None, default=None,
help="TLS server name override for test environments.", help="TLS server name override for test environments.",
)(command) )(command)
command = click.option("--call-timeout", default=30.0, type=float, show_default=True)(command)
command = click.option("--stream-timeout", default=None, type=float)(command)
return command return command
@@ -352,6 +356,8 @@ async def _connect(kwargs: dict[str, Any]) -> GatewayClient:
plaintext=_use_plaintext(kwargs), plaintext=_use_plaintext(kwargs),
ca_file=kwargs.get("ca_file"), ca_file=kwargs.get("ca_file"),
server_name_override=kwargs.get("server_name_override"), server_name_override=kwargs.get("server_name_override"),
call_timeout=kwargs.get("call_timeout"),
stream_timeout=kwargs.get("stream_timeout"),
), ),
) )
@@ -416,6 +422,12 @@ async def _collect_events(
max_events: int, max_events: int,
timeout: float, timeout: float,
) -> list[pb.MxEvent]: ) -> list[pb.MxEvent]:
if max_events > MAX_AGGREGATE_EVENTS:
raise click.BadParameter(
f"must be less than or equal to {MAX_AGGREGATE_EVENTS}",
param_hint="--max-events",
)
collected: list[pb.MxEvent] = [] collected: list[pb.MxEvent] = []
iterator = events.__aiter__() iterator = events.__aiter__()
try: try:
@@ -423,6 +435,8 @@ async def _collect_events(
collected.append(await asyncio.wait_for(iterator.__anext__(), timeout=timeout)) collected.append(await asyncio.wait_for(iterator.__anext__(), timeout=timeout))
except StopAsyncIteration: except StopAsyncIteration:
pass pass
except asyncio.TimeoutError:
pass
finally: finally:
close = getattr(iterator, "aclose", None) close = getattr(iterator, "aclose", None)
if close is not None: if close is not None:
+30 -7
View File
@@ -16,6 +16,8 @@ use mxgateway_client::{
use serde_json::json; use serde_json::json;
use serde_json::Value; use serde_json::Value;
const MAX_AGGREGATE_EVENTS: usize = 10_000;
#[derive(Debug, Parser)] #[derive(Debug, Parser)]
#[command(name = "mxgw")] #[command(name = "mxgw")]
#[command(about = "MXAccess Gateway Rust test CLI")] #[command(about = "MXAccess Gateway Rust test CLI")]
@@ -29,6 +31,8 @@ enum Command {
Version { Version {
#[arg(long)] #[arg(long)]
json: bool, json: bool,
#[arg(long)]
jsonl: bool,
}, },
Ping { Ping {
#[command(flatten)] #[command(flatten)]
@@ -325,7 +329,15 @@ async fn run(cli: Cli) -> Result<(), Error> {
after_worker_sequence, after_worker_sequence,
max_events, max_events,
json, json,
jsonl,
} => { } => {
if max_events > MAX_AGGREGATE_EVENTS {
return Err(Error::InvalidArgument {
name: "max-events".to_owned(),
detail: format!("must be less than or equal to {MAX_AGGREGATE_EVENTS}"),
});
}
let client = connect(connection).await?; let client = connect(connection).await?;
let mut stream = client let mut stream = client
.stream_events(StreamEventsRequest { .stream_events(StreamEventsRequest {
@@ -334,19 +346,30 @@ async fn run(cli: Cli) -> Result<(), Error> {
}) })
.await?; .await?;
let mut events = Vec::new(); let mut events = Vec::new();
while events.len() < max_events { let mut event_count = 0usize;
while event_count < max_events {
let Some(event) = stream.next().await else { let Some(event) = stream.next().await else {
break; break;
}; };
events.push(event?); let event = event?;
} event_count += 1;
if json { if jsonl {
println!("{}", json!({ "eventCount": events.len() })); println!(
} else { "{}",
for event in events { json!({
"workerSequence": event.worker_sequence,
"family": event.family,
})
);
} else if json {
events.push(event);
} else {
println!("{} {}", event.worker_sequence, event.family); println!("{} {}", event.worker_sequence, event.family);
} }
} }
if json {
println!("{}", json!({ "eventCount": event_count }));
}
} }
Command::Write { Command::Write {
connection, connection,
+14 -2
View File
@@ -5,7 +5,7 @@ use tonic::transport::{Certificate, Channel, ClientTlsConfig};
use tonic::Request; use tonic::Request;
use crate::auth::AuthInterceptor; use crate::auth::AuthInterceptor;
use crate::error::{ensure_command_success, Error}; use crate::error::{ensure_command_success, ensure_protocol_success, Error};
use crate::generated::mxaccess_gateway::v1::mx_access_gateway_client::MxAccessGatewayClient; use crate::generated::mxaccess_gateway::v1::mx_access_gateway_client::MxAccessGatewayClient;
use crate::generated::mxaccess_gateway::v1::{ use crate::generated::mxaccess_gateway::v1::{
CloseSessionReply, CloseSessionRequest, MxCommandReply, MxCommandRequest, MxEvent, CloseSessionReply, CloseSessionRequest, MxCommandReply, MxCommandRequest, MxEvent,
@@ -23,6 +23,7 @@ pub type EventStream =
pub struct GatewayClient { pub struct GatewayClient {
inner: RawGatewayClient, inner: RawGatewayClient,
call_timeout: std::time::Duration, call_timeout: std::time::Duration,
stream_timeout: Option<std::time::Duration>,
} }
impl GatewayClient { impl GatewayClient {
@@ -57,6 +58,7 @@ impl GatewayClient {
Ok(Self { Ok(Self {
inner: MxAccessGatewayClient::with_interceptor(channel, interceptor), inner: MxAccessGatewayClient::with_interceptor(channel, interceptor),
call_timeout: options.call_timeout(), call_timeout: options.call_timeout(),
stream_timeout: options.stream_timeout(),
}) })
} }
@@ -83,6 +85,7 @@ impl GatewayClient {
pub async fn open_session(&self, request: OpenSessionRequest) -> Result<Session, Error> { pub async fn open_session(&self, request: OpenSessionRequest) -> Result<Session, Error> {
let reply = self.open_session_raw(request).await?; let reply = self.open_session_raw(request).await?;
ensure_protocol_success("open session", reply.protocol_status.as_ref())?;
Ok(Session::new(reply.session_id, self.clone())) Ok(Session::new(reply.session_id, self.clone()))
} }
@@ -107,7 +110,7 @@ impl GatewayClient {
pub async fn stream_events(&self, request: StreamEventsRequest) -> Result<EventStream, Error> { pub async fn stream_events(&self, request: StreamEventsRequest) -> Result<EventStream, Error> {
let mut client = self.inner.clone(); let mut client = self.inner.clone();
let response = client.stream_events(self.unary_request(request)).await?; let response = client.stream_events(self.stream_request(request)).await?;
let stream = futures_util::StreamExt::map(response.into_inner(), |result| { let stream = futures_util::StreamExt::map(response.into_inner(), |result| {
result.map_err(Error::from) result.map_err(Error::from)
}); });
@@ -120,4 +123,13 @@ impl GatewayClient {
request.set_timeout(self.call_timeout); request.set_timeout(self.call_timeout);
request request
} }
/// Wraps `message` in a `Request` for a streaming call.
///
/// The request timeout is set only when `self.stream_timeout` holds a value;
/// otherwise this method sets no timeout on the request.
fn stream_request<T>(&self, message: T) -> Request<T> {
    let mut outgoing = Request::new(message);
    match self.stream_timeout {
        Some(limit) => outgoing.set_timeout(limit),
        None => {}
    }
    outgoing
}
} }
+29 -1
View File
@@ -1,7 +1,7 @@
use thiserror::Error as ThisError; use thiserror::Error as ThisError;
use tonic::Code; use tonic::Code;
use crate::generated::mxaccess_gateway::v1::{MxCommandReply, ProtocolStatusCode}; use crate::generated::mxaccess_gateway::v1::{MxCommandReply, ProtocolStatus, ProtocolStatusCode};
#[derive(Debug, ThisError)] #[derive(Debug, ThisError)]
pub enum Error { pub enum Error {
@@ -47,6 +47,13 @@ pub enum Error {
#[error("gateway command failed: {0}")] #[error("gateway command failed: {0}")]
Command(#[from] Box<CommandError>), Command(#[from] Box<CommandError>),
#[error("gateway {operation} failed: {code:?}: {message}")]
ProtocolStatus {
operation: &'static str,
code: ProtocolStatusCode,
message: String,
},
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
@@ -125,6 +132,27 @@ pub fn ensure_command_success(reply: MxCommandReply) -> Result<MxCommandReply, E
} }
} }
/// Converts a reply's protocol status into a `Result`.
///
/// Returns `Ok(())` only when `status` is present and its code decodes to
/// `ProtocolStatusCode::Ok`. A missing status or an undecodable code is
/// reported as `ProtocolStatusCode::Unspecified`.
///
/// # Errors
///
/// Returns `Error::ProtocolStatus` carrying `operation`, the resolved code, and
/// the status message (empty when `status` is `None`) for any non-`Ok` code.
pub fn ensure_protocol_success(
    operation: &'static str,
    status: Option<&ProtocolStatus>,
) -> Result<(), Error> {
    let code = match status {
        Some(reported) => ProtocolStatusCode::try_from(reported.code)
            .unwrap_or(ProtocolStatusCode::Unspecified),
        None => ProtocolStatusCode::Unspecified,
    };

    if code == ProtocolStatusCode::Ok {
        return Ok(());
    }

    let message = match status {
        Some(reported) => reported.message.clone(),
        None => String::new(),
    };
    Err(Error::ProtocolStatus {
        operation,
        code,
        message,
    })
}
fn redact_credentials(message: &str) -> String { fn redact_credentials(message: &str) -> String {
message message
.split_whitespace() .split_whitespace()
+12
View File
@@ -13,6 +13,7 @@ pub struct ClientOptions {
server_name_override: Option<String>, server_name_override: Option<String>,
connect_timeout: Duration, connect_timeout: Duration,
call_timeout: Duration, call_timeout: Duration,
stream_timeout: Option<Duration>,
} }
impl ClientOptions { impl ClientOptions {
@@ -25,6 +26,7 @@ impl ClientOptions {
server_name_override: None, server_name_override: None,
connect_timeout: Duration::from_secs(10), connect_timeout: Duration::from_secs(10),
call_timeout: Duration::from_secs(30), call_timeout: Duration::from_secs(30),
stream_timeout: None,
} }
} }
@@ -58,6 +60,11 @@ impl ClientOptions {
self self
} }
pub fn with_stream_timeout(mut self, stream_timeout: Duration) -> Self {
self.stream_timeout = Some(stream_timeout);
self
}
pub fn endpoint(&self) -> &str { pub fn endpoint(&self) -> &str {
&self.endpoint &self.endpoint
} }
@@ -85,6 +92,10 @@ impl ClientOptions {
pub fn call_timeout(&self) -> Duration { pub fn call_timeout(&self) -> Duration {
self.call_timeout self.call_timeout
} }
/// Returns the configured stream timeout, or `None` when no stream timeout
/// has been set on these options.
pub fn stream_timeout(&self) -> Option<Duration> {
self.stream_timeout
}
} }
impl Default for ClientOptions { impl Default for ClientOptions {
@@ -104,6 +115,7 @@ impl fmt::Debug for ClientOptions {
.field("server_name_override", &self.server_name_override) .field("server_name_override", &self.server_name_override)
.field("connect_timeout", &self.connect_timeout) .field("connect_timeout", &self.connect_timeout)
.field("call_timeout", &self.call_timeout) .field("call_timeout", &self.call_timeout)
.field("stream_timeout", &self.stream_timeout)
.finish() .finish()
} }
} }
+23 -2
View File
@@ -1,5 +1,5 @@
use crate::client::{EventStream, GatewayClient}; use crate::client::{EventStream, GatewayClient};
use crate::error::Error; use crate::error::{ensure_protocol_success, Error};
use crate::generated::mxaccess_gateway::v1::mx_command::Payload; use crate::generated::mxaccess_gateway::v1::mx_command::Payload;
use crate::generated::mxaccess_gateway::v1::mx_command_reply; use crate::generated::mxaccess_gateway::v1::mx_command_reply;
use crate::generated::mxaccess_gateway::v1::{ use crate::generated::mxaccess_gateway::v1::{
@@ -11,6 +11,8 @@ use crate::generated::mxaccess_gateway::v1::{
}; };
use crate::value::MxValue; use crate::value::MxValue;
const MAX_BULK_ITEMS: usize = 1_000;
/// Session identifier returned by the gateway. /// Session identifier returned by the gateway.
#[derive(Clone)] #[derive(Clone)]
pub struct Session { pub struct Session {
@@ -40,12 +42,14 @@ impl Session {
} }
pub async fn close(&self) -> Result<(), Error> { pub async fn close(&self) -> Result<(), Error> {
self.client let reply = self
.client
.close_session_raw(CloseSessionRequest { .close_session_raw(CloseSessionRequest {
session_id: self.id.clone(), session_id: self.id.clone(),
client_correlation_id: "rust-client-close-session".to_owned(), client_correlation_id: "rust-client-close-session".to_owned(),
}) })
.await?; .await?;
ensure_protocol_success("close session", reply.protocol_status.as_ref())?;
Ok(()) Ok(())
} }
@@ -137,6 +141,7 @@ impl Session {
server_handle: i32, server_handle: i32,
tag_addresses: Vec<String>, tag_addresses: Vec<String>,
) -> Result<Vec<SubscribeResult>, Error> { ) -> Result<Vec<SubscribeResult>, Error> {
ensure_bulk_size("tag_addresses", tag_addresses.len())?;
let reply = self let reply = self
.invoke( .invoke(
MxCommandKind::AddItemBulk, MxCommandKind::AddItemBulk,
@@ -155,6 +160,7 @@ impl Session {
server_handle: i32, server_handle: i32,
item_handles: Vec<i32>, item_handles: Vec<i32>,
) -> Result<Vec<SubscribeResult>, Error> { ) -> Result<Vec<SubscribeResult>, Error> {
ensure_bulk_size("item_handles", item_handles.len())?;
let reply = self let reply = self
.invoke( .invoke(
MxCommandKind::AdviseItemBulk, MxCommandKind::AdviseItemBulk,
@@ -173,6 +179,7 @@ impl Session {
server_handle: i32, server_handle: i32,
item_handles: Vec<i32>, item_handles: Vec<i32>,
) -> Result<Vec<SubscribeResult>, Error> { ) -> Result<Vec<SubscribeResult>, Error> {
ensure_bulk_size("item_handles", item_handles.len())?;
let reply = self let reply = self
.invoke( .invoke(
MxCommandKind::RemoveItemBulk, MxCommandKind::RemoveItemBulk,
@@ -191,6 +198,7 @@ impl Session {
server_handle: i32, server_handle: i32,
item_handles: Vec<i32>, item_handles: Vec<i32>,
) -> Result<Vec<SubscribeResult>, Error> { ) -> Result<Vec<SubscribeResult>, Error> {
ensure_bulk_size("item_handles", item_handles.len())?;
let reply = self let reply = self
.invoke( .invoke(
MxCommandKind::UnAdviseItemBulk, MxCommandKind::UnAdviseItemBulk,
@@ -209,6 +217,7 @@ impl Session {
server_handle: i32, server_handle: i32,
tag_addresses: Vec<String>, tag_addresses: Vec<String>,
) -> Result<Vec<SubscribeResult>, Error> { ) -> Result<Vec<SubscribeResult>, Error> {
ensure_bulk_size("tag_addresses", tag_addresses.len())?;
let reply = self let reply = self
.invoke( .invoke(
MxCommandKind::SubscribeBulk, MxCommandKind::SubscribeBulk,
@@ -227,6 +236,7 @@ impl Session {
server_handle: i32, server_handle: i32,
item_handles: Vec<i32>, item_handles: Vec<i32>,
) -> Result<Vec<SubscribeResult>, Error> { ) -> Result<Vec<SubscribeResult>, Error> {
ensure_bulk_size("item_handles", item_handles.len())?;
let reply = self let reply = self
.invoke( .invoke(
MxCommandKind::UnsubscribeBulk, MxCommandKind::UnsubscribeBulk,
@@ -327,6 +337,17 @@ impl Session {
} }
} }
/// Rejects a bulk request that carries more than `MAX_BULK_ITEMS` entries.
///
/// `name` identifies the offending argument in the returned error and `len`
/// is the number of items the caller supplied.
///
/// # Errors
///
/// Returns `Error::InvalidArgument` when `len` is greater than `MAX_BULK_ITEMS`.
fn ensure_bulk_size(name: &'static str, len: usize) -> Result<(), Error> {
    // Lengths at or below the limit pass straight through.
    if len <= MAX_BULK_ITEMS {
        return Ok(());
    }

    let detail = format!("bulk commands are limited to {MAX_BULK_ITEMS} item(s)");
    Err(Error::InvalidArgument {
        name: name.to_owned(),
        detail,
    })
}
fn register_server_handle(reply: &MxCommandReply) -> i32 { fn register_server_handle(reply: &MxCommandReply) -> i32 {
match reply.payload.as_ref() { match reply.payload.as_ref() {
Some(mx_command_reply::Payload::Register(register)) => register.server_handle, Some(mx_command_reply::Payload::Register(register)) => register.server_handle,
+294
View File
@@ -0,0 +1,294 @@
# Dashboard Interface Design
This guide describes the visual and interaction patterns used by the MXAccess
Gateway dashboard so the same interface style can be reused in other
operations-focused projects.
## Design Goal
The dashboard is an operational interface, not a landing page. It prioritizes
fast scanning, low visual noise, and stable layouts while live data changes.
The design uses Bootstrap for common behavior and a small local stylesheet for
project identity, spacing, and status presentation.
Use this style for applications where users repeatedly check system state,
compare rows, inspect details, and diagnose faults. Avoid promotional layouts,
large hero areas, decorative imagery, or oversized cards that reduce data
density.
## Visual Language
The interface uses a quiet, work-focused visual system:
- A light gray page background separates the application shell from white data
surfaces.
- White cards and sections carry the actual operational content.
- Borders define structure more often than shadows.
- Accent color is reserved for metric values and important numeric signals.
- Bootstrap status badges provide state color without custom status art.
- Tables remain compact and responsive so long identifiers and timestamps stay
readable.
The resulting page should look like a control surface: restrained, predictable,
and dense enough for repeated use.
## Layout Structure
Every page follows the same structure:
1. A top navigation bar with the product or service name on the left.
2. A full-width `container-fluid` content area.
3. A page header with the page title, short context text, and optional status
badge.
4. Metric cards when a page has top-level numeric state.
5. Bordered content sections for tables, details, faults, or empty states.
The shell does not use a sidebar. A horizontal navigation bar is enough for the
current page count and keeps the content width available for tables.
```html
<div class="dashboard-shell">
<nav class="navbar navbar-expand-lg bg-body border-bottom dashboard-navbar">
<!-- brand, page links, sign-out action -->
</nav>
<main class="container-fluid dashboard-content">
<!-- page header, metric grid, sections -->
</main>
</div>
```
## Color Tokens
Use a small token set and let Bootstrap provide the rest. The current dashboard
uses these local tokens:
```css
:root {
--mxgw-surface: #f7f8fa;
--mxgw-border: #d8dee6;
--mxgw-ink-muted: #667085;
--mxgw-accent: #146c64;
}
```
| Token | Purpose |
|-------|---------|
| `--mxgw-surface` | Page background behind all content. |
| `--mxgw-border` | Borders on cards, tables, sections, and empty states. |
| `--mxgw-ink-muted` | Secondary labels, details, and empty-state text. |
| `--mxgw-accent` | Metric values and important numeric summaries. |
Keep the palette small. Add new colors only when they encode state or improve
readability. Prefer Bootstrap badge classes for states such as ready, closing,
closed, and faulted.
## Typography
Typography stays compact and consistent:
- Page headings use `1.35rem`, weight `650`, and normal letter spacing.
- Section headings use the same size as page headings when they introduce a
table or details group.
- Metric labels use uppercase text at `.78rem` and weight `650`.
- Metric values use `1.7rem`, weight `700`, and the accent color.
- Body and table text inherit Bootstrap defaults for readability.
Do not scale text with viewport width. Long values use
`overflow-wrap: anywhere` so session IDs, paths, and fault messages do not
break the layout.
## Spacing And Shape
The dashboard uses modest spacing:
- Page content has `1.25rem` padding on desktop and `.75rem` on small screens.
- Metric grids use `.75rem` gaps.
- Content sections start with a top border and `1rem` top padding.
- Cards and empty states use Bootstrap's default border radius, `.375rem`.
- Metric cards have no shadow.
This keeps information grouped without turning each section into a decorative
panel. Use cards for repeated metric summaries, login forms, and individual
items. Use unframed sections with a top border for page-level groups.
## Navigation
Navigation is a Bootstrap responsive navbar. It includes:
- Brand text for the service name.
- Short page labels: `Overview`, `Sessions`, `Workers`, `Events`, `Settings`.
- Active route styling through `NavLink`.
- A right-aligned sign-out button when authentication is enabled.
Keep navigation labels short. Operational users should be able to predict what
each page contains without reading explanatory copy.
## Page Headers
Each page starts with a `dashboard-page-header`:
- The title is the primary anchor.
- A single secondary line gives timestamp, row count, or configuration context.
- A status badge appears on the right when the page has an overall state.
On narrow screens, the header stacks vertically. This prevents long context
text or status badges from overlapping the title.
```html
<div class="dashboard-page-header">
<div>
<h1>Overview</h1>
<div class="text-secondary">Generated 2026-04-27 17:30:00</div>
</div>
<span class="badge text-bg-success">Healthy</span>
</div>
```
## Metric Cards
Metric cards summarize numeric state at the top of overview and diagnostic
pages. They use Bootstrap cards with a local `metric-card` class:
- Label: uppercase, muted, compact.
- Value: large enough to scan, accent colored, wraps safely.
- Detail: optional muted text for version, rate context, or explanatory state.
Use auto-fit CSS grid tracks so the cards fill available width without custom
breakpoints:
```css
.metric-grid {
display: grid;
gap: .75rem;
grid-template-columns: repeat(auto-fit, minmax(12rem, 1fr));
}
.metric-grid.compact {
grid-template-columns: repeat(auto-fit, minmax(10rem, 1fr));
}
```
Metrics should be formatted before rendering. Counts use thousands separators,
durations use stable units, and missing values render as `-`.
## Tables
Tables are the main information surface. Use Bootstrap `table table-sm` with a
local `dashboard-table` class:
- `table-sm` keeps rows dense.
- `align-middle` improves status badge alignment.
- `table-responsive` wraps every table that can exceed the viewport.
- Header cells use weight `650` and no wrapping.
- Body cells allow wrapping so identifiers, paths, and messages stay visible.
- Detail tables reserve a fixed header width.
Use code formatting for machine identifiers such as session IDs, file paths,
and protocol values. Link rows only where navigation is useful; avoid making
entire rows clickable when a single identifier link is clearer.
## Status Badges
Status uses Bootstrap badge classes with a small mapping layer:
| State | Badge class |
|-------|-------------|
| `Ready`, `Healthy` | `text-bg-success` |
| `Creating`, `StartingWorker`, `WaitingForPipe`, `InitializingWorker`, `Closing` | `text-bg-info` |
| `Closed` | `text-bg-secondary` |
| `Faulted` | `text-bg-danger` |
| Unknown state | `text-bg-light text-dark border` |
Keep status text literal. Operators benefit from seeing the same state names
that appear in logs and APIs.
## Empty And Loading States
Empty states are explicit and quiet. They use a white background, dashed border,
small radius, muted text, and one sentence:
```html
<div class="empty-state">No worker processes are attached.</div>
```
Loading states use the same component shape. Avoid spinners for snapshot pages
that update on a timer; a stable text placeholder is less distracting.
## Detail Pages
Detail pages use stacked sections instead of nested cards:
- The page header identifies the selected entity.
- The first section shows entity metadata in a two-column details table.
- Additional sections show related runtime state, such as worker metadata.
- Missing entities render a single section with a concise not-found message.
This structure keeps details comparable across pages and avoids card nesting.
## Responsive Behavior
The dashboard uses one small-screen breakpoint:
```css
@media (max-width: 700px) {
.dashboard-content {
padding: .75rem;
}
.dashboard-page-header {
align-items: flex-start;
flex-direction: column;
}
.details-table th {
width: 9rem;
}
}
```
Do not hide important columns by default. Use horizontal table scrolling for
dense operational data, and reserve column hiding for data that is clearly
duplicative.
## Data Formatting
Use a small display helper instead of formatting inline in every component.
The helper should provide consistent rendering for:
- empty text as `-`,
- counts with thousands separators,
- dates and times in a consistent local or configured format,
- durations in stable units,
- metric lookup by name and dimension.
Centralizing formatting prevents visual drift between overview cards, tables,
and detail pages.
## Security And Redaction
The interface is read-only unless an explicit administrative action is
designed. It should not display secrets or raw credential-bearing values.
Apply redaction before values reach Razor components. The UI treats redacted
values as normal display text; it does not need to know why a value is hidden.
This keeps security policy in the dashboard projection layer rather than in
markup.
## Replication Checklist
Use this checklist when applying the design to another project:
- Define four local tokens: surface, border, muted ink, and accent.
- Use a Bootstrap top navbar with short route labels.
- Keep page content inside a full-width fluid container.
- Start every page with the same header structure.
- Put primary numeric state in `metric-grid` cards.
- Put detailed runtime state in compact responsive tables.
- Use status badges mapped from real domain states.
- Use dashed bordered empty states for loading and no-data cases.
- Use top-bordered sections for page groups instead of nested cards.
- Centralize formatting and redaction outside Razor markup.
- Keep the dashboard read-only until admin workflows have a separate design.
## Related Documentation
- [Gateway Dashboard Detailed Design](./gateway-dashboard-design.md)
+153
View File
@@ -0,0 +1,153 @@
# Gateway Configuration
This document describes every option bound under the `MxGateway` configuration
section by `GatewayOptions`.
The gateway binds configuration at startup and validates it with
`GatewayOptionsValidator`. Startup fails before the server listens when required
paths, timeouts, queue sizes, enum values, or protocol values are invalid.
## Configuration Shape
```json
{
"MxGateway": {
"Authentication": {
"Mode": "ApiKey",
"SqlitePath": "C:\\ProgramData\\MxGateway\\gateway-auth.db",
"PepperSecretName": "MxGateway:ApiKeyPepper",
"RunMigrationsOnStartup": true
},
"Worker": {
"ExecutablePath": "src\\MxGateway.Worker\\bin\\x86\\Release\\MxGateway.Worker.exe",
"WorkingDirectory": null,
"RequiredArchitecture": "X86",
"StartupTimeoutSeconds": 30,
"StartupProbeRetryAttempts": 3,
"StartupProbeRetryDelayMilliseconds": 250,
"PipeConnectAttemptTimeoutMilliseconds": 2000,
"ShutdownTimeoutSeconds": 10,
"HeartbeatIntervalSeconds": 5,
"HeartbeatGraceSeconds": 15,
"MaxMessageBytes": 16777216
},
"Sessions": {
"DefaultCommandTimeoutSeconds": 30,
"MaxSessions": 64,
"MaxPendingCommandsPerSession": 128,
"AllowMultipleEventSubscribers": false
},
"Events": {
"QueueCapacity": 10000,
"BackpressurePolicy": "FailFast"
},
"Dashboard": {
"Enabled": true,
"PathBase": "/dashboard",
"RequireAdminScope": true,
"AllowAnonymousLocalhost": true,
"SnapshotIntervalMilliseconds": 1000,
"RecentFaultLimit": 100,
"RecentSessionLimit": 200,
"ShowTagValues": false
},
"Protocol": {
"WorkerProtocolVersion": 1
}
}
}
```
Environment variables use the normal .NET double-underscore form. For example,
`MxGateway__Sessions__MaxSessions=20` overrides
`MxGateway:Sessions:MaxSessions`.
## Authentication Options
| Option | Default | Description |
|--------|---------|-------------|
| `MxGateway:Authentication:Mode` | `ApiKey` | Selects public gRPC authentication. Supported values are `ApiKey` and `Disabled`. `Disabled` bypasses API-key verification and is for local development only. |
| `MxGateway:Authentication:SqlitePath` | `C:\ProgramData\MxGateway\gateway-auth.db` | SQLite database path for API-key records and audit rows when API-key authentication is enabled. |
| `MxGateway:Authentication:PepperSecretName` | `MxGateway:ApiKeyPepper` | Configuration key used to read the HMAC pepper for API-key secret hashing. The dashboard effective configuration redacts this value. |
| `MxGateway:Authentication:RunMigrationsOnStartup` | `true` | Runs SQLite auth schema migrations at gateway startup when API-key authentication is enabled. |
When `Mode` is `ApiKey`, `SqlitePath` and `PepperSecretName` must be present.
`SqlitePath` must be a valid filesystem path.
## Worker Options
| Option | Default | Description |
|--------|---------|-------------|
| `MxGateway:Worker:ExecutablePath` | `src\MxGateway.Worker\bin\x86\Release\MxGateway.Worker.exe` | Path to the x86 worker executable launched for each gateway session. The path must be valid and point to a `.exe` file. |
| `MxGateway:Worker:WorkingDirectory` | `null` | Optional working directory for the worker process. When set, it must be a valid filesystem path. |
| `MxGateway:Worker:RequiredArchitecture` | `X86` | Required Portable Executable architecture for the worker. Supported values are `X86` and `X64`; MXAccess parity uses `X86`. |
| `MxGateway:Worker:StartupTimeoutSeconds` | `30` | Total startup budget for process launch, startup probe, pipe connect, handshake, and worker readiness. |
| `MxGateway:Worker:StartupProbeRetryAttempts` | `3` | Number of retry attempts for transient worker startup probe failures before pipe connection and handshake continue. |
| `MxGateway:Worker:StartupProbeRetryDelayMilliseconds` | `250` | Delay between transient startup probe retry attempts. |
| `MxGateway:Worker:PipeConnectAttemptTimeoutMilliseconds` | `2000` | Per-attempt timeout used by the worker named-pipe connect retry path. The overall pipe connection still stays under the startup budget. |
| `MxGateway:Worker:ShutdownTimeoutSeconds` | `10` | Grace period for worker shutdown before the gateway treats shutdown as failed and may kill the worker process tree. |
| `MxGateway:Worker:HeartbeatIntervalSeconds` | `5` | Interval at which the worker sends heartbeats. The gateway also uses this value as an input to its heartbeat check cadence. |
| `MxGateway:Worker:HeartbeatGraceSeconds` | `15` | Maximum age of the last worker heartbeat before the gateway faults the worker. This must be greater than or equal to `HeartbeatIntervalSeconds`. |
| `MxGateway:Worker:MaxMessageBytes` | `16777216` | Maximum worker IPC frame payload size in bytes. The validator allows values from `1024` through `268435456`. |
`StartupProbeRetryAttempts`, `StartupProbeRetryDelayMilliseconds`,
`PipeConnectAttemptTimeoutMilliseconds`, timeout values, heartbeat values, and
`MaxMessageBytes` must be positive. `MaxMessageBytes` is intentionally bounded
to avoid accidental large allocations from malformed or oversized frames.
## Session Options
| Option | Default | Description |
|--------|---------|-------------|
| `MxGateway:Sessions:DefaultCommandTimeoutSeconds` | `30` | Default timeout used while the gateway waits for a worker command reply when an open-session request does not provide a positive command timeout. |
| `MxGateway:Sessions:MaxSessions` | `64` | Maximum number of concurrently open gateway sessions. Session opens reserve a slot atomically before worker creation. |
| `MxGateway:Sessions:MaxPendingCommandsPerSession` | `128` | Maximum number of pending worker commands for one session. Excess commands fail fast instead of queueing indefinitely. |
| `MxGateway:Sessions:AllowMultipleEventSubscribers` | `false` | Controls whether multiple `StreamEvents` subscribers may attach to one session. `true` is rejected until event fan-out is implemented. |
All numeric session options must be greater than zero. The current event stream
implementation supports one active subscriber per session; this preserves event
ordering and avoids competing consumers.
## Event Options
| Option | Default | Description |
|--------|---------|-------------|
| `MxGateway:Events:QueueCapacity` | `10000` | Capacity for bounded per-session event queues used by the gateway worker event channel and the public gRPC event stream queue. |
| `MxGateway:Events:BackpressurePolicy` | `FailFast` | Event backpressure behavior. `FailFast` is the only supported value. |
`QueueCapacity` must be greater than zero. With `FailFast`, queue overflow
faults the affected worker or session instead of silently dropping MXAccess
events.
## Dashboard Options
| Option | Default | Description |
|--------|---------|-------------|
| `MxGateway:Dashboard:Enabled` | `true` | Enables Blazor Server dashboard route mapping. |
| `MxGateway:Dashboard:PathBase` | `/dashboard` | Base path for dashboard routes. When the dashboard is enabled, this value is required and must start with `/`. |
| `MxGateway:Dashboard:RequireAdminScope` | `true` | Requires API keys used for dashboard login to carry the `admin` scope. |
| `MxGateway:Dashboard:AllowAnonymousLocalhost` | `true` | Allows loopback dashboard requests to bypass the dashboard cookie requirement for local development. Remote requests still require dashboard authentication. |
| `MxGateway:Dashboard:SnapshotIntervalMilliseconds` | `1000` | Dashboard snapshot refresh interval used by realtime Blazor pages. |
| `MxGateway:Dashboard:RecentFaultLimit` | `100` | Maximum number of fault summaries projected into each dashboard snapshot. |
| `MxGateway:Dashboard:RecentSessionLimit` | `200` | Maximum number of session summaries projected into each dashboard snapshot. |
| `MxGateway:Dashboard:ShowTagValues` | `false` | Reserved display control for tag values. The dashboard does not show full tag values by default. |
`SnapshotIntervalMilliseconds` must be greater than zero. `RecentFaultLimit`
and `RecentSessionLimit` must be greater than or equal to zero.
## Protocol Options
| Option | Default | Description |
|--------|---------|-------------|
| `MxGateway:Protocol:WorkerProtocolVersion` | `1` | Worker IPC protocol version expected by the gateway and worker. This must match `GatewayContractInfo.WorkerProtocolVersion`. |
The protocol option is exposed for diagnostics and explicit deployment
configuration, not for compatibility negotiation. A mismatch fails validation
at startup.
## Related Documentation
- [Gateway Process Detailed Design](./gateway-process-design.md)
- [Gateway Dashboard Detailed Design](./gateway-dashboard-design.md)
- [Worker Process Launcher](./WorkerProcessLauncher.md)
- [Worker Frame Protocol](./WorkerFrameProtocol.md)
+4
View File
@@ -247,6 +247,10 @@ Each client should expose event streaming as the idiomatic streaming primitive:
Events must preserve gateway order. Libraries should not reorder, coalesce, or Events must preserve gateway order. Libraries should not reorder, coalesce, or
drop events by default. drop events by default.
Long-lived event streams do not inherit unary call deadlines. Clients apply the
default call timeout to unary operations only, and streams run until the caller
cancels them or an explicit stream timeout is configured.
The event surface must include: The event surface must include:
- `OnDataChange` - `OnDataChange`
+3
View File
@@ -336,6 +336,9 @@ Recommended visual language:
If charts are added later, prefer simple server-generated data tables first. Do If charts are added later, prefer simple server-generated data tables first. Do
not add a JavaScript charting dependency without a specific need. not add a JavaScript charting dependency without a specific need.
The reusable visual rules for replicating this interface in other projects are
documented in [Dashboard Interface Design](./DashboardInterfaceDesign.md).
## Testing ## Testing
Dashboard unit/component tests should cover: Dashboard unit/component tests should cover:
+21 -7
View File
@@ -361,13 +361,13 @@ worker startup, and removes the session if startup fails. A successful
`OpenSession` attaches the ready `IWorkerClient` and transitions the session to `OpenSession` attaches the ready `IWorkerClient` and transitions the session to
`Ready`. `Ready`.
Only `Ready` sessions accept command and event operations. `CloseSession` is Only `Ready` sessions accept command and event operations. `CloseSession` shuts
idempotent for sessions still known to the registry: the first close shuts down down the worker, disposes the worker client, and removes the session from the
the worker, and later closes return the final `Closed` state. Lease handling is registry so closed sessions do not retain pipe or process handles. A later close
exposed as a session hook so a monitor can close expired sessions without for the same id returns `SessionNotFound`. Lease handling is exposed as a
embedding lease policy in the worker client. Gateway shutdown walks the session hook so a monitor can close expired sessions without embedding lease
registry, closes each known session, and kills a worker if graceful shutdown policy in the worker client. Gateway shutdown walks the registry, closes each
fails. known session, and kills a worker if graceful shutdown fails.
## Worker Launch ## Worker Launch
@@ -813,6 +813,11 @@ It emits .NET `Meter` instruments for collectors and keeps a
the dashboard needs current counters and queue depths without depending on a the dashboard needs current counters and queue depths without depending on a
specific metrics exporter. specific metrics exporter.
Event metrics use low-cardinality tags such as event family. Per-session event
counts are kept only in the in-process snapshot for active dashboard sessions
and are purged when the session is removed. Worker event queue depth and gRPC
event stream queue depth are reported as separate gauges.
HTTP request handling uses `UseGatewayRequestLoggingScope()` to attach common HTTP request handling uses `UseGatewayRequestLoggingScope()` to attach common
structured log fields when request metadata is present: structured log fields when request metadata is present:
@@ -842,6 +847,8 @@ Suggested configuration shape:
}, },
"Worker": { "Worker": {
"ExecutablePath": "src/MxGateway.Worker/bin/x86/Release/MxGateway.Worker.exe", "ExecutablePath": "src/MxGateway.Worker/bin/x86/Release/MxGateway.Worker.exe",
"WorkingDirectory": null,
"RequiredArchitecture": "X86",
"StartupTimeoutSeconds": 30, "StartupTimeoutSeconds": 30,
"StartupProbeRetryAttempts": 3, "StartupProbeRetryAttempts": 3,
"StartupProbeRetryDelayMilliseconds": 250, "StartupProbeRetryDelayMilliseconds": 250,
@@ -854,6 +861,7 @@ Suggested configuration shape:
"Sessions": { "Sessions": {
"DefaultCommandTimeoutSeconds": 30, "DefaultCommandTimeoutSeconds": 30,
"MaxSessions": 64, "MaxSessions": 64,
"MaxPendingCommandsPerSession": 128,
"AllowMultipleEventSubscribers": false "AllowMultipleEventSubscribers": false
}, },
"Events": { "Events": {
@@ -869,6 +877,9 @@ Suggested configuration shape:
"RecentFaultLimit": 100, "RecentFaultLimit": 100,
"RecentSessionLimit": 200, "RecentSessionLimit": 200,
"ShowTagValues": false "ShowTagValues": false
},
"Protocol": {
"WorkerProtocolVersion": 1
} }
} }
} }
@@ -888,6 +899,9 @@ diagnostics, so it redacts secret-related fields such as
`Authentication:PepperSecretName` and does not include raw API keys or key `Authentication:PepperSecretName` and does not include raw API keys or key
material. material.
The complete option reference, including defaults and validation rules, is in
[Gateway Configuration](./GatewayConfiguration.md).
## Galaxy Repository Metadata ## Galaxy Repository Metadata
Galaxy hierarchy and tag metadata can be discovered through SQL Server when Galaxy hierarchy and tag metadata can be discovered through SQL Server when
@@ -0,0 +1,177 @@
<#
.SYNOPSIS
Builds the Rust mxgw CLI and starts several long-running stream-events clients against a gateway.

.DESCRIPTION
Opens one gateway session per client, registers it, adds and advises the tags assigned to
that client, and then launches a background `mxgw stream-events` process whose output is
redirected to log files. The script monitors the stream processes until one exits or the
script is interrupted, then stops the processes and closes the sessions it opened.

.PARAMETER Endpoint
Gateway endpoint passed to the CLI as --endpoint (always combined with --plaintext).

.PARAMETER ApiKeyEnv
Name of the environment variable passed as --api-key-env when -ApiKey is blank.

.PARAMETER ApiKey
API key passed as --api-key. When blank, -ApiKeyEnv is used instead.

.PARAMETER ClientCount
Number of clients to start; tags are distributed across clients round-robin.

.PARAMETER MachineStart
First machine number used to build tag names of the form TestMachine_NNN.<Attribute>.

.PARAMETER MachineEnd
Last machine number (inclusive) used to build tag names.

.PARAMETER Attribute
Attribute name appended to each machine name to form the tag.

.PARAMETER MaxEvents
Value passed to stream-events as --max-events.

.PARAMETER StreamCallTimeoutSeconds
Value passed to stream-events as --call-timeout-seconds.

.PARAMETER LogDirectory
Directory that receives client-NN.stdout.log and client-NN.stderr.log files.
#>
[CmdletBinding()]
param(
[string]$Endpoint = "http://127.0.0.1:5001",
[string]$ApiKeyEnv = "MXGATEWAY_API_KEY",
[string]$ApiKey,
[int]$ClientCount = 5,
[int]$MachineStart = 1,
[int]$MachineEnd = 20,
[string]$Attribute = "TestChangingInt",
[int]$MaxEvents = [int]::MaxValue,
[int]$StreamCallTimeoutSeconds = 86400,
[string]$LogDirectory = (Join-Path (Get-Location) "artifacts\rust-testchangingint-subscribers")
)
# Fail fast: strict variable/property checks, and every error becomes terminating.
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"
# Validate numeric arguments before any session or process is created.
if ($ClientCount -le 0) {
throw "ClientCount must be greater than zero."
}
if ($MachineStart -lt 1 -or $MachineEnd -lt $MachineStart) {
throw "MachineStart must be at least 1 and MachineEnd must be greater than or equal to MachineStart."
}
if ($StreamCallTimeoutSeconds -le 0) {
throw "StreamCallTimeoutSeconds must be greater than zero."
}
# Paths are resolved relative to the parent of the directory containing this script.
$repoRoot = Split-Path -Parent $PSScriptRoot
$rustRoot = Join-Path $repoRoot "clients\rust"
$mxgwExe = Join-Path $rustRoot "target\debug\mxgw.exe"
# Track everything that is created so the cleanup in the final `finally` block can undo it.
$sessionIds = New-Object System.Collections.Generic.List[string]
$streamProcesses = New-Object System.Collections.Generic.List[System.Diagnostics.Process]
# Builds the connection arguments shared by every mxgw invocation:
# the endpoint, --plaintext, and either an explicit API key or the name of the
# environment variable that holds it (script parameters $Endpoint, $ApiKey, $ApiKeyEnv).
function Get-ConnectionArgs {
    # Do not assign to $args: it is a PowerShell automatic variable.
    $connectionArgs = @("--endpoint", $Endpoint, "--plaintext")
    if (-not [string]::IsNullOrWhiteSpace($ApiKey)) {
        $connectionArgs += @("--api-key", $ApiKey)
    } else {
        $connectionArgs += @("--api-key-env", $ApiKeyEnv)
    }

    return $connectionArgs
}
# Runs the mxgw CLI ($mxgwExe) with the given arguments and returns its stdout parsed as JSON.
# Throws when the CLI exits with a non-zero code or produces no stdout text.
function Invoke-MxgwJson {
    param(
        [Parameter(Mandatory = $true)]
        [string[]]$Arguments
    )

    # Never echo the raw API key in error messages: mask the value that follows --api-key.
    $displayArguments = New-Object System.Collections.Generic.List[string]
    $maskNext = $false
    foreach ($argument in $Arguments) {
        if ($maskNext) {
            $displayArguments.Add("***")
            $maskNext = $false
            continue
        }

        $displayArguments.Add($argument)
        $maskNext = ($argument -eq "--api-key")
    }
    $displayText = $displayArguments -join ' '

    # With the script-wide "Stop" preference, Windows PowerShell turns any redirected
    # stderr line from a native command into a terminating error. Relax the preference
    # (function-local assignment) only around the native call.
    $ErrorActionPreference = "Continue"
    $output = @(& $mxgwExe @Arguments 2>&1)
    $exitCode = $LASTEXITCODE
    $ErrorActionPreference = "Stop"

    # Redirected stderr arrives as ErrorRecord objects; keep it out of the JSON payload.
    $stderrLines = @($output |
        Where-Object { $_ -is [System.Management.Automation.ErrorRecord] } |
        ForEach-Object { $_.ToString() })
    $stdoutLines = @($output |
        Where-Object { $_ -isnot [System.Management.Automation.ErrorRecord] } |
        ForEach-Object { [string]$_ })

    if ($exitCode -ne 0) {
        $details = (@($stdoutLines) + @($stderrLines)) -join "`n"
        throw "mxgw failed with exit code $exitCode for arguments: $displayText`n$details"
    }

    $jsonText = @($stdoutLines | Where-Object { -not [string]::IsNullOrWhiteSpace($_) }) -join "`n"
    if ([string]::IsNullOrWhiteSpace($jsonText)) {
        throw "mxgw returned no JSON for arguments: $displayText"
    }

    return $jsonText | ConvertFrom-Json
}
# Best-effort close of one gateway session. A blank id is ignored, and any failure is
# reported as a warning so cleanup of the remaining sessions can continue.
function Close-SessionQuietly {
    param([string]$SessionId)

    if ([string]::IsNullOrWhiteSpace($SessionId)) {
        return
    }

    try {
        # Do not assign to $args: it is a PowerShell automatic variable.
        $closeArgs = @("close-session") + (Get-ConnectionArgs) + @("--session-id", $SessionId, "--json")
        [void](Invoke-MxgwJson -Arguments $closeArgs)
        Write-Host "Closed session $SessionId"
    } catch {
        Write-Warning "Failed to close session ${SessionId}: $($_.Exception.Message)"
    }
}
# Best-effort termination of one stream client process. Missing or already-exited
# processes are skipped; a failure to stop is downgraded to a warning.
function Stop-StreamProcessQuietly {
    param([System.Diagnostics.Process]$Process)

    # Nothing to stop when no process was supplied.
    if ($null -eq $Process) {
        return
    }

    # Nothing to stop when the process has already ended.
    if ($Process.HasExited) {
        return
    }

    try {
        Stop-Process -Id $Process.Id -Force -ErrorAction Stop
        Write-Host ("Stopped stream process {0}" -f $Process.Id)
    }
    catch {
        $reason = $_.Exception.Message
        Write-Warning ("Failed to stop stream process {0}: {1}" -f $Process.Id, $reason)
    }
}
# --- Main script body -------------------------------------------------------
New-Item -ItemType Directory -Force -Path $LogDirectory | Out-Null

Write-Host "Building Rust CLI..."
Push-Location $rustRoot
try {
    cargo build -p mxgw-cli
    # A failed build can leave a stale executable behind; do not continue with it.
    if ($LASTEXITCODE -ne 0) {
        throw "cargo build failed with exit code $LASTEXITCODE."
    }
} finally {
    Pop-Location
}

if (-not (Test-Path -LiteralPath $mxgwExe)) {
    throw "Rust CLI executable was not found at $mxgwExe"
}

# @(...) guarantees an array even when the loop yields zero or one item, so the
# .Count reads below are valid under Set-StrictMode -Version Latest.
$tags = @(for ($machine = $MachineStart; $machine -le $MachineEnd; $machine++) {
    "TestMachine_{0:D3}.{1}" -f $machine, $Attribute
})

try {
    for ($clientIndex = 0; $clientIndex -lt $ClientCount; $clientIndex++) {
        $clientNumber = $clientIndex + 1

        # Round-robin distribution: client i takes tags i, i + ClientCount, i + 2*ClientCount, ...
        $clientTags = @(for ($tagIndex = $clientIndex; $tagIndex -lt $tags.Count; $tagIndex += $ClientCount) {
            $tags[$tagIndex]
        })

        # More clients than tags: this client has nothing to subscribe to.
        if ($clientTags.Count -eq 0) {
            continue
        }

        $clientName = "mxgw-rust-changingint-$clientNumber"
        Write-Host "Opening session for client $clientNumber with $($clientTags.Count) tag(s)..."

        $openArgs = @("open-session") + (Get-ConnectionArgs) + @("--client-name", $clientName, "--json")
        $open = Invoke-MxgwJson -Arguments $openArgs
        $sessionId = [string]$open.sessionId
        # Record the session immediately so the finally block closes it even if a later step fails.
        $sessionIds.Add($sessionId)

        $registerArgs = @("register") + (Get-ConnectionArgs) + @("--session-id", $sessionId, "--client-name", $clientName, "--json")
        $register = Invoke-MxgwJson -Arguments $registerArgs
        $serverHandle = [int]$register.serverHandle

        foreach ($tag in $clientTags) {
            $addArgs = @("add-item") + (Get-ConnectionArgs) + @("--session-id", $sessionId, "--server-handle", $serverHandle, "--item", $tag, "--json")
            $add = Invoke-MxgwJson -Arguments $addArgs
            $itemHandle = [int]$add.itemHandle

            $adviseArgs = @("advise") + (Get-ConnectionArgs) + @("--session-id", $sessionId, "--server-handle", $serverHandle, "--item-handle", $itemHandle, "--json")
            [void](Invoke-MxgwJson -Arguments $adviseArgs)
            Write-Host "Client $clientNumber subscribed $tag itemHandle=$itemHandle"
        }

        $stdout = Join-Path $LogDirectory ("client-{0:D2}.stdout.log" -f $clientNumber)
        $stderr = Join-Path $LogDirectory ("client-{0:D2}.stderr.log" -f $clientNumber)
        $streamArgs = @("stream-events") + (Get-ConnectionArgs) + @(
            "--session-id", $sessionId,
            "--max-events", $MaxEvents.ToString(),
            "--call-timeout-seconds", $StreamCallTimeoutSeconds.ToString(),
            "--json")

        # The stream client runs as a separate background process with its output redirected to log files.
        $process = Start-Process -FilePath $mxgwExe -ArgumentList $streamArgs -WorkingDirectory $rustRoot -WindowStyle Hidden -RedirectStandardOutput $stdout -RedirectStandardError $stderr -PassThru
        $streamProcesses.Add($process)
        Write-Host "Client $clientNumber streaming session $sessionId in process $($process.Id); logs: $stdout"
    }

    Write-Host "Started $($streamProcesses.Count) Rust stream client(s). Press Ctrl+C to stop and close sessions."

    # Poll until interrupted; any stream process exiting on its own is treated as a failure.
    while ($true) {
        Start-Sleep -Seconds 5
        foreach ($process in @($streamProcesses)) {
            if ($process.HasExited) {
                throw "Stream process $($process.Id) exited with code $($process.ExitCode). Check $LogDirectory."
            }
        }
    }
} finally {
    # Always stop the stream clients first, then close the sessions they were reading from.
    Write-Host "Stopping Rust stream clients and closing gateway sessions..."
    foreach ($process in @($streamProcesses)) {
        Stop-StreamProcessQuietly -Process $process
    }

    foreach ($sessionId in @($sessionIds)) {
        Close-SessionQuietly -SessionId $sessionId
    }
}
@@ -125,6 +125,16 @@ public sealed class GatewayOptionsValidator : IValidateOptions<GatewayOptions>
"MxGateway:Sessions:DefaultCommandTimeoutSeconds must be greater than zero.", "MxGateway:Sessions:DefaultCommandTimeoutSeconds must be greater than zero.",
failures); failures);
AddIfNotPositive(options.MaxSessions, "MxGateway:Sessions:MaxSessions must be greater than zero.", failures); AddIfNotPositive(options.MaxSessions, "MxGateway:Sessions:MaxSessions must be greater than zero.", failures);
AddIfNotPositive(
options.MaxPendingCommandsPerSession,
"MxGateway:Sessions:MaxPendingCommandsPerSession must be greater than zero.",
failures);
if (options.AllowMultipleEventSubscribers)
{
failures.Add(
"MxGateway:Sessions:AllowMultipleEventSubscribers is not supported until event fan-out is implemented.");
}
} }
private static void ValidateEvents(EventOptions options, List<string> failures) private static void ValidateEvents(EventOptions options, List<string> failures)
@@ -6,5 +6,7 @@ public sealed class SessionOptions
public int MaxSessions { get; init; } = 64; public int MaxSessions { get; init; } = 64;
public int MaxPendingCommandsPerSession { get; init; } = 128;
public bool AllowMultipleEventSubscribers { get; init; } public bool AllowMultipleEventSubscribers { get; init; }
} }
@@ -21,6 +21,11 @@ public static class DashboardDisplay
return string.IsNullOrWhiteSpace(value) ? "-" : value; return string.IsNullOrWhiteSpace(value) ? "-" : value;
} }
/// <summary>
/// Formats a count as a whole number with invariant-culture group separators.
/// </summary>
/// <param name="value">The count to format.</param>
/// <returns>The value rendered with the "N0" numeric format.</returns>
public static string Count(long value) =>
    value.ToString("N0", System.Globalization.CultureInfo.InvariantCulture);
public static long MetricValue(DashboardSnapshot snapshot, string name, string? dimension = null) public static long MetricValue(DashboardSnapshot snapshot, string name, string? dimension = null)
{ {
return snapshot.Metrics.FirstOrDefault(metric => return snapshot.Metrics.FirstOrDefault(metric =>
@@ -20,13 +20,13 @@ else
<section class="metric-grid"> <section class="metric-grid">
<MetricCard Label="Uptime" Value="@DashboardDisplay.Duration(Snapshot.GatewayUptime)" Detail="@Snapshot.GatewayVersion" /> <MetricCard Label="Uptime" Value="@DashboardDisplay.Duration(Snapshot.GatewayUptime)" Detail="@Snapshot.GatewayVersion" />
<MetricCard Label="Open Sessions" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Open Sessions" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open"))" />
<MetricCard Label="Workers Running" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Workers Running" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running"))" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
<MetricCard Label="Commands Failed" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Commands Failed" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed"))" />
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
<MetricCard Label="Faults" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Faults" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults"))" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
</section> </section>
<section class="dashboard-section"> <section class="dashboard-section">
@@ -18,10 +18,11 @@ else
</div> </div>
<section class="metric-grid compact"> <section class="metric-grid compact">
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Worker Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Stream Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.grpc_stream_queue.depth"))" />
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected").ToString(System.Globalization.CultureInfo.InvariantCulture)" /> <MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected"))" />
</section> </section>
<section class="dashboard-section"> <section class="dashboard-section">
@@ -47,7 +48,7 @@ else
{ {
<tr> <tr>
<td>@metric.Dimension</td> <td>@metric.Dimension</td>
<td>@metric.Value</td> <td>@DashboardDisplay.Count(metric.Value)</td>
</tr> </tr>
} }
</tbody> </tbody>
@@ -39,6 +39,7 @@ else
<tr><th scope="row">Opened</th><td>@DashboardDisplay.DateTime(CurrentSession.OpenedAt)</td></tr> <tr><th scope="row">Opened</th><td>@DashboardDisplay.DateTime(CurrentSession.OpenedAt)</td></tr>
<tr><th scope="row">Last activity</th><td>@DashboardDisplay.DateTime(CurrentSession.LastClientActivityAt)</td></tr> <tr><th scope="row">Last activity</th><td>@DashboardDisplay.DateTime(CurrentSession.LastClientActivityAt)</td></tr>
<tr><th scope="row">Lease expires</th><td>@DashboardDisplay.DateTime(CurrentSession.LeaseExpiresAt)</td></tr> <tr><th scope="row">Lease expires</th><td>@DashboardDisplay.DateTime(CurrentSession.LeaseExpiresAt)</td></tr>
<tr><th scope="row">Events received</th><td>@DashboardDisplay.Count(CurrentSession.EventsReceived)</td></tr>
<tr><th scope="row">Last fault</th><td>@DashboardDisplay.Text(CurrentSession.LastFault)</td></tr> <tr><th scope="row">Last fault</th><td>@DashboardDisplay.Text(CurrentSession.LastFault)</td></tr>
</tbody> </tbody>
</table> </table>
@@ -33,6 +33,7 @@ else
<th scope="col">Client</th> <th scope="col">Client</th>
<th scope="col">Backend</th> <th scope="col">Backend</th>
<th scope="col">Worker</th> <th scope="col">Worker</th>
<th scope="col">Events</th>
<th scope="col">Opened</th> <th scope="col">Opened</th>
<th scope="col">Activity</th> <th scope="col">Activity</th>
<th scope="col">Heartbeat</th> <th scope="col">Heartbeat</th>
@@ -54,6 +55,7 @@ else
<span class="ms-1"><StatusBadge Text="@session.WorkerState.ToString()" /></span> <span class="ms-1"><StatusBadge Text="@session.WorkerState.ToString()" /></span>
} }
</td> </td>
<td>@DashboardDisplay.Count(session.EventsReceived)</td>
<td>@DashboardDisplay.DateTime(session.OpenedAt)</td> <td>@DashboardDisplay.DateTime(session.OpenedAt)</td>
<td>@DashboardDisplay.DateTime(session.LastClientActivityAt)</td> <td>@DashboardDisplay.DateTime(session.LastClientActivityAt)</td>
<td>@DashboardDisplay.DateTime(session.LastWorkerHeartbeatAt)</td> <td>@DashboardDisplay.DateTime(session.LastWorkerHeartbeatAt)</td>
@@ -16,4 +16,5 @@ public sealed record DashboardSessionSummary(
int? WorkerProcessId, int? WorkerProcessId,
WorkerClientState? WorkerState, WorkerClientState? WorkerState,
DateTimeOffset? LastWorkerHeartbeatAt, DateTimeOffset? LastWorkerHeartbeatAt,
long EventsReceived,
string? LastFault); string? LastFault);
@@ -45,15 +45,15 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
IReadOnlyList<GatewaySession> sessions = _sessionRegistry.Snapshot() IReadOnlyList<GatewaySession> sessions = _sessionRegistry.Snapshot()
.OrderByDescending(session => session.OpenedAt) .OrderByDescending(session => session.OpenedAt)
.ToArray(); .ToArray();
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
IReadOnlyList<DashboardSessionSummary> sessionSummaries = sessions IReadOnlyList<DashboardSessionSummary> sessionSummaries = sessions
.Take(ResolveLimit(_recentSessionLimit)) .Take(ResolveLimit(_recentSessionLimit))
.Select(CreateSessionSummary) .Select(session => CreateSessionSummary(session, metricsSnapshot))
.ToArray(); .ToArray();
IReadOnlyList<DashboardWorkerSummary> workerSummaries = sessions IReadOnlyList<DashboardWorkerSummary> workerSummaries = sessions
.Where(session => session.WorkerClient is not null) .Where(session => session.WorkerClient is { State: not WorkerClientState.Closed })
.Select(CreateWorkerSummary) .Select(CreateWorkerSummary)
.ToArray(); .ToArray();
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
return new DashboardSnapshot( return new DashboardSnapshot(
GeneratedAt: generatedAt, GeneratedAt: generatedAt,
@@ -100,9 +100,12 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
} }
} }
private static DashboardSessionSummary CreateSessionSummary(GatewaySession session) private static DashboardSessionSummary CreateSessionSummary(
GatewaySession session,
GatewayMetricsSnapshot metricsSnapshot)
{ {
IWorkerClient? workerClient = session.WorkerClient; IWorkerClient? workerClient = session.WorkerClient;
metricsSnapshot.EventsBySession.TryGetValue(session.SessionId, out long eventsReceived);
return new DashboardSessionSummary( return new DashboardSessionSummary(
SessionId: session.SessionId, SessionId: session.SessionId,
@@ -117,6 +120,7 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
WorkerProcessId: workerClient?.ProcessId, WorkerProcessId: workerClient?.ProcessId,
WorkerState: workerClient?.State, WorkerState: workerClient?.State,
LastWorkerHeartbeatAt: workerClient?.LastHeartbeatAt, LastWorkerHeartbeatAt: workerClient?.LastHeartbeatAt,
EventsReceived: eventsReceived,
LastFault: DashboardRedactor.Redact(session.FinalFault)); LastFault: DashboardRedactor.Redact(session.FinalFault));
} }
@@ -138,7 +142,8 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
[ [
new("mxgateway.sessions.open", snapshot.OpenSessions), new("mxgateway.sessions.open", snapshot.OpenSessions),
new("mxgateway.workers.running", snapshot.WorkersRunning), new("mxgateway.workers.running", snapshot.WorkersRunning),
new("mxgateway.events.queue.depth", snapshot.EventQueueDepth), new("mxgateway.events.worker_queue.depth", snapshot.WorkerEventQueueDepth),
new("mxgateway.events.grpc_stream_queue.depth", snapshot.GrpcEventStreamQueueDepth),
new("mxgateway.sessions.opened", snapshot.SessionsOpened), new("mxgateway.sessions.opened", snapshot.SessionsOpened),
new("mxgateway.sessions.closed", snapshot.SessionsClosed), new("mxgateway.sessions.closed", snapshot.SessionsClosed),
new("mxgateway.commands.started", snapshot.CommandsStarted), new("mxgateway.commands.started", snapshot.CommandsStarted),
@@ -47,7 +47,7 @@ public sealed class EventStreamService(
() => () =>
{ {
int depth = Interlocked.Increment(ref streamQueueDepth); int depth = Interlocked.Increment(ref streamQueueDepth);
metrics.SetEventQueueDepth(depth); metrics.SetGrpcEventStreamQueueDepth(depth);
}, },
streamCts.Token); streamCts.Token);
@@ -56,7 +56,7 @@ public sealed class EventStreamService(
await foreach (MxEvent mxEvent in eventQueue.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false)) await foreach (MxEvent mxEvent in eventQueue.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
{ {
int depth = Math.Max(0, Interlocked.Decrement(ref streamQueueDepth)); int depth = Math.Max(0, Interlocked.Decrement(ref streamQueueDepth));
metrics.SetEventQueueDepth(depth); metrics.SetGrpcEventStreamQueueDepth(depth);
yield return mxEvent; yield return mxEvent;
} }
+46 -7
View File
@@ -26,11 +26,13 @@ public sealed class GatewayMetrics : IDisposable
private readonly Histogram<double> _eventStreamSendLatencyHistogram; private readonly Histogram<double> _eventStreamSendLatencyHistogram;
private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase); private readonly Dictionary<string, long> _commandFailuresByMethod = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase); private readonly Dictionary<string, long> _eventsByFamily = new(StringComparer.OrdinalIgnoreCase);
private readonly Dictionary<string, long> _eventsBySession = new(StringComparer.Ordinal);
private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase); private readonly Dictionary<string, long> _retryAttemptsByArea = new(StringComparer.OrdinalIgnoreCase);
private int _openSessions; private int _openSessions;
private int _workersRunning; private int _workersRunning;
private int _eventQueueDepth; private int _workerEventQueueDepth;
private int _grpcEventStreamQueueDepth;
private long _sessionsOpened; private long _sessionsOpened;
private long _sessionsClosed; private long _sessionsClosed;
private long _commandsStarted; private long _commandsStarted;
@@ -68,7 +70,8 @@ public sealed class GatewayMetrics : IDisposable
_meter.CreateObservableGauge("mxgateway.sessions.open", GetOpenSessions); _meter.CreateObservableGauge("mxgateway.sessions.open", GetOpenSessions);
_meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning); _meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning);
_meter.CreateObservableGauge("mxgateway.events.queue.depth", GetEventQueueDepth); _meter.CreateObservableGauge("mxgateway.events.worker_queue.depth", GetWorkerEventQueueDepth);
_meter.CreateObservableGauge("mxgateway.events.grpc_stream_queue.depth", GetGrpcEventStreamQueueDepth);
} }
public void SessionOpened() public void SessionOpened()
@@ -174,11 +177,11 @@ public sealed class GatewayMetrics : IDisposable
{ {
_eventsReceived++; _eventsReceived++;
Increment(_eventsByFamily, family); Increment(_eventsByFamily, family);
Increment(_eventsBySession, sessionId);
} }
_eventsReceivedCounter.Add( _eventsReceivedCounter.Add(
1, 1,
new KeyValuePair<string, object?>("session_id", sessionId),
new KeyValuePair<string, object?>("family", family)); new KeyValuePair<string, object?>("family", family));
} }
@@ -190,6 +193,11 @@ public sealed class GatewayMetrics : IDisposable
} }
public void SetEventQueueDepth(int depth) public void SetEventQueueDepth(int depth)
{
SetWorkerEventQueueDepth(depth);
}
public void SetWorkerEventQueueDepth(int depth)
{ {
if (depth < 0) if (depth < 0)
{ {
@@ -198,7 +206,28 @@ public sealed class GatewayMetrics : IDisposable
lock (_syncRoot) lock (_syncRoot)
{ {
_eventQueueDepth = depth; _workerEventQueueDepth = depth;
}
}
/// <summary>
/// Stores the most recently reported gRPC event stream queue depth.
/// </summary>
/// <param name="depth">The reported queue depth; must not be negative.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="depth"/> is negative.
/// </exception>
public void SetGrpcEventStreamQueueDepth(int depth)
{
    if (depth is < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(depth), depth, "Queue depth cannot be negative.");
    }

    // The field is written under the same lock used when it is read.
    lock (_syncRoot)
    {
        _grpcEventStreamQueueDepth = depth;
    }
}
public void RemoveSessionEvents(string sessionId)
{
lock (_syncRoot)
{
_eventsBySession.Remove(sessionId);
} }
} }
@@ -260,7 +289,8 @@ public sealed class GatewayMetrics : IDisposable
return new GatewayMetricsSnapshot( return new GatewayMetricsSnapshot(
OpenSessions: _openSessions, OpenSessions: _openSessions,
WorkersRunning: _workersRunning, WorkersRunning: _workersRunning,
EventQueueDepth: _eventQueueDepth, WorkerEventQueueDepth: _workerEventQueueDepth,
GrpcEventStreamQueueDepth: _grpcEventStreamQueueDepth,
SessionsOpened: _sessionsOpened, SessionsOpened: _sessionsOpened,
SessionsClosed: _sessionsClosed, SessionsClosed: _sessionsClosed,
CommandsStarted: _commandsStarted, CommandsStarted: _commandsStarted,
@@ -276,6 +306,7 @@ public sealed class GatewayMetrics : IDisposable
RetryAttempts: _retryAttempts, RetryAttempts: _retryAttempts,
CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase), CommandFailuresByMethod: new Dictionary<string, long>(_commandFailuresByMethod, StringComparer.OrdinalIgnoreCase),
EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase), EventsByFamily: new Dictionary<string, long>(_eventsByFamily, StringComparer.OrdinalIgnoreCase),
EventsBySession: new Dictionary<string, long>(_eventsBySession, StringComparer.Ordinal),
RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase)); RetryAttemptsByArea: new Dictionary<string, long>(_retryAttemptsByArea, StringComparer.OrdinalIgnoreCase));
} }
} }
@@ -307,11 +338,19 @@ public sealed class GatewayMetrics : IDisposable
} }
} }
private int GetEventQueueDepth() private int GetWorkerEventQueueDepth()
{ {
lock (_syncRoot) lock (_syncRoot)
{ {
return _eventQueueDepth; return _workerEventQueueDepth;
}
}
private int GetGrpcEventStreamQueueDepth()
{
lock (_syncRoot)
{
return _grpcEventStreamQueueDepth;
} }
} }
@@ -3,7 +3,8 @@ namespace MxGateway.Server.Metrics;
public sealed record GatewayMetricsSnapshot( public sealed record GatewayMetricsSnapshot(
int OpenSessions, int OpenSessions,
int WorkersRunning, int WorkersRunning,
int EventQueueDepth, int WorkerEventQueueDepth,
int GrpcEventStreamQueueDepth,
long SessionsOpened, long SessionsOpened,
long SessionsClosed, long SessionsClosed,
long CommandsStarted, long CommandsStarted,
@@ -19,4 +20,5 @@ public sealed record GatewayMetricsSnapshot(
long RetryAttempts, long RetryAttempts,
IReadOnlyDictionary<string, long> CommandFailuresByMethod, IReadOnlyDictionary<string, long> CommandFailuresByMethod,
IReadOnlyDictionary<string, long> EventsByFamily, IReadOnlyDictionary<string, long> EventsByFamily,
IReadOnlyDictionary<string, long> EventsBySession,
IReadOnlyDictionary<string, long> RetryAttemptsByArea); IReadOnlyDictionary<string, long> RetryAttemptsByArea);
+60 -15
View File
@@ -23,6 +23,7 @@ public sealed class SessionManager : ISessionManager
private readonly TimeProvider _timeProvider; private readonly TimeProvider _timeProvider;
private readonly ILogger<SessionManager> _logger; private readonly ILogger<SessionManager> _logger;
private readonly GatewayOptions _options; private readonly GatewayOptions _options;
private readonly SemaphoreSlim _sessionSlots;
public SessionManager( public SessionManager(
ISessionRegistry registry, ISessionRegistry registry,
@@ -39,6 +40,7 @@ public sealed class SessionManager : ISessionManager
_timeProvider = timeProvider ?? TimeProvider.System; _timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? NullLogger<SessionManager>.Instance; _logger = logger ?? NullLogger<SessionManager>.Instance;
_options = options.Value; _options = options.Value;
_sessionSlots = new SemaphoreSlim(_options.Sessions.MaxSessions, _options.Sessions.MaxSessions);
} }
public async Task<GatewaySession> OpenSessionAsync( public async Task<GatewaySession> OpenSessionAsync(
@@ -49,16 +51,17 @@ public sealed class SessionManager : ISessionManager
ArgumentNullException.ThrowIfNull(request); ArgumentNullException.ThrowIfNull(request);
EnsureSessionCapacity(); EnsureSessionCapacity();
GatewaySession session = CreateSession(request, clientIdentity); GatewaySession? session = null;
if (!_registry.TryAdd(session))
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Session id collision while opening session {session.SessionId}.");
}
try try
{ {
session = CreateSession(request, clientIdentity);
if (!_registry.TryAdd(session))
{
throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed,
$"Session id collision while opening session {session.SessionId}.");
}
session.TransitionTo(SessionState.StartingWorker); session.TransitionTo(SessionState.StartingWorker);
IWorkerClient workerClient = await _workerClientFactory IWorkerClient workerClient = await _workerClientFactory
.CreateAsync(session, cancellationToken) .CreateAsync(session, cancellationToken)
@@ -72,18 +75,23 @@ public sealed class SessionManager : ISessionManager
} }
catch (Exception exception) catch (Exception exception)
{ {
session.MarkFaulted(exception.Message); session?.MarkFaulted(exception.Message);
_registry.TryRemove(session.SessionId, out _); if (session is not null)
await session.DisposeAsync().ConfigureAwait(false); {
_registry.TryRemove(session.SessionId, out _);
await session.DisposeAsync().ConfigureAwait(false);
}
ReleaseSessionSlot();
_metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString()); _metrics.Fault(SessionManagerErrorCode.OpenFailed.ToString());
_logger.LogWarning( _logger.LogWarning(
exception, exception,
"Failed to open gateway session {SessionId}.", "Failed to open gateway session {SessionId}.",
session.SessionId); session?.SessionId ?? "<not-created>");
throw new SessionManagerException( throw new SessionManagerException(
SessionManagerErrorCode.OpenFailed, SessionManagerErrorCode.OpenFailed,
$"Failed to open session {session.SessionId}.", session is null ? "Failed to create session." : $"Failed to open session {session.SessionId}.",
exception); exception);
} }
} }
@@ -177,6 +185,7 @@ public sealed class SessionManager : ISessionManager
"Graceful shutdown failed for session {SessionId}; killing worker.", "Graceful shutdown failed for session {SessionId}; killing worker.",
session.SessionId); session.SessionId);
session.KillWorker(GatewayShutdownReason); session.KillWorker(GatewayShutdownReason);
await RemoveSessionAsync(session).ConfigureAwait(false);
} }
} }
} }
@@ -195,6 +204,7 @@ public sealed class SessionManager : ISessionManager
_metrics.SessionClosed(); _metrics.SessionClosed();
} }
await RemoveSessionAsync(session).ConfigureAwait(false);
return result; return result;
} }
catch (Exception exception) catch (Exception exception)
@@ -222,7 +232,7 @@ public sealed class SessionManager : ISessionManager
private void EnsureSessionCapacity() private void EnsureSessionCapacity()
{ {
if (_registry.ActiveCount >= _options.Sessions.MaxSessions) if (!_sessionSlots.Wait(0))
{ {
throw new SessionManagerException( throw new SessionManagerException(
SessionManagerErrorCode.SessionLimitExceeded, SessionManagerErrorCode.SessionLimitExceeded,
@@ -230,6 +240,29 @@ public sealed class SessionManager : ISessionManager
} }
} }
/// <summary>
/// Removes <paramref name="session"/> from the registry and, only when this call
/// performed the removal, drops its per-session event counter, releases its
/// session slot and disposes the removed instance.
/// </summary>
/// <param name="session">Session whose id is looked up in the registry.</param>
private async Task RemoveSessionAsync(GatewaySession session)
{
    // Cleanup runs only for the caller that actually took the entry out of the
    // registry; a failed TryRemove skips the slot release entirely.
    if (_registry.TryRemove(session.SessionId, out GatewaySession? registered))
    {
        _metrics.RemoveSessionEvents(session.SessionId);
        ReleaseSessionSlot();
        await registered.DisposeAsync().ConfigureAwait(false);
    }
}
/// <summary>
/// Returns one slot to <c>_sessionSlots</c>.
/// </summary>
/// <remarks>
/// This is best effort and never throws <see cref="SemaphoreFullException"/>:
/// that exception means the semaphore is already at its maximum count, i.e. a
/// slot was released without a matching acquire. The condition is logged so
/// slot-accounting mistakes are visible instead of being silently discarded.
/// </remarks>
private void ReleaseSessionSlot()
{
    try
    {
        _sessionSlots.Release();
    }
    catch (SemaphoreFullException exception)
    {
        // Cleanup paths must not fail because of an extra release, so the
        // exception is recorded and swallowed rather than rethrown.
        _logger.LogWarning(
            exception,
            "Ignored a session slot release because all session slots are already free.");
    }
}
private GatewaySession CreateSession( private GatewaySession CreateSession(
SessionOpenRequest request, SessionOpenRequest request,
string? clientIdentity) string? clientIdentity)
@@ -244,6 +277,7 @@ public sealed class SessionManager : ISessionManager
string pipeName = $"mxaccess-gateway-{Environment.ProcessId}-{sessionId}"; string pipeName = $"mxaccess-gateway-{Environment.ProcessId}-{sessionId}";
string nonce = CreateNonce(); string nonce = CreateNonce();
DateTimeOffset openedAt = _timeProvider.GetUtcNow(); DateTimeOffset openedAt = _timeProvider.GetUtcNow();
string clientCorrelationId = CreateClientCorrelationId(request.ClientSessionName, sessionId);
return new GatewaySession( return new GatewaySession(
sessionId, sessionId,
@@ -252,13 +286,24 @@ public sealed class SessionManager : ISessionManager
nonce, nonce,
clientIdentity, clientIdentity,
request.ClientSessionName, request.ClientSessionName,
request.ClientCorrelationId, clientCorrelationId,
commandTimeout, commandTimeout,
startupTimeout, startupTimeout,
shutdownTimeout, shutdownTimeout,
openedAt); openedAt);
} }
/// <summary>
/// Builds the correlation id assigned to a session as <c>{name}-{sessionId}</c>.
/// </summary>
/// <param name="clientSessionName">
/// Client-supplied session name; when null, empty or whitespace the literal
/// <c>client</c> is used as the prefix instead.
/// </param>
/// <param name="sessionId">Gateway session id appended after the dash.</param>
/// <returns>The prefix and session id joined by a single dash.</returns>
private static string CreateClientCorrelationId(
    string? clientSessionName,
    string sessionId)
{
    string prefix = "client";
    if (!string.IsNullOrWhiteSpace(clientSessionName))
    {
        prefix = clientSessionName;
    }

    return $"{prefix}-{sessionId}";
}
private TimeSpan ResolveCommandTimeout(Duration? requestedTimeout) private TimeSpan ResolveCommandTimeout(Duration? requestedTimeout)
{ {
if (requestedTimeout is null) if (requestedTimeout is null)
@@ -7,6 +7,7 @@ public static class SessionServiceCollectionExtensions
services.AddSingleton<ISessionRegistry, SessionRegistry>(); services.AddSingleton<ISessionRegistry, SessionRegistry>();
services.AddSingleton<ISessionWorkerClientFactory, SessionWorkerClientFactory>(); services.AddSingleton<ISessionWorkerClientFactory, SessionWorkerClientFactory>();
services.AddSingleton<ISessionManager, SessionManager>(); services.AddSingleton<ISessionManager, SessionManager>();
services.AddHostedService<SessionShutdownHostedService>();
return services; return services;
} }
@@ -0,0 +1,26 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
namespace MxGateway.Server.Sessions;
/// <summary>
/// Hosted service that does nothing on start and, on stop, asks the
/// <see cref="ISessionManager"/> to shut down its sessions.
/// </summary>
public sealed class SessionShutdownHostedService(
    ISessionManager sessionManager,
    ILogger<SessionShutdownHostedService> logger) : IHostedService
{
    /// <summary>No start-up work is required; completes immediately.</summary>
    public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask;

    /// <summary>
    /// Awaits <see cref="ISessionManager.ShutdownAsync"/> with the host's stop token.
    /// </summary>
    /// <param name="cancellationToken">Token supplied by the host for the stop phase.</param>
    public async Task StopAsync(CancellationToken cancellationToken)
    {
        try
        {
            await sessionManager.ShutdownAsync(cancellationToken).ConfigureAwait(false);
        }
        // Only a cancellation that coincides with the host's own token being
        // signalled is downgraded to a warning; any other failure propagates.
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            logger.LogWarning("Gateway session shutdown was canceled by host shutdown timeout.");
        }
    }
}
@@ -74,6 +74,7 @@ public sealed class SessionWorkerClientFactory : ISessionWorkerClientFactory
HeartbeatGrace = TimeSpan.FromSeconds(_options.Worker.HeartbeatGraceSeconds), HeartbeatGrace = TimeSpan.FromSeconds(_options.Worker.HeartbeatGraceSeconds),
HeartbeatCheckInterval = TimeSpan.FromSeconds(_options.Worker.HeartbeatIntervalSeconds), HeartbeatCheckInterval = TimeSpan.FromSeconds(_options.Worker.HeartbeatIntervalSeconds),
EventChannelCapacity = _options.Events.QueueCapacity, EventChannelCapacity = _options.Events.QueueCapacity,
MaxPendingCommands = _options.Sessions.MaxPendingCommandsPerSession,
}; };
workerClient = new WorkerClient( workerClient = new WorkerClient(
+55 -6
View File
@@ -24,6 +24,7 @@ public sealed class WorkerClient : IWorkerClient
private readonly Channel<WorkerEnvelope> _outboundEnvelopes; private readonly Channel<WorkerEnvelope> _outboundEnvelopes;
private readonly Channel<WorkerEvent> _events; private readonly Channel<WorkerEvent> _events;
private readonly ConcurrentDictionary<string, PendingCommand> _pendingCommands = new(StringComparer.Ordinal); private readonly ConcurrentDictionary<string, PendingCommand> _pendingCommands = new(StringComparer.Ordinal);
private readonly SemaphoreSlim _pendingCommandSlots;
private readonly CancellationTokenSource _stopCts = new(); private readonly CancellationTokenSource _stopCts = new();
private long _nextSequence; private long _nextSequence;
private WorkerClientState _state; private WorkerClientState _state;
@@ -33,6 +34,7 @@ public sealed class WorkerClient : IWorkerClient
private Task? _readLoopTask; private Task? _readLoopTask;
private Task? _writeLoopTask; private Task? _writeLoopTask;
private Task? _heartbeatLoopTask; private Task? _heartbeatLoopTask;
private bool _workerStartRecorded;
private bool _disposed; private bool _disposed;
public WorkerClient( public WorkerClient(
@@ -49,11 +51,13 @@ public sealed class WorkerClient : IWorkerClient
_logger = logger ?? NullLogger<WorkerClient>.Instance; _logger = logger ?? NullLogger<WorkerClient>.Instance;
_reader = new WorkerFrameReader(connection.Stream, connection.FrameOptions); _reader = new WorkerFrameReader(connection.Stream, connection.FrameOptions);
_writer = new WorkerFrameWriter(connection.Stream, connection.FrameOptions); _writer = new WorkerFrameWriter(connection.Stream, connection.FrameOptions);
_outboundEnvelopes = Channel.CreateUnbounded<WorkerEnvelope>( _pendingCommandSlots = new SemaphoreSlim(_options.MaxPendingCommands, _options.MaxPendingCommands);
new UnboundedChannelOptions _outboundEnvelopes = Channel.CreateBounded<WorkerEnvelope>(
new BoundedChannelOptions(_options.MaxPendingCommands + 4)
{ {
SingleReader = true, SingleReader = true,
SingleWriter = false, SingleWriter = false,
FullMode = BoundedChannelFullMode.Wait,
AllowSynchronousContinuations = false, AllowSynchronousContinuations = false,
}); });
_events = Channel.CreateBounded<WorkerEvent>( _events = Channel.CreateBounded<WorkerEvent>(
@@ -140,6 +144,14 @@ public sealed class WorkerClient : IWorkerClient
string correlationId = Guid.NewGuid().ToString("N"); string correlationId = Guid.NewGuid().ToString("N");
string method = GetCommandMethod(command); string method = GetCommandMethod(command);
if (!_pendingCommandSlots.Wait(0))
{
_metrics?.QueueOverflow("worker-pending-commands");
throw new WorkerClientException(
WorkerClientErrorCode.PendingCommandLimitExceeded,
$"Worker session {SessionId} already has {_options.MaxPendingCommands} pending command(s).");
}
PendingCommand pendingCommand = new( PendingCommand pendingCommand = new(
correlationId, correlationId,
method, method,
@@ -147,6 +159,7 @@ public sealed class WorkerClient : IWorkerClient
if (!_pendingCommands.TryAdd(correlationId, pendingCommand)) if (!_pendingCommands.TryAdd(correlationId, pendingCommand))
{ {
ReleasePendingCommandSlot();
throw new InvalidOperationException("Generated a duplicate command correlation id."); throw new InvalidOperationException("Generated a duplicate command correlation id.");
} }
@@ -188,7 +201,11 @@ public sealed class WorkerClient : IWorkerClient
} }
catch catch
{ {
_pendingCommands.TryRemove(correlationId, out _); if (_pendingCommands.TryRemove(correlationId, out _))
{
ReleasePendingCommandSlot();
}
throw; throw;
} }
} }
@@ -199,7 +216,7 @@ public sealed class WorkerClient : IWorkerClient
await foreach (WorkerEvent workerEvent in _events.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false)) await foreach (WorkerEvent workerEvent in _events.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
{ {
int queueDepth = Math.Max(0, Interlocked.Decrement(ref _eventQueueDepth)); int queueDepth = Math.Max(0, Interlocked.Decrement(ref _eventQueueDepth));
_metrics?.SetEventQueueDepth(queueDepth); _metrics?.SetWorkerEventQueueDepth(queueDepth);
yield return workerEvent; yield return workerEvent;
} }
} }
@@ -272,6 +289,7 @@ public sealed class WorkerClient : IWorkerClient
await WaitForBackgroundTasksAsync(CancellationToken.None).ConfigureAwait(false); await WaitForBackgroundTasksAsync(CancellationToken.None).ConfigureAwait(false);
await _connection.Stream.DisposeAsync().ConfigureAwait(false); await _connection.Stream.DisposeAsync().ConfigureAwait(false);
_connection.ProcessHandle?.Dispose(); _connection.ProcessHandle?.Dispose();
_pendingCommandSlots.Dispose();
_stopCts.Dispose(); _stopCts.Dispose();
} }
@@ -409,7 +427,7 @@ public sealed class WorkerClient : IWorkerClient
} }
int queueDepth = Interlocked.Increment(ref _eventQueueDepth); int queueDepth = Interlocked.Increment(ref _eventQueueDepth);
_metrics?.SetEventQueueDepth(queueDepth); _metrics?.SetWorkerEventQueueDepth(queueDepth);
} }
private void CompleteCommand(WorkerEnvelope envelope) private void CompleteCommand(WorkerEnvelope envelope)
@@ -429,6 +447,7 @@ public sealed class WorkerClient : IWorkerClient
return; return;
} }
ReleasePendingCommandSlot();
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp); TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
_metrics?.CommandSucceeded(pendingCommand.Method, duration); _metrics?.CommandSucceeded(pendingCommand.Method, duration);
pendingCommand.SetResult(envelope.WorkerCommandReply); pendingCommand.SetResult(envelope.WorkerCommandReply);
@@ -445,6 +464,7 @@ public sealed class WorkerClient : IWorkerClient
return; return;
} }
ReleasePendingCommandSlot();
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp); TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
_metrics?.CommandFailed(pendingCommand.Method, errorCode.ToString(), duration); _metrics?.CommandFailed(pendingCommand.Method, errorCode.ToString(), duration);
pendingCommand.SetException(new WorkerClientException(errorCode, message)); pendingCommand.SetException(new WorkerClientException(errorCode, message));
@@ -498,6 +518,7 @@ public sealed class WorkerClient : IWorkerClient
: ready.WorkerProcessId; : ready.WorkerProcessId;
_lastHeartbeatAt = _timeProvider.GetUtcNow(); _lastHeartbeatAt = _timeProvider.GetUtcNow();
_state = WorkerClientState.Ready; _state = WorkerClientState.Ready;
_workerStartRecorded = true;
} }
DateTimeOffset readyAt = _timeProvider.GetUtcNow(); DateTimeOffset readyAt = _timeProvider.GetUtcNow();
@@ -549,7 +570,7 @@ public sealed class WorkerClient : IWorkerClient
new WorkerClientException( new WorkerClientException(
WorkerClientErrorCode.GatewayShutdown, WorkerClientErrorCode.GatewayShutdown,
$"Worker client closed because {reason}.")); $"Worker client closed because {reason}."));
_metrics?.WorkerStopped(reason); RecordWorkerStoppedOnce(reason);
} }
private void SetFaulted( private void SetFaulted(
@@ -575,16 +596,33 @@ public sealed class WorkerClient : IWorkerClient
_outboundEnvelopes.Writer.TryComplete(fault); _outboundEnvelopes.Writer.TryComplete(fault);
_events.Writer.TryComplete(fault); _events.Writer.TryComplete(fault);
CompletePendingCommands(fault); CompletePendingCommands(fault);
RecordWorkerStoppedOnce(errorCode.ToString());
_metrics?.Fault(errorCode.ToString()); _metrics?.Fault(errorCode.ToString());
_logger.LogWarning(exception, "Worker client faulted for session {SessionId}: {Message}", SessionId, message); _logger.LogWarning(exception, "Worker client faulted for session {SessionId}: {Message}", SessionId, message);
} }
/// <summary>
/// Reports a worker stop to the metrics sink at most once per recorded start.
/// </summary>
/// <param name="reason">Reason text forwarded to <c>WorkerStopped</c>.</param>
private void RecordWorkerStoppedOnce(string reason)
{
    lock (_syncRoot)
    {
        // Nothing to report unless a start was recorded and not yet consumed.
        if (!_workerStartRecorded)
        {
            return;
        }

        // Consume the flag so a later close/fault path does not report again.
        _workerStartRecorded = false;
    }

    // The metrics call is made after the lock has been released.
    _metrics?.WorkerStopped(reason);
}
private void CompletePendingCommands(Exception exception) private void CompletePendingCommands(Exception exception)
{ {
foreach (KeyValuePair<string, PendingCommand> item in _pendingCommands.ToArray()) foreach (KeyValuePair<string, PendingCommand> item in _pendingCommands.ToArray())
{ {
if (_pendingCommands.TryRemove(item.Key, out PendingCommand? pendingCommand)) if (_pendingCommands.TryRemove(item.Key, out PendingCommand? pendingCommand))
{ {
ReleasePendingCommandSlot();
TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp); TimeSpan duration = _timeProvider.GetElapsedTime(pendingCommand.StartTimestamp);
_metrics?.CommandFailed(pendingCommand.Method, exception.GetType().Name, duration); _metrics?.CommandFailed(pendingCommand.Method, exception.GetType().Name, duration);
pendingCommand.SetException(exception); pendingCommand.SetException(exception);
@@ -592,6 +630,17 @@ public sealed class WorkerClient : IWorkerClient
} }
} }
/// <summary>
/// Returns one slot to <c>_pendingCommandSlots</c>.
/// </summary>
/// <remarks>
/// This is best effort and never throws <see cref="SemaphoreFullException"/>:
/// that exception means the semaphore is already at its maximum count, i.e. a
/// slot was released without a matching acquire. The condition is logged so
/// pending-command accounting mistakes are visible instead of being silently
/// discarded.
/// </remarks>
private void ReleasePendingCommandSlot()
{
    try
    {
        _pendingCommandSlots.Release();
    }
    catch (SemaphoreFullException exception)
    {
        // Command completion and fault paths must not fail because of an extra
        // release, so the exception is recorded and swallowed.
        _logger.LogWarning(
            exception,
            "Ignored a pending command slot release for session {SessionId} because all slots are already free.",
            SessionId);
    }
}
private void TransitionFromCreatedToHandshaking() private void TransitionFromCreatedToHandshaking()
{ {
lock (_syncRoot) lock (_syncRoot)
@@ -11,4 +11,5 @@ public enum WorkerClientErrorCode
ShutdownTimeout, ShutdownTimeout,
GatewayShutdown, GatewayShutdown,
WriteFailed, WriteFailed,
PendingCommandLimitExceeded,
} }
@@ -12,6 +12,7 @@ public sealed class WorkerClientOptions
HeartbeatCheckInterval = DefaultHeartbeatCheckInterval; HeartbeatCheckInterval = DefaultHeartbeatCheckInterval;
EventChannelCapacity = 1_024; EventChannelCapacity = 1_024;
EventChannelFullModeTimeout = DefaultEventChannelFullModeTimeout; EventChannelFullModeTimeout = DefaultEventChannelFullModeTimeout;
MaxPendingCommands = 128;
} }
public TimeSpan HeartbeatGrace { get; init; } public TimeSpan HeartbeatGrace { get; init; }
@@ -21,4 +22,6 @@ public sealed class WorkerClientOptions
public int EventChannelCapacity { get; init; } public int EventChannelCapacity { get; init; }
public TimeSpan EventChannelFullModeTimeout { get; init; } public TimeSpan EventChannelFullModeTimeout { get; init; }
public int MaxPendingCommands { get; init; }
} }
@@ -96,8 +96,6 @@ public sealed class WorkerProcessLauncher : IWorkerProcessLauncher
startupTimeout.Token) startupTimeout.Token)
.ConfigureAwait(false); .ConfigureAwait(false);
_metrics.WorkerStarted(_timeProvider.GetUtcNow() - startedAt);
return new WorkerProcessHandle(process, commandLine, startedAt); return new WorkerProcessHandle(process, commandLine, startedAt);
} }
catch (OperationCanceledException exception) when (!cancellationToken.IsCancellationRequested) catch (OperationCanceledException exception) when (!cancellationToken.IsCancellationRequested)
@@ -42,8 +42,15 @@ public sealed class DashboardSnapshotServiceTests
DateTimeOffset.Parse("2026-04-26T10:01:00Z")); DateTimeOffset.Parse("2026-04-26T10:01:00Z"));
faultedSession.AttachWorkerClient(new FakeWorkerClient("session-faulted", 1202, WorkerClientState.Faulted)); faultedSession.AttachWorkerClient(new FakeWorkerClient("session-faulted", 1202, WorkerClientState.Faulted));
faultedSession.MarkFaulted("worker pipe disconnected"); faultedSession.MarkFaulted("worker pipe disconnected");
GatewaySession closedSession = CreateSession(
"session-closed",
"client-three",
DateTimeOffset.Parse("2026-04-26T09:59:00Z"));
closedSession.AttachWorkerClient(new FakeWorkerClient("session-closed", 1203, WorkerClientState.Closed));
closedSession.TransitionTo(SessionState.Closed);
registry.TryAdd(activeSession); registry.TryAdd(activeSession);
registry.TryAdd(faultedSession); registry.TryAdd(faultedSession);
registry.TryAdd(closedSession);
using GatewayMetrics metrics = new(); using GatewayMetrics metrics = new();
metrics.SessionOpened(); metrics.SessionOpened();
metrics.SessionOpened(); metrics.SessionOpened();
@@ -55,10 +62,15 @@ public sealed class DashboardSnapshotServiceTests
DashboardSnapshot snapshot = service.GetSnapshot(); DashboardSnapshot snapshot = service.GetSnapshot();
Assert.Equal(2, snapshot.Sessions.Count); Assert.Equal(3, snapshot.Sessions.Count);
Assert.Equal("session-faulted", snapshot.Sessions[0].SessionId); Assert.Equal("session-faulted", snapshot.Sessions[0].SessionId);
Assert.Equal(SessionState.Faulted, snapshot.Sessions[0].State); Assert.Equal(SessionState.Faulted, snapshot.Sessions[0].State);
DashboardSessionSummary activeSummary = Assert.Single(
snapshot.Sessions,
session => session.SessionId == "session-active");
Assert.Equal(1, activeSummary.EventsReceived);
Assert.Equal(2, snapshot.Workers.Count); Assert.Equal(2, snapshot.Workers.Count);
Assert.DoesNotContain(snapshot.Workers, worker => worker.SessionId == "session-closed");
Assert.Contains(snapshot.Metrics, metric => metric.Name == "mxgateway.commands.started" && metric.Value == 1); Assert.Contains(snapshot.Metrics, metric => metric.Name == "mxgateway.commands.started" && metric.Value == 1);
Assert.Contains( Assert.Contains(
snapshot.Metrics, snapshot.Metrics,
@@ -32,6 +32,35 @@ public sealed class SessionManagerTests
Assert.Equal(1, metrics.GetSnapshot().SessionsOpened); Assert.Equal(1, metrics.GetSnapshot().SessionsOpened);
} }
[Fact]
public async Task OpenSessionAsync_GeneratesClientCorrelationIdFromClientNameAndSessionId()
{
    // Arrange: the request carries its own correlation id, which the
    // assertion below expects not to be used.
    SessionOpenRequest openRequest = CreateOpenRequest() with
    {
        ClientSessionName = "rust-load-client",
        ClientCorrelationId = "caller-provided-correlation",
    };
    SessionManager sut = CreateManager(new FakeSessionWorkerClientFactory(new FakeWorkerClient()));

    // Act
    GatewaySession opened = await sut.OpenSessionAsync(openRequest, "client-1", CancellationToken.None);

    // Assert: the id is "<client session name>-<session id>".
    string expectedCorrelationId = $"rust-load-client-{opened.SessionId}";
    Assert.Equal(expectedCorrelationId, opened.ClientCorrelationId);
}
[Fact]
public async Task OpenSessionAsync_WhenClientSessionNameMissing_UsesClientCorrelationPrefix()
{
    // Arrange: an empty session name should trigger the fallback prefix.
    SessionOpenRequest openRequest = CreateOpenRequest() with
    {
        ClientSessionName = "",
    };
    SessionManager sut = CreateManager(new FakeSessionWorkerClientFactory(new FakeWorkerClient()));

    // Act
    GatewaySession opened = await sut.OpenSessionAsync(openRequest, "client-1", CancellationToken.None);

    // Assert: the id is "client-<session id>".
    string expectedCorrelationId = $"client-{opened.SessionId}";
    Assert.Equal(expectedCorrelationId, opened.ClientCorrelationId);
}
[Fact] [Fact]
public async Task InvokeAsync_WhenSessionReady_ForwardsCommandToWorker() public async Task InvokeAsync_WhenSessionReady_ForwardsCommandToWorker()
{ {
@@ -111,7 +140,7 @@ public sealed class SessionManagerTests
} }
[Fact] [Fact]
public async Task CloseSessionAsync_WhenCalledTwice_IsIdempotent() public async Task CloseSessionAsync_RemovesClosedSession()
{ {
FakeWorkerClient workerClient = new(); FakeWorkerClient workerClient = new();
using GatewayMetrics metrics = new(); using GatewayMetrics metrics = new();
@@ -119,12 +148,12 @@ public sealed class SessionManagerTests
GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None); GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None);
SessionCloseResult firstClose = await manager.CloseSessionAsync(session.SessionId, CancellationToken.None); SessionCloseResult firstClose = await manager.CloseSessionAsync(session.SessionId, CancellationToken.None);
SessionCloseResult secondClose = await manager.CloseSessionAsync(session.SessionId, CancellationToken.None); SessionManagerException secondClose = await Assert.ThrowsAsync<SessionManagerException>(
async () => await manager.CloseSessionAsync(session.SessionId, CancellationToken.None));
Assert.False(firstClose.AlreadyClosed); Assert.False(firstClose.AlreadyClosed);
Assert.True(secondClose.AlreadyClosed);
Assert.Equal(SessionState.Closed, firstClose.FinalState); Assert.Equal(SessionState.Closed, firstClose.FinalState);
Assert.Equal(SessionState.Closed, secondClose.FinalState); Assert.Equal(SessionManagerErrorCode.SessionNotFound, secondClose.ErrorCode);
Assert.Equal(1, workerClient.ShutdownCount); Assert.Equal(1, workerClient.ShutdownCount);
Assert.Equal(1, metrics.GetSnapshot().SessionsClosed); Assert.Equal(1, metrics.GetSnapshot().SessionsClosed);
Assert.Equal(0, metrics.GetSnapshot().OpenSessions); Assert.Equal(0, metrics.GetSnapshot().OpenSessions);
@@ -2,6 +2,7 @@ using System.IO.Pipes;
using Google.Protobuf.WellKnownTypes; using Google.Protobuf.WellKnownTypes;
using MxGateway.Contracts; using MxGateway.Contracts;
using MxGateway.Contracts.Proto; using MxGateway.Contracts.Proto;
using MxGateway.Server.Metrics;
using MxGateway.Server.Workers; using MxGateway.Server.Workers;
namespace MxGateway.Tests.Gateway.Workers; namespace MxGateway.Tests.Gateway.Workers;
@@ -152,6 +153,27 @@ public sealed class WorkerClientTests
Assert.Equal(WorkerClientState.Faulted, client.State); Assert.Equal(WorkerClientState.Faulted, client.State);
} }
[Fact]
public async Task ReadLoop_WhenPipeDisconnects_StopsRunningWorkerMetric()
{
    // Arrange: a handshaken client counts as exactly one running worker.
    await using PipePair pipes = await PipePair.CreateAsync();
    using GatewayMetrics gatewayMetrics = new();
    await using WorkerClient workerClient = CreateClient(pipes, metrics: gatewayMetrics);
    await CompleteHandshakeAsync(workerClient, pipes);
    Assert.Equal(1, gatewayMetrics.GetSnapshot().WorkersRunning);

    // Act: drop the worker end of the pipe and wait for the client to fault.
    await pipes.DisposeWorkerSideAsync();
    await WaitUntilAsync(
        () => workerClient.State == WorkerClientState.Faulted,
        TestTimeout);

    // Assert: the running gauge is back to zero and one exit was counted.
    GatewayMetricsSnapshot afterDisconnect = gatewayMetrics.GetSnapshot();
    Assert.Equal(0, afterDisconnect.WorkersRunning);
    Assert.Equal(1, afterDisconnect.WorkerExits);
}
[Fact] [Fact]
public async Task ReadLoop_WhenHeartbeatArrives_UpdatesLastHeartbeatAndWorkerProcess() public async Task ReadLoop_WhenHeartbeatArrives_UpdatesLastHeartbeatAndWorkerProcess()
{ {
@@ -193,7 +215,8 @@ public sealed class WorkerClientTests
private static WorkerClient CreateClient( private static WorkerClient CreateClient(
PipePair pipePair, PipePair pipePair,
WorkerClientOptions? options = null) WorkerClientOptions? options = null,
GatewayMetrics? metrics = null)
{ {
WorkerFrameProtocolOptions frameOptions = new(SessionId); WorkerFrameProtocolOptions frameOptions = new(SessionId);
WorkerClientConnection connection = new( WorkerClientConnection connection = new(
@@ -202,7 +225,7 @@ public sealed class WorkerClientTests
pipePair.GatewayStream, pipePair.GatewayStream,
frameOptions); frameOptions);
return new WorkerClient(connection, options); return new WorkerClient(connection, options, metrics);
} }
private static async Task CompleteHandshakeAsync( private static async Task CompleteHandshakeAsync(
@@ -43,7 +43,7 @@ public sealed class WorkerProcessLauncherTests
Assert.DoesNotContain(Nonce, handle.CommandLine.ToString(), StringComparison.Ordinal); Assert.DoesNotContain(Nonce, handle.CommandLine.ToString(), StringComparison.Ordinal);
Assert.DoesNotContain(Nonce, string.Join(" ", handle.CommandLine.Arguments), StringComparison.Ordinal); Assert.DoesNotContain(Nonce, string.Join(" ", handle.CommandLine.Arguments), StringComparison.Ordinal);
Assert.False(pipeReservation.DisposeCalled); Assert.False(pipeReservation.DisposeCalled);
Assert.Equal(1, metrics.GetSnapshot().WorkersRunning); Assert.Equal(0, metrics.GetSnapshot().WorkersRunning);
} }
[Fact] [Fact]
@@ -17,7 +17,8 @@ public sealed class GatewayMetricsTests
metrics.CommandFailed("WriteSecured", "AuthorizationFailed", TimeSpan.FromMilliseconds(12)); metrics.CommandFailed("WriteSecured", "AuthorizationFailed", TimeSpan.FromMilliseconds(12));
metrics.EventReceived("session-1", "OnDataChange"); metrics.EventReceived("session-1", "OnDataChange");
metrics.EventReceived("session-1", "OnDataChange"); metrics.EventReceived("session-1", "OnDataChange");
metrics.SetEventQueueDepth(7); metrics.SetWorkerEventQueueDepth(7);
metrics.SetGrpcEventStreamQueueDepth(3);
metrics.QueueOverflow("session-events"); metrics.QueueOverflow("session-events");
metrics.Fault("CommandTimeout"); metrics.Fault("CommandTimeout");
metrics.WorkerKilled("CommandTimeout"); metrics.WorkerKilled("CommandTimeout");
@@ -30,7 +31,8 @@ public sealed class GatewayMetricsTests
Assert.Equal(0, snapshot.OpenSessions); Assert.Equal(0, snapshot.OpenSessions);
Assert.Equal(0, snapshot.WorkersRunning); Assert.Equal(0, snapshot.WorkersRunning);
Assert.Equal(7, snapshot.EventQueueDepth); Assert.Equal(7, snapshot.WorkerEventQueueDepth);
Assert.Equal(3, snapshot.GrpcEventStreamQueueDepth);
Assert.Equal(1, snapshot.SessionsOpened); Assert.Equal(1, snapshot.SessionsOpened);
Assert.Equal(1, snapshot.SessionsClosed); Assert.Equal(1, snapshot.SessionsClosed);
Assert.Equal(2, snapshot.CommandsStarted); Assert.Equal(2, snapshot.CommandsStarted);
@@ -45,6 +47,7 @@ public sealed class GatewayMetricsTests
Assert.Equal(1, snapshot.StreamDisconnects); Assert.Equal(1, snapshot.StreamDisconnects);
Assert.Equal(1, snapshot.CommandFailuresByMethod["WriteSecured"]); Assert.Equal(1, snapshot.CommandFailuresByMethod["WriteSecured"]);
Assert.Equal(2, snapshot.EventsByFamily["OnDataChange"]); Assert.Equal(2, snapshot.EventsByFamily["OnDataChange"]);
Assert.Equal(2, snapshot.EventsBySession["session-1"]);
} }
[Fact] [Fact]
@@ -53,7 +56,7 @@ public sealed class GatewayMetricsTests
using GatewayMetrics metrics = new(); using GatewayMetrics metrics = new();
ArgumentOutOfRangeException exception = Assert.Throws<ArgumentOutOfRangeException>( ArgumentOutOfRangeException exception = Assert.Throws<ArgumentOutOfRangeException>(
() => metrics.SetEventQueueDepth(-1)); () => metrics.SetWorkerEventQueueDepth(-1));
Assert.Equal("depth", exception.ParamName); Assert.Equal("depth", exception.ParamName);
} }
@@ -1,4 +1,5 @@
using System; using System;
using System.Collections.Generic;
using System.IO; using System.IO;
using System.IO.Pipes; using System.IO.Pipes;
using System.Threading; using System.Threading;
@@ -228,6 +229,21 @@ public sealed class WorkerPipeClientTests
currentCommandCorrelationId: string.Empty); currentCommandCorrelationId: string.Empty);
} }
/// <summary>Stub implementation: always returns an empty event list.</summary>
/// <param name="maxEvents">Ignored by this stub.</param>
public IReadOnlyList<WorkerEvent> DrainEvents(uint maxEvents) => Array.Empty<WorkerEvent>();
/// <summary>Stub implementation: never reports a fault.</summary>
public WorkerFault? DrainFault() => null;
/// <summary>Stub implementation: never cancels anything and always returns <c>false</c>.</summary>
/// <param name="correlationId">Ignored by this stub.</param>
public bool CancelCommand(string correlationId) => false;
public void RequestShutdown() public void RequestShutdown()
{ {
} }
@@ -238,6 +238,37 @@ public sealed class WorkerPipeSessionTests
await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token); await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token);
} }
/// <summary>
/// Verifies that an event queued on the fake runtime after the handshake is written to the
/// gateway side of the pipe as a <c>WorkerEvent</c> envelope carrying the same family and sequence.
/// </summary>
[Fact]
public async Task RunAsync_WhenRuntimeHasEvents_WritesWorkerEventEnvelope()
{
    // The five-second token bounds the whole test so a missing envelope fails instead of hanging.
    using CancellationTokenSource timeout = new(TimeSpan.FromSeconds(5));
    CancellationToken token = timeout.Token;
    using PipePair pipes = await PipePair.CreateAsync(token);

    FakeRuntimeSession fakeRuntime = new();
    WorkerPipeSessionOptions sessionOptions = new()
    {
        HeartbeatInterval = TimeSpan.FromMilliseconds(100),
        HeartbeatGrace = TimeSpan.FromSeconds(5),
    };
    WorkerPipeSession pipeSession = CreatePipeSession(pipes.WorkerStream, fakeRuntime, sessionOptions);

    Task sessionRun = pipeSession.RunAsync(token);
    await CompleteGatewayHandshakeAsync(pipes, token);

    fakeRuntime.EnqueueEvent(CreateWorkerEvent(sequence: 7));

    WorkerEnvelope received = await ReadUntilAsync(
        pipes.GatewayReader,
        WorkerEnvelope.BodyOneofCase.WorkerEvent,
        token);

    MxEvent payload = received.WorkerEvent.Event;
    Assert.Equal(MxEventFamily.OnDataChange, payload.Family);
    Assert.Equal(7UL, payload.WorkerSequence);

    await SendShutdownAndWaitAsync(pipes, sessionRun, token);
}
[Fact] [Fact]
public async Task RunAsync_WhenStaActivityIsStale_WritesWatchdogFault() public async Task RunAsync_WhenStaActivityIsStale_WritesWatchdogFault()
{ {
@@ -364,6 +395,20 @@ public sealed class WorkerPipeSessionTests
}; };
} }
/// <summary>
/// Builds a <c>WorkerEvent</c> wrapping an OnDataChange <c>MxEvent</c> for the test session.
/// </summary>
/// <param name="sequence">Value assigned to the event's <c>WorkerSequence</c>.</param>
/// <returns>A new worker event with an empty <c>OnDataChangeEvent</c> payload.</returns>
private static WorkerEvent CreateWorkerEvent(ulong sequence)
{
    MxEvent dataChange = new()
    {
        SessionId = SessionId,
        Family = MxEventFamily.OnDataChange,
        WorkerSequence = sequence,
        OnDataChange = new OnDataChangeEvent(),
    };

    return new WorkerEvent { Event = dataChange };
}
private static async Task CompleteGatewayHandshakeAsync( private static async Task CompleteGatewayHandshakeAsync(
PipePair pipePair, PipePair pipePair,
CancellationToken cancellationToken) CancellationToken cancellationToken)
@@ -478,6 +523,7 @@ public sealed class WorkerPipeSessionTests
{ {
private readonly ManualResetEventSlim releaseDispatch = new(false); private readonly ManualResetEventSlim releaseDispatch = new(false);
private readonly object gate = new(); private readonly object gate = new();
private readonly Queue<WorkerEvent> events = new();
private WorkerRuntimeHeartbeatSnapshot snapshot = new( private WorkerRuntimeHeartbeatSnapshot snapshot = new(
DateTimeOffset.UtcNow, DateTimeOffset.UtcNow,
pendingCommandCount: 0, pendingCommandCount: 0,
@@ -550,6 +596,33 @@ public sealed class WorkerPipeSessionTests
} }
} }
/// <summary>
/// Removes queued events in FIFO order and returns them.
/// </summary>
/// <param name="maxEvents">
/// Upper bound on the number of events returned; <c>0</c> means "everything currently queued".
/// </param>
/// <returns>The dequeued events, possibly empty.</returns>
public IReadOnlyList<WorkerEvent> DrainEvents(uint maxEvents)
{
    lock (gate)
    {
        int take = events.Count;
        if (maxEvents != 0)
        {
            // Clamp the unsigned limit into int range before comparing with the queue length.
            int limit = checked((int)Math.Min(maxEvents, int.MaxValue));
            take = Math.Min(events.Count, limit);
        }

        List<WorkerEvent> batch = new(take);
        while (batch.Count < take)
        {
            batch.Add(events.Dequeue());
        }

        return batch;
    }
}
/// <summary>This fake never reports a fault.</summary>
public WorkerFault? DrainFault() => null;
/// <summary>This fake does not support cancellation and always returns <c>false</c>.</summary>
/// <param name="correlationId">Ignored by this fake.</param>
public bool CancelCommand(string correlationId) => false;
public void RequestShutdown() public void RequestShutdown()
{ {
releaseDispatch.Set(); releaseDispatch.Set();
@@ -576,6 +649,14 @@ public sealed class WorkerPipeSessionTests
} }
} }
/// <summary>
/// Queues an event so that a later <c>DrainEvents</c> call returns it.
/// </summary>
/// <param name="workerEvent">Event appended to the end of the pending queue.</param>
public void EnqueueEvent(WorkerEvent workerEvent)
{
    // Same lock as DrainEvents, so enqueue and drain never interleave.
    lock (gate)
    {
        events.Enqueue(workerEvent);
    }
}
public void Dispose() public void Dispose()
{ {
releaseDispatch.Set(); releaseDispatch.Set();
+16 -4
View File
@@ -148,10 +148,22 @@ public sealed class WorkerPipeClient : IWorkerPipeClient
}) })
.Build(); .Build();
return await pipeline.ExecuteAsync( using CancellationTokenSource connectDeadline =
async token => await ConnectSingleAttemptAsync(pipeName, token).ConfigureAwait(false), CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
cancellationToken) connectDeadline.CancelAfter(_connectTimeoutMilliseconds);
.ConfigureAwait(false);
try
{
return await pipeline.ExecuteAsync(
async token => await ConnectSingleAttemptAsync(pipeName, token).ConfigureAwait(false),
connectDeadline.Token)
.ConfigureAwait(false);
}
catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
{
throw new TimeoutException(
$"Worker pipe {pipeName} did not connect within {_connectTimeoutMilliseconds}ms.");
}
} }
private async Task<NamedPipeClientStream> ConnectSingleAttemptAsync( private async Task<NamedPipeClientStream> ConnectSingleAttemptAsync(
+68 -1
View File
@@ -14,6 +14,9 @@ namespace MxGateway.Worker.Ipc;
public sealed class WorkerPipeSession public sealed class WorkerPipeSession
{ {
private static readonly TimeSpan EventDrainInterval = TimeSpan.FromMilliseconds(25);
private const uint EventDrainBatchSize = 128;
private readonly WorkerFrameProtocolOptions _options; private readonly WorkerFrameProtocolOptions _options;
private readonly Func<int> _processIdProvider; private readonly Func<int> _processIdProvider;
private readonly Func<IWorkerRuntimeSession> _runtimeSessionFactory; private readonly Func<IWorkerRuntimeSession> _runtimeSessionFactory;
@@ -206,17 +209,22 @@ public sealed class WorkerPipeSession
using CancellationTokenSource heartbeatCancellation = CancellationTokenSource using CancellationTokenSource heartbeatCancellation = CancellationTokenSource
.CreateLinkedTokenSource(cancellationToken); .CreateLinkedTokenSource(cancellationToken);
Task heartbeatTask = RunHeartbeatLoopAsync(heartbeatCancellation.Token); Task heartbeatTask = RunHeartbeatLoopAsync(heartbeatCancellation.Token);
Task eventDrainTask = RunEventDrainLoopAsync(heartbeatCancellation.Token);
try try
{ {
while (!cancellationToken.IsCancellationRequested) while (!cancellationToken.IsCancellationRequested)
{ {
Task<WorkerEnvelope> readTask = _reader.ReadAsync(cancellationToken); Task<WorkerEnvelope> readTask = _reader.ReadAsync(cancellationToken);
Task completedTask = await Task.WhenAny(readTask, heartbeatTask).ConfigureAwait(false); Task completedTask = await Task.WhenAny(readTask, heartbeatTask, eventDrainTask).ConfigureAwait(false);
if (completedTask == heartbeatTask) if (completedTask == heartbeatTask)
{ {
await heartbeatTask.ConfigureAwait(false); await heartbeatTask.ConfigureAwait(false);
} }
else if (completedTask == eventDrainTask)
{
await eventDrainTask.ConfigureAwait(false);
}
WorkerEnvelope envelope = await readTask.ConfigureAwait(false); WorkerEnvelope envelope = await readTask.ConfigureAwait(false);
bool keepReading = await DispatchGatewayEnvelopeAsync(envelope, cancellationToken).ConfigureAwait(false); bool keepReading = await DispatchGatewayEnvelopeAsync(envelope, cancellationToken).ConfigureAwait(false);
@@ -236,6 +244,52 @@ public sealed class WorkerPipeSession
catch (OperationCanceledException) catch (OperationCanceledException)
{ {
} }
try
{
await eventDrainTask.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
}
}
}
/// <summary>
/// Repeatedly drains the runtime session and writes each drained worker event to the pipe
/// writer until <paramref name="cancellationToken"/> is canceled.
/// </summary>
/// <remarks>
/// Each pass checks <c>DrainFault</c> before <c>DrainEvents</c>. A fault sets the worker state to
/// <c>Faulted</c>, is handed to <c>TryWriteFaultAsync</c>, and then ends the loop by throwing.
/// When no runtime session exists yet, or nothing was drained, the loop waits
/// <c>EventDrainInterval</c> before trying again.
/// </remarks>
/// <param name="cancellationToken">Stops the loop and cancels pending delays and writes.</param>
/// <exception cref="InvalidOperationException">The runtime session reported a fault.</exception>
private async Task RunEventDrainLoopAsync(CancellationToken cancellationToken)
{
    Task IdleAsync() => Task.Delay(EventDrainInterval, cancellationToken);

    while (!cancellationToken.IsCancellationRequested)
    {
        // Snapshot the field once so the whole pass works against a single session instance.
        IWorkerRuntimeSession? session = _runtimeSession;
        if (session is null)
        {
            await IdleAsync().ConfigureAwait(false);
            continue;
        }

        WorkerFault? pendingFault = session.DrainFault();
        if (pendingFault is not null)
        {
            _state = WorkerState.Faulted;
            await TryWriteFaultAsync(pendingFault, cancellationToken).ConfigureAwait(false);

            string message = string.IsNullOrWhiteSpace(pendingFault.DiagnosticMessage)
                ? $"MXAccess event queue faulted with category {pendingFault.Category}."
                : pendingFault.DiagnosticMessage;
            throw new InvalidOperationException(message);
        }

        IReadOnlyList<WorkerEvent> batch = session.DrainEvents(EventDrainBatchSize);
        if (batch.Count == 0)
        {
            await IdleAsync().ConfigureAwait(false);
            continue;
        }

        // Events are written one at a time, in the order the session returned them.
        foreach (WorkerEvent pendingEvent in batch)
        {
            WorkerEnvelope envelope = CreateEnvelope(pendingEvent);
            await _writer.WriteAsync(envelope, cancellationToken).ConfigureAwait(false);
        }
    }
}
@@ -252,6 +306,7 @@ public sealed class WorkerPipeSession
await ShutdownAsync(envelope.WorkerShutdown, cancellationToken).ConfigureAwait(false); await ShutdownAsync(envelope.WorkerShutdown, cancellationToken).ConfigureAwait(false);
return false; return false;
case WorkerEnvelope.BodyOneofCase.WorkerCancel: case WorkerEnvelope.BodyOneofCase.WorkerCancel:
_runtimeSession?.CancelCommand(envelope.CorrelationId);
return true; return true;
default: default:
throw new WorkerFrameProtocolException( throw new WorkerFrameProtocolException(
@@ -461,6 +516,11 @@ public sealed class WorkerPipeSession
return CreateBaseEnvelope(reply); return CreateBaseEnvelope(reply);
} }
/// <summary>Wraps a worker event in an envelope built by <c>CreateBaseEnvelope</c>.</summary>
/// <param name="workerEvent">Event to place in the envelope body.</param>
private WorkerEnvelope CreateEnvelope(WorkerEvent workerEvent) => CreateBaseEnvelope(workerEvent);
private WorkerEnvelope CreateEnvelope(WorkerShutdownAck shutdownAck) private WorkerEnvelope CreateEnvelope(WorkerShutdownAck shutdownAck)
{ {
return CreateBaseEnvelope(shutdownAck); return CreateBaseEnvelope(shutdownAck);
@@ -500,6 +560,13 @@ public sealed class WorkerPipeSession
return envelope; return envelope;
} }
/// <summary>
/// Creates a base envelope and sets its <c>WorkerEvent</c> body to <paramref name="body"/>.
/// </summary>
/// <param name="body">Worker event carried by the envelope.</param>
/// <returns>The populated envelope.</returns>
private WorkerEnvelope CreateBaseEnvelope(WorkerEvent body)
{
    WorkerEnvelope result = CreateBaseEnvelope();
    result.WorkerEvent = body;
    return result;
}
private WorkerEnvelope CreateBaseEnvelope(WorkerShutdownAck body) private WorkerEnvelope CreateBaseEnvelope(WorkerShutdownAck body)
{ {
WorkerEnvelope envelope = CreateBaseEnvelope(); WorkerEnvelope envelope = CreateBaseEnvelope();
@@ -1,4 +1,5 @@
using System; using System;
using System.Collections.Generic;
using System.Threading; using System.Threading;
using System.Threading.Tasks; using System.Threading.Tasks;
using MxGateway.Contracts.Proto; using MxGateway.Contracts.Proto;
@@ -17,6 +18,12 @@ public interface IWorkerRuntimeSession : IDisposable
WorkerRuntimeHeartbeatSnapshot CaptureHeartbeat(); WorkerRuntimeHeartbeatSnapshot CaptureHeartbeat();
/// <summary>
/// Returns worker events that are ready to be forwarded. The pipe session calls this with a
/// batch size and writes each returned event to the gateway.
/// </summary>
/// <param name="maxEvents">
/// Upper bound on the number of events to return.
/// NOTE(review): the test fake treats 0 as "no limit"; confirm the production queue agrees.
/// </param>
IReadOnlyList<WorkerEvent> DrainEvents(uint maxEvents);
/// <summary>
/// Returns a pending fault, or <c>null</c> when there is none. The pipe session treats a
/// non-null result as fatal: it marks the worker faulted, reports the fault, and stops draining.
/// </summary>
WorkerFault? DrainFault();
/// <summary>
/// Requests cancellation of the command identified by <paramref name="correlationId"/>.
/// The pipe session calls this when it receives a <c>WorkerCancel</c> envelope.
/// </summary>
/// <param name="correlationId">Correlation id taken from the cancel envelope.</param>
/// <returns>
/// Presumably <c>true</c> when a command was actually canceled — the STA session returns the
/// dispatcher's result, or <c>false</c> when it has no dispatcher; confirm for other implementations.
/// </returns>
bool CancelCommand(string correlationId);
void RequestShutdown(); void RequestShutdown();
Task<MxAccessShutdownResult> ShutdownGracefullyAsync( Task<MxAccessShutdownResult> ShutdownGracefullyAsync(
@@ -14,6 +14,7 @@ public sealed class MxAccessEventQueue
private readonly object syncRoot = new(); private readonly object syncRoot = new();
private ulong lastEventSequence; private ulong lastEventSequence;
private WorkerFault? fault; private WorkerFault? fault;
private bool faultDrained;
public MxAccessEventQueue() public MxAccessEventQueue()
: this(DefaultCapacity) : this(DefaultCapacity)
@@ -163,6 +164,20 @@ public sealed class MxAccessEventQueue
} }
} }
/// <summary>
/// Returns a copy of the recorded fault the first time it is requested; every later call
/// (and every call while no fault is recorded) returns <c>null</c>.
/// </summary>
/// <returns>A clone of the fault, or <c>null</c>.</returns>
public WorkerFault? DrainFault()
{
    lock (syncRoot)
    {
        if (fault is not null && !faultDrained)
        {
            // Mark as handed out so the same fault is reported only once.
            faultDrained = true;
            return fault.Clone();
        }

        return null;
    }
}
private WorkerFault CreateOverflowFault() private WorkerFault CreateOverflowFault()
{ {
string message = $"MXAccess outbound event queue reached capacity {capacity}."; string message = $"MXAccess outbound event queue reached capacity {capacity}.";
@@ -79,7 +79,14 @@ public sealed class MxAccessSession : IDisposable
} }
catch (Exception exception) catch (Exception exception)
{ {
eventSink.Detach(); try
{
eventSink.Detach();
}
catch
{
// Preserve the creation failure while still releasing the COM object below.
}
if (mxAccessComObject is not null && Marshal.IsComObject(mxAccessComObject)) if (mxAccessComObject is not null && Marshal.IsComObject(mxAccessComObject))
{ {
@@ -535,13 +542,15 @@ public sealed class MxAccessSession : IDisposable
private void DisposeCore(ICollection<MxAccessShutdownFailure>? failures) private void DisposeCore(ICollection<MxAccessShutdownFailure>? failures)
{ {
Exception? detachException = null;
try try
{ {
eventSink.Detach(); eventSink.Detach();
} }
catch (Exception exception) when (failures is not null) catch (Exception exception)
{ {
failures.Add(new MxAccessShutdownFailure( detachException = exception;
failures?.Add(new MxAccessShutdownFailure(
"DetachEvents", "DetachEvents",
serverHandle: null, serverHandle: null,
itemHandle: null, itemHandle: null,
@@ -565,6 +574,10 @@ public sealed class MxAccessSession : IDisposable
} }
disposed = true; disposed = true;
if (detachException is not null && failures is null)
{
throw detachException;
}
} }
private void ThrowIfDisposed() private void ThrowIfDisposed()
@@ -127,6 +127,16 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession
return eventQueue.Drain(maxEvents); return eventQueue.Drain(maxEvents);
} }
/// <summary>Forwards to the event queue's <c>DrainFault</c>.</summary>
/// <returns>The fault returned by the event queue, or <c>null</c>.</returns>
public WorkerFault? DrainFault() => eventQueue.DrainFault();
/// <summary>
/// Asks the command dispatcher to cancel a queued command with the given correlation id.
/// </summary>
/// <param name="correlationId">Correlation id of the command to cancel.</param>
/// <returns>
/// The dispatcher's result, or <c>false</c> when no dispatcher is available.
/// </returns>
public bool CancelCommand(string correlationId) =>
    commandDispatcher?.CancelQueuedCommand(correlationId) ?? false;
public Task<IReadOnlyList<RegisteredServerHandle>> GetRegisteredServerHandlesAsync( public Task<IReadOnlyList<RegisteredServerHandle>> GetRegisteredServerHandlesAsync(
CancellationToken cancellationToken = default) CancellationToken cancellationToken = default)
{ {
@@ -207,7 +217,14 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession
throw new TimeoutException($"MXAccess graceful shutdown exceeded {timeout}."); throw new TimeoutException($"MXAccess graceful shutdown exceeded {timeout}.");
} }
result = await cleanupTask.ConfigureAwait(false); try
{
result = await cleanupTask.ConfigureAwait(false);
}
catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
{
throw new TimeoutException($"MXAccess graceful shutdown exceeded {timeout}.");
}
} }
TimeSpan remaining = timeout - stopwatch.Elapsed; TimeSpan remaining = timeout - stopwatch.Elapsed;
@@ -232,7 +249,17 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession
if (session is not null) if (session is not null)
{ {
staRuntime.InvokeAsync(() => session.Dispose()).GetAwaiter().GetResult(); try
{
staRuntime.InvokeAsync(() => session.Dispose())
.Wait(TimeSpan.FromSeconds(2));
}
catch (AggregateException)
{
}
catch (ObjectDisposedException)
{
}
} }
staRuntime.Dispose(); staRuntime.Dispose();
@@ -8,10 +8,13 @@ namespace MxGateway.Worker.Sta;
public sealed class StaCommandDispatcher public sealed class StaCommandDispatcher
{ {
public const int DefaultMaxPendingCommands = 128;
private readonly HResultConverter hresultConverter; private readonly HResultConverter hresultConverter;
private readonly IStaCommandExecutor commandExecutor; private readonly IStaCommandExecutor commandExecutor;
private readonly Queue<QueuedStaCommand> commandQueue = new(); private readonly Queue<QueuedStaCommand> commandQueue = new();
private readonly StaRuntime staRuntime; private readonly StaRuntime staRuntime;
private readonly int maxPendingCommands;
private readonly object gate = new(); private readonly object gate = new();
private bool drainActive; private bool drainActive;
private bool shutdownRequested; private bool shutdownRequested;
@@ -28,10 +31,27 @@ public sealed class StaCommandDispatcher
StaRuntime staRuntime, StaRuntime staRuntime,
IStaCommandExecutor commandExecutor, IStaCommandExecutor commandExecutor,
HResultConverter hresultConverter) HResultConverter hresultConverter)
: this(staRuntime, commandExecutor, hresultConverter, DefaultMaxPendingCommands)
{ {
}
/// <summary>
/// Creates a dispatcher with an explicit cap on the number of pending commands.
/// </summary>
/// <param name="staRuntime">STA runtime used by the dispatcher.</param>
/// <param name="commandExecutor">Executor that runs the commands.</param>
/// <param name="hresultConverter">Converter for HRESULT values.</param>
/// <param name="maxPendingCommands">Maximum number of queued commands; must be positive.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maxPendingCommands"/> is zero or negative.
/// </exception>
/// <exception cref="ArgumentNullException">A required dependency is <c>null</c>.</exception>
public StaCommandDispatcher(
    StaRuntime staRuntime,
    IStaCommandExecutor commandExecutor,
    HResultConverter hresultConverter,
    int maxPendingCommands)
{
    // Validation order matches the original: range first, then dependencies left to right.
    if (maxPendingCommands < 1)
    {
        throw new ArgumentOutOfRangeException(
            nameof(maxPendingCommands),
            "Max pending STA commands must be greater than zero.");
    }

    if (staRuntime is null)
    {
        throw new ArgumentNullException(nameof(staRuntime));
    }

    if (commandExecutor is null)
    {
        throw new ArgumentNullException(nameof(commandExecutor));
    }

    if (hresultConverter is null)
    {
        throw new ArgumentNullException(nameof(hresultConverter));
    }

    this.staRuntime = staRuntime;
    this.commandExecutor = commandExecutor;
    this.hresultConverter = hresultConverter;
    this.maxPendingCommands = maxPendingCommands;
}
public int PendingCommandCount public int PendingCommandCount
@@ -73,6 +93,14 @@ public sealed class StaCommandDispatcher
"The STA command dispatcher is shutting down.")); "The STA command dispatcher is shutting down."));
} }
if (commandQueue.Count >= maxPendingCommands)
{
return Task.FromResult(CreateRejectedReply(
command,
ProtocolStatusCode.WorkerUnavailable,
$"The STA command dispatcher already has {maxPendingCommands} pending command(s)."));
}
QueuedStaCommand queuedCommand = new(command); QueuedStaCommand queuedCommand = new(command);
commandQueue.Enqueue(queuedCommand); commandQueue.Enqueue(queuedCommand);
@@ -86,6 +114,51 @@ public sealed class StaCommandDispatcher
} }
} }
/// <summary>
/// Cancels the first queued command whose correlation id matches, completing it with a
/// <c>Canceled</c> rejection reply. Only commands still in the queue are considered.
/// </summary>
/// <param name="correlationId">Correlation id to look for (ordinal comparison).</param>
/// <returns>
/// <c>true</c> when a queued command was found and canceled; <c>false</c> when the id is
/// null/blank, the queue is empty, or no queued command matches.
/// </returns>
public bool CancelQueuedCommand(string correlationId)
{
    if (string.IsNullOrWhiteSpace(correlationId))
    {
        return false;
    }

    lock (gate)
    {
        bool canceled = false;
        List<QueuedStaCommand> survivors = new(commandQueue.Count);

        // Empty the queue, setting aside everything except the first match.
        while (commandQueue.Count > 0)
        {
            QueuedStaCommand candidate = commandQueue.Dequeue();
            bool isTarget = !canceled
                && string.Equals(
                    candidate.Command.CorrelationId,
                    correlationId,
                    StringComparison.Ordinal);

            if (!isTarget)
            {
                survivors.Add(candidate);
                continue;
            }

            candidate.Complete(CreateRejectedReply(
                candidate.Command,
                ProtocolStatusCode.Canceled,
                "The STA command was canceled before execution."));
            canceled = true;
        }

        // Put the remaining commands back in their original order.
        foreach (QueuedStaCommand survivor in survivors)
        {
            commandQueue.Enqueue(survivor);
        }

        return canceled;
    }
}
public void RequestShutdown() public void RequestShutdown()
{ {
lock (gate) lock (gate)