Improve gateway reliability and dashboard docs

This commit is contained in:
Joseph Doherty
2026-04-28 00:13:22 -04:00
parent bd4a09a35e
commit 4fc355b357
61 changed files with 1722 additions and 150 deletions
@@ -21,6 +21,11 @@ public static class DashboardDisplay
return string.IsNullOrWhiteSpace(value) ? "-" : value;
}
public static string Count(long value)
{
return value.ToString("N0", System.Globalization.CultureInfo.InvariantCulture);
}
public static long MetricValue(DashboardSnapshot snapshot, string name, string? dimension = null)
{
return snapshot.Metrics.FirstOrDefault(metric =>
@@ -20,13 +20,13 @@ else
<section class="metric-grid">
<MetricCard Label="Uptime" Value="@DashboardDisplay.Duration(Snapshot.GatewayUptime)" Detail="@Snapshot.GatewayVersion" />
<MetricCard Label="Open Sessions" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Workers Running" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Commands Failed" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Faults" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Open Sessions" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.sessions.open"))" />
<MetricCard Label="Workers Running" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.workers.running"))" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
<MetricCard Label="Commands Failed" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.commands.failed"))" />
<MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
<MetricCard Label="Faults" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.faults"))" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
</section>
<section class="dashboard-section">
@@ -18,10 +18,11 @@ else
</div>
<section class="metric-grid compact">
<MetricCard Label="Events Received" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Event Queue Depth" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.queue.depth").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected").ToString(System.Globalization.CultureInfo.InvariantCulture)" />
<MetricCard Label="Events Received" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.received"))" />
<MetricCard Label="Worker Event Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.worker_queue.depth"))" />
<MetricCard Label="Stream Queue Depth" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.events.grpc_stream_queue.depth"))" />
<MetricCard Label="Queue Overflows" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.queues.overflows"))" />
<MetricCard Label="Stream Disconnects" Value="@DashboardDisplay.Count(DashboardDisplay.MetricValue(Snapshot, "mxgateway.grpc.streams.disconnected"))" />
</section>
<section class="dashboard-section">
@@ -47,7 +48,7 @@ else
{
<tr>
<td>@metric.Dimension</td>
<td>@metric.Value</td>
<td>@DashboardDisplay.Count(metric.Value)</td>
</tr>
}
</tbody>
@@ -39,6 +39,7 @@ else
<tr><th scope="row">Opened</th><td>@DashboardDisplay.DateTime(CurrentSession.OpenedAt)</td></tr>
<tr><th scope="row">Last activity</th><td>@DashboardDisplay.DateTime(CurrentSession.LastClientActivityAt)</td></tr>
<tr><th scope="row">Lease expires</th><td>@DashboardDisplay.DateTime(CurrentSession.LeaseExpiresAt)</td></tr>
<tr><th scope="row">Events received</th><td>@DashboardDisplay.Count(CurrentSession.EventsReceived)</td></tr>
<tr><th scope="row">Last fault</th><td>@DashboardDisplay.Text(CurrentSession.LastFault)</td></tr>
</tbody>
</table>
@@ -33,6 +33,7 @@ else
<th scope="col">Client</th>
<th scope="col">Backend</th>
<th scope="col">Worker</th>
<th scope="col">Events</th>
<th scope="col">Opened</th>
<th scope="col">Activity</th>
<th scope="col">Heartbeat</th>
@@ -54,6 +55,7 @@ else
<span class="ms-1"><StatusBadge Text="@session.WorkerState.ToString()" /></span>
}
</td>
<td>@DashboardDisplay.Count(session.EventsReceived)</td>
<td>@DashboardDisplay.DateTime(session.OpenedAt)</td>
<td>@DashboardDisplay.DateTime(session.LastClientActivityAt)</td>
<td>@DashboardDisplay.DateTime(session.LastWorkerHeartbeatAt)</td>
@@ -16,4 +16,5 @@ public sealed record DashboardSessionSummary(
int? WorkerProcessId,
WorkerClientState? WorkerState,
DateTimeOffset? LastWorkerHeartbeatAt,
long EventsReceived,
string? LastFault);
@@ -45,15 +45,15 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
IReadOnlyList<GatewaySession> sessions = _sessionRegistry.Snapshot()
.OrderByDescending(session => session.OpenedAt)
.ToArray();
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
IReadOnlyList<DashboardSessionSummary> sessionSummaries = sessions
.Take(ResolveLimit(_recentSessionLimit))
.Select(CreateSessionSummary)
.Select(session => CreateSessionSummary(session, metricsSnapshot))
.ToArray();
IReadOnlyList<DashboardWorkerSummary> workerSummaries = sessions
.Where(session => session.WorkerClient is not null)
.Where(session => session.WorkerClient is { State: not WorkerClientState.Closed })
.Select(CreateWorkerSummary)
.ToArray();
GatewayMetricsSnapshot metricsSnapshot = _metrics.GetSnapshot();
return new DashboardSnapshot(
GeneratedAt: generatedAt,
@@ -100,9 +100,12 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
}
}
private static DashboardSessionSummary CreateSessionSummary(GatewaySession session)
private static DashboardSessionSummary CreateSessionSummary(
GatewaySession session,
GatewayMetricsSnapshot metricsSnapshot)
{
IWorkerClient? workerClient = session.WorkerClient;
metricsSnapshot.EventsBySession.TryGetValue(session.SessionId, out long eventsReceived);
return new DashboardSessionSummary(
SessionId: session.SessionId,
@@ -117,6 +120,7 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
WorkerProcessId: workerClient?.ProcessId,
WorkerState: workerClient?.State,
LastWorkerHeartbeatAt: workerClient?.LastHeartbeatAt,
EventsReceived: eventsReceived,
LastFault: DashboardRedactor.Redact(session.FinalFault));
}
@@ -138,7 +142,8 @@ public sealed class DashboardSnapshotService : IDashboardSnapshotService
[
new("mxgateway.sessions.open", snapshot.OpenSessions),
new("mxgateway.workers.running", snapshot.WorkersRunning),
new("mxgateway.events.queue.depth", snapshot.EventQueueDepth),
new("mxgateway.events.worker_queue.depth", snapshot.WorkerEventQueueDepth),
new("mxgateway.events.grpc_stream_queue.depth", snapshot.GrpcEventStreamQueueDepth),
new("mxgateway.sessions.opened", snapshot.SessionsOpened),
new("mxgateway.sessions.closed", snapshot.SessionsClosed),
new("mxgateway.commands.started", snapshot.CommandsStarted),