Compare commits

..

15 Commits

Author SHA1 Message Date
Joseph Doherty 679562e5ed Merge feat/telemetry-followons: telemetry follow-ons for MxAccessGateway
Metric normalization: meter MxGateway.Server -> ZB.MOM.WW.MxGateway and the 3
duration histograms ms -> s (safe: never Prometheus-exported before). Config-driven
OTLP exporter opt-in (default Prometheus). Metrics.md synced; doc-review artifacts
gitignored.
2026-06-01 17:17:31 -04:00
Joseph Doherty dbf550da8b docs(mxgateway): sync Metrics.md to renamed meter + seconds histogram units 2026-06-01 16:48:46 -04:00
Joseph Doherty 3965a7741e feat(mxgateway): config-driven OTLP exporter opt-in (default Prometheus) 2026-06-01 16:44:40 -04:00
Joseph Doherty abb2cfb84b feat(mxgateway): normalize metrics — meter ZB.MOM.WW.MxGateway + histograms in seconds 2026-06-01 16:39:56 -04:00
Joseph Doherty 4e0d8ccfed chore(mxgateway): gitignore CommentChecker doc-review artifacts 2026-06-01 16:34:46 -04:00
Joseph Doherty a935aa8b7c Merge feat/adopt-zb-telemetry: adopt ZB.MOM.WW.Telemetry across MxAccessGateway
Full MEL->Serilog migration via AddZbSerilog; GatewayLogRedactor exposed through
the shared ILogRedactor seam; GatewayMetrics now exports via AddZbTelemetry + new
/metrics (meter name MxGateway.Server + ms histogram units unchanged; rename/unit
conversion deferred). Behaviour-preserving.
2026-06-01 16:05:41 -04:00
Joseph Doherty 9912389fa1 feat(mxgateway): export GatewayMetrics via AddZbTelemetry + /metrics (name/units unchanged) 2026-06-01 15:53:46 -04:00
Joseph Doherty f1129b969d feat(mxgateway): expose GatewayLogRedactor via shared ILogRedactor seam 2026-06-01 15:49:32 -04:00
Joseph Doherty c51b6f9ce4 feat(mxgateway): adopt AddZbSerilog — MEL→Serilog provider swap (behaviour-preserving) 2026-06-01 15:43:10 -04:00
Joseph Doherty e39972357b config(mxgateway): translate MEL Logging section to Serilog 2026-06-01 15:32:38 -04:00
Joseph Doherty 9ad17e2964 build(mxgateway): reference ZB.MOM.WW.Telemetry + Serilog packages 2026-06-01 15:29:43 -04:00
Joseph Doherty ef0a883a81 Merge feat/adopt-zb-health: ZB.MOM.WW.Health adoption + TLS auto-cert/lenient-client-trust feature 2026-06-01 14:09:24 -04:00
Joseph Doherty 62ba5e9487 feat: map canonical ZB health tiers; replace bypassing /health/live 2026-06-01 13:44:13 -04:00
Joseph Doherty 136614be94 feat: add AuthStoreHealthCheck readiness probe 2026-06-01 13:33:54 -04:00
Joseph Doherty a912bffad5 build: reference ZB.MOM.WW.Health from the Gitea feed 2026-06-01 13:29:39 -04:00
13 changed files with 264 additions and 38 deletions
+5
View File
@@ -147,3 +147,8 @@ generated-scratch/
# Keep empty directories with .gitkeep files when needed
!.gitkeep
# Documentation review artifacts (CommentChecker output)
*-docs-issues.md
*-docs-fixed.md
*-docs-final.md
+6 -6
View File
@@ -4,7 +4,7 @@ The metrics subsystem exposes counters, histograms, and observable gauges that d
## Overview
`GatewayMetrics` is a singleton (registered in `GatewayApplication.cs`) that owns a single `Meter` named `ZB.MOM.WW.MxGateway.Server` and a set of synchronised counters, histograms, and observable gauges. Subsystems call typed mutator methods (`SessionOpened`, `CommandFailed`, `EventReceived`, etc.) rather than touching the `Meter` directly, which keeps the OpenTelemetry instrument names and tag conventions in one place. A `lock (_syncRoot)` block guards the scalar fields used by `GetSnapshot`, while per-event maps use `ConcurrentDictionary<string, long>` so the hot event path avoids the lock.
`GatewayMetrics` is a singleton (registered in `GatewayApplication.cs`) that owns a single `Meter` named `ZB.MOM.WW.MxGateway` and a set of synchronised counters, histograms, and observable gauges. Subsystems call typed mutator methods (`SessionOpened`, `CommandFailed`, `EventReceived`, etc.) rather than touching the `Meter` directly, which keeps the OpenTelemetry instrument names and tag conventions in one place. A `lock (_syncRoot)` block guards the scalar fields used by `GetSnapshot`, while per-event maps use `ConcurrentDictionary<string, long>` so the hot event path avoids the lock.
## Meter and OpenTelemetry Compatibility
@@ -13,7 +13,7 @@ The meter name is exposed as a constant so that hosting code can register it wit
```csharp
public sealed class GatewayMetrics : IDisposable
{
public const string MeterName = "ZB.MOM.WW.MxGateway.Server";
public const string MeterName = "ZB.MOM.WW.MxGateway";
public GatewayMetrics()
{
@@ -50,12 +50,12 @@ All counters are `Counter<long>`. Tag values come from the call sites listed und
### Histograms
Histograms record durations in milliseconds (the `unit` argument on `CreateHistogram`):
Histograms record durations in seconds (the `unit` argument on `CreateHistogram`):
```csharp
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "s");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "s");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "s");
```
| Instrument | Tags | What it measures |
+21
View File
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<packageSources>
<clear />
<add key="nuget.org" value="https://api.nuget.org/v3/index.json" />
<add key="dohertj2-gitea" value="https://gitea.dohertylan.com/api/packages/dohertj2/nuget/index.json" />
</packageSources>
<!-- nuget.org serves everything; the Gitea feed serves only the ZB.MOM.WW.* shared libs.
Credentials are NOT committed: they are provided per-developer at the user level. -->
<packageSourceMapping>
<packageSource key="nuget.org">
<package pattern="*" />
</packageSource>
<packageSource key="dohertj2-gitea">
<package pattern="ZB.MOM.WW.Health" />
<package pattern="ZB.MOM.WW.Health.*" />
<package pattern="ZB.MOM.WW.Telemetry" />
<package pattern="ZB.MOM.WW.Telemetry.*" />
</packageSource>
</packageSourceMapping>
</configuration>
@@ -0,0 +1,40 @@
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Diagnostics.HealthChecks;
using ZB.MOM.WW.MxGateway.Server.Security.Authentication;
namespace ZB.MOM.WW.MxGateway.Server.Diagnostics;
/// <summary>
/// Readiness probe: verifies the SQLite authentication store is reachable. The gateway
/// authenticates every gRPC call against this store, so its reachability gates readiness.
/// </summary>
public sealed class AuthStoreHealthCheck : IHealthCheck
{
private readonly AuthSqliteConnectionFactory _connectionFactory;
public AuthStoreHealthCheck(AuthSqliteConnectionFactory connectionFactory) =>
_connectionFactory = connectionFactory ?? throw new ArgumentNullException(nameof(connectionFactory));
public async Task<HealthCheckResult> CheckHealthAsync(
HealthCheckContext context,
CancellationToken cancellationToken = default)
{
try
{
await using SqliteConnection connection =
await _connectionFactory.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
await using SqliteCommand command = connection.CreateCommand();
command.CommandText = "SELECT 1;";
await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return HealthCheckResult.Healthy("Auth store is reachable.");
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
throw;
}
catch (Exception ex)
{
return HealthCheckResult.Unhealthy("Auth store is unreachable.", ex);
}
}
}
@@ -0,0 +1,28 @@
using ZB.MOM.WW.Telemetry.Serilog;
namespace ZB.MOM.WW.MxGateway.Server.Diagnostics;
/// <summary>
/// Adapts the static <see cref="GatewayLogRedactor"/> to the shared <see cref="ILogRedactor"/> seam
/// so the telemetry RedactionEnricher masks API-key/credential material on every log event.
/// </summary>
public sealed class GatewayLogRedactorSeam : ILogRedactor
{
private static readonly string[] IdentityKeys = ["ClientIdentity", "authorization", "Authorization"];
/// <summary>
/// Masks API-key/credential material in known identity-bearing log properties.
/// </summary>
/// <param name="properties">The log event property dictionary to redact in place.</param>
public void Redact(IDictionary<string, object?> properties)
{
ArgumentNullException.ThrowIfNull(properties);
foreach (var key in IdentityKeys)
{
if (properties.TryGetValue(key, out var value) && value is string s)
{
properties[key] = GatewayLogRedactor.RedactClientIdentity(s);
}
}
}
}
@@ -2,6 +2,7 @@ using System.Security.Cryptography.X509Certificates;
using Microsoft.AspNetCore.Hosting.StaticWebAssets;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Configuration;
using ZB.MOM.WW.Health;
using ZB.MOM.WW.MxGateway.Contracts;
using ZB.MOM.WW.MxGateway.Server.Alarms;
using ZB.MOM.WW.MxGateway.Server.Configuration;
@@ -14,6 +15,8 @@ using ZB.MOM.WW.MxGateway.Server.Security.Authentication;
using ZB.MOM.WW.MxGateway.Server.Security.Authorization;
using ZB.MOM.WW.MxGateway.Server.Sessions;
using ZB.MOM.WW.MxGateway.Server.Workers;
using ZB.MOM.WW.Telemetry;
using ZB.MOM.WW.Telemetry.Serilog;
namespace ZB.MOM.WW.MxGateway.Server;
@@ -60,11 +63,28 @@ public static class GatewayApplication
ConfigureSelfSignedTls(builder);
builder.AddZbSerilog(o => o.ServiceName = "mxgateway");
builder.Services.AddGatewayConfiguration();
builder.Services.AddSqliteAuthStore();
builder.Services.AddGatewayGrpcAuthorization();
builder.Services.AddHealthChecks();
builder.Services.AddHealthChecks()
.AddTypeActivatedCheck<AuthStoreHealthCheck>(
"auth-store",
failureStatus: null,
tags: new[] { ZbHealthTags.Ready });
builder.Services.AddSingleton<GatewayMetrics>();
builder.AddZbTelemetry(o =>
{
o.ServiceName = "mxgateway";
o.Meters = [GatewayMetrics.MeterName]; // "MxGateway.Server" — name unchanged
if (Enum.TryParse<ZbExporter>(builder.Configuration["MxGateway:Telemetry:Exporter"], ignoreCase: true, out var exporter))
o.Exporter = exporter;
var otlp = builder.Configuration["MxGateway:Telemetry:OtlpEndpoint"];
if (!string.IsNullOrWhiteSpace(otlp))
o.OtlpEndpoint = otlp;
});
builder.Services.AddSingleton<ILogRedactor, GatewayLogRedactorSeam>();
builder.Services.AddSingleton<MxAccessGrpcMapper>();
builder.Services.AddSingleton<MxAccessGrpcRequestValidator>();
builder.Services.AddSingleton<IEventStreamService, EventStreamService>();
@@ -169,13 +189,8 @@ public static class GatewayApplication
{
endpoints.MapStaticAssets(ResolveStaticAssetsManifestPath());
endpoints.MapGet(
"/health/live",
() => Results.Ok(new GatewayHealthReply(
Status: "Healthy",
DefaultBackend: GatewayContractInfo.DefaultBackendName,
WorkerProtocolVersion: GatewayContractInfo.WorkerProtocolVersion)))
.WithName("LiveHealth");
endpoints.MapZbHealth();
endpoints.MapZbMetrics();
endpoints.MapGrpcService<MxAccessGatewayService>();
endpoints.MapGrpcService<GalaxyRepositoryGrpcService>();
@@ -5,7 +5,7 @@ namespace ZB.MOM.WW.MxGateway.Server.Metrics;
public sealed class GatewayMetrics : IDisposable
{
public const string MeterName = "MxGateway.Server";
public const string MeterName = "ZB.MOM.WW.MxGateway";
private readonly object _syncRoot = new();
private readonly Meter _meter;
@@ -68,9 +68,9 @@ public sealed class GatewayMetrics : IDisposable
_heartbeatFailuresCounter = _meter.CreateCounter<long>("mxgateway.heartbeats.failed");
_streamDisconnectsCounter = _meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
_retryAttemptsCounter = _meter.CreateCounter<long>("mxgateway.retries.attempted");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "s");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "s");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "s");
_meter.CreateObservableGauge("mxgateway.sessions.open", GetOpenSessions);
_meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning);
@@ -144,7 +144,7 @@ public sealed class GatewayMetrics : IDisposable
_workersRunning++;
}
_workerStartupLatencyHistogram.Record(startupDuration.TotalMilliseconds);
_workerStartupLatencyHistogram.Record(startupDuration.TotalSeconds);
}
/// <summary>
@@ -208,7 +208,7 @@ public sealed class GatewayMetrics : IDisposable
KeyValuePair<string, object?> methodTag = new("method", method);
_commandsSucceededCounter.Add(1, methodTag);
_commandLatencyHistogram.Record(duration.TotalMilliseconds, methodTag);
_commandLatencyHistogram.Record(duration.TotalSeconds, methodTag);
}
/// <summary>
@@ -228,7 +228,7 @@ public sealed class GatewayMetrics : IDisposable
KeyValuePair<string, object?> methodTag = new("method", method);
KeyValuePair<string, object?> categoryTag = new("category", category);
_commandsFailedCounter.Add(1, methodTag, categoryTag);
_commandLatencyHistogram.Record(duration.TotalMilliseconds, methodTag, categoryTag);
_commandLatencyHistogram.Record(duration.TotalSeconds, methodTag, categoryTag);
}
/// <summary>
@@ -255,7 +255,7 @@ public sealed class GatewayMetrics : IDisposable
public void RecordEventStreamSend(string family, TimeSpan duration)
{
_eventStreamSendLatencyHistogram.Record(
duration.TotalMilliseconds,
duration.TotalSeconds,
new KeyValuePair<string, object?>("family", family));
}
@@ -6,6 +6,12 @@
<ItemGroup>
<PackageReference Include="Grpc.AspNetCore" Version="2.76.0" />
<PackageReference Include="ZB.MOM.WW.Health" Version="0.1.0" />
<PackageReference Include="ZB.MOM.WW.Telemetry" Version="0.1.0" />
<PackageReference Include="ZB.MOM.WW.Telemetry.Serilog" Version="0.1.0" />
<PackageReference Include="Serilog.AspNetCore" Version="10.0.0" />
<PackageReference Include="Serilog.Sinks.Console" Version="6.1.1" />
<PackageReference Include="Serilog.Sinks.File" Version="7.0.0" />
<PackageReference Include="Microsoft.AspNetCore.SignalR.Client" Version="10.0.0" />
<PackageReference Include="Microsoft.Data.Sqlite" Version="10.0.7" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="6.0.2" />
@@ -1,8 +1,13 @@
{
"Logging": {
"LogLevel": {
"Serilog": {
"Using": [ "Serilog.Sinks.Console", "Serilog.Sinks.File" ],
"MinimumLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning"
}
"Override": { "Microsoft.AspNetCore": "Warning" }
},
"WriteTo": [
{ "Name": "Console" },
{ "Name": "File", "Args": { "path": "logs/mxgateway-.log", "rollingInterval": "Day" } }
]
}
}
@@ -1,9 +1,14 @@
{
"Logging": {
"LogLevel": {
"Serilog": {
"Using": [ "Serilog.Sinks.Console", "Serilog.Sinks.File" ],
"MinimumLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning"
}
"Override": { "Microsoft.AspNetCore": "Warning" }
},
"WriteTo": [
{ "Name": "Console" },
{ "Name": "File", "Args": { "path": "logs/mxgateway-.log", "rollingInterval": "Day" } }
]
},
"AllowedHosts": "*",
"MxGateway": {
@@ -0,0 +1,49 @@
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Server.Configuration;
using ZB.MOM.WW.MxGateway.Server.Diagnostics;
using ZB.MOM.WW.MxGateway.Server.Security.Authentication;
namespace ZB.MOM.WW.MxGateway.Tests.Diagnostics;
public sealed class AuthStoreHealthCheckTests
{
private static AuthSqliteConnectionFactory FactoryFor(string sqlitePath)
{
// GatewayOptions.Authentication and AuthenticationOptions.SqlitePath are both
// init-only, so populate them through object initializers.
var options = new GatewayOptions
{
Authentication = new AuthenticationOptions { SqlitePath = sqlitePath },
};
return new AuthSqliteConnectionFactory(Options.Create(options));
}
[Fact]
public async Task Healthy_WhenStoreReachable()
{
var path = Path.Combine(Path.GetTempPath(), $"authcheck-{Guid.NewGuid():N}.db");
try
{
var check = new AuthStoreHealthCheck(FactoryFor(path));
var result = await check.CheckHealthAsync(new HealthCheckContext());
Assert.Equal(HealthStatus.Healthy, result.Status);
}
finally { if (File.Exists(path)) File.Delete(path); }
}
[Fact]
public async Task Unhealthy_WhenPathUnusable()
{
// A regular file used as a parent directory forces the open to fail.
var bogus = Path.Combine(Path.GetTempPath(), $"authcheck-{Guid.NewGuid():N}");
await File.WriteAllTextAsync(bogus, "x");
try
{
var check = new AuthStoreHealthCheck(FactoryFor(Path.Combine(bogus, "store.db")));
var result = await check.CheckHealthAsync(new HealthCheckContext());
Assert.Equal(HealthStatus.Unhealthy, result.Status);
}
finally { if (File.Exists(bogus)) File.Delete(bogus); }
}
}
@@ -0,0 +1,15 @@
using System.Collections.Generic;
using ZB.MOM.WW.MxGateway.Server.Diagnostics;
using Xunit;
public class GatewayLogRedactorSeamTests
{
[Fact]
public void Redact_MasksApiKeyInClientIdentity()
{
var redactor = new GatewayLogRedactorSeam();
var props = new Dictionary<string, object?> { ["ClientIdentity"] = "Bearer mxgw_operator01_super-secret" };
redactor.Redact(props);
Assert.Equal("Bearer mxgw_operator01_[redacted]", props["ClientIdentity"]);
}
}
@@ -1,7 +1,10 @@
using System.Net;
using System.Net.Http;
using Microsoft.AspNetCore.Builder;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Routing;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.MxGateway.Server;
using ZB.MOM.WW.MxGateway.Server.Dashboard;
@@ -11,19 +14,31 @@ namespace ZB.MOM.WW.MxGateway.Tests.Gateway;
public sealed class GatewayApplicationTests
{
/// <summary>Verifies that Build maps the live health check endpoint.</summary>
/// <summary>Verifies that Build maps the canonical three health tiers.</summary>
[Fact]
public async Task Build_MapsLiveHealthEndpoint()
public async Task Build_MapsCanonicalHealthEndpoints()
{
await using WebApplication app = GatewayApplication.Build([]);
RouteEndpoint endpoint = Assert.Single(
((IEndpointRouteBuilder)app).DataSources
.SelectMany(dataSource => dataSource.Endpoints)
.OfType<RouteEndpoint>(),
candidate => candidate.RoutePattern.RawText == "/health/live");
var paths = ((IEndpointRouteBuilder)app).DataSources
.SelectMany(dataSource => dataSource.Endpoints)
.OfType<RouteEndpoint>()
.Select(e => e.RoutePattern.RawText)
.ToHashSet();
Assert.Equal("LiveHealth", endpoint.Metadata.GetMetadata<IEndpointNameMetadata>()?.EndpointName);
Assert.Contains("/health/ready", paths);
Assert.Contains("/health/active", paths);
Assert.Contains("/healthz", paths);
Assert.DoesNotContain("/health/live", paths);
}
/// <summary>Verifies that Build registers Serilog as the host logging provider.</summary>
[Fact]
public void Build_UsesSerilogLoggerProvider()
{
using var app = GatewayApplication.Build([]);
var factory = app.Services.GetRequiredService<ILoggerFactory>();
Assert.Equal("SerilogLoggerFactory", factory.GetType().Name);
}
/// <summary>Verifies that Build registers the gateway metrics service.</summary>
@@ -37,6 +52,28 @@ public sealed class GatewayApplicationTests
Assert.NotNull(metrics);
}
/// <summary>Verifies that Build mounts the Prometheus /metrics scrape endpoint.</summary>
[Fact]
public async Task Build_MapsMetricsEndpoint()
{
// Bind an ephemeral port (:0) — xUnit runs test collections in parallel, so any
// started-host test must avoid a fixed port to prevent a bind collision.
await using WebApplication app = GatewayApplication.Build(["--urls=http://127.0.0.1:0"]);
await app.StartAsync();
try
{
using var client = new HttpClient { BaseAddress = new Uri(app.Urls.First()) };
using HttpResponseMessage response = await client.GetAsync("/metrics");
Assert.Equal(HttpStatusCode.OK, response.StatusCode);
}
finally
{
await app.StopAsync();
}
}
/// <summary>Verifies that Build maps dashboard and authentication endpoints when the dashboard is enabled.</summary>
[Fact]
public async Task Build_WhenDashboardEnabled_MapsBlazorDashboardAndAuthEndpoints()