Merge feat/telemetry-followons: telemetry follow-ons for MxAccessGateway

Metric normalization: meter MxGateway.Server -> ZB.MOM.WW.MxGateway and the 3
duration histograms ms -> s (safe: never Prometheus-exported before). Config-driven
OTLP exporter opt-in (default Prometheus). Metrics.md synced; doc-review artifacts
gitignored.
This commit is contained in:
Joseph Doherty
2026-06-01 17:17:31 -04:00
4 changed files with 24 additions and 14 deletions
+5
View File
@@ -147,3 +147,8 @@ generated-scratch/
# Keep empty directories with .gitkeep files when needed
!.gitkeep
# Documentation review artifacts (CommentChecker output)
*-docs-issues.md
*-docs-fixed.md
*-docs-final.md
+6 -6
View File
@@ -4,7 +4,7 @@ The metrics subsystem exposes counters, histograms, and observable gauges that d
## Overview
`GatewayMetrics` is a singleton (registered in `GatewayApplication.cs`) that owns a single `Meter` named `ZB.MOM.WW.MxGateway.Server` and a set of synchronised counters, histograms, and observable gauges. Subsystems call typed mutator methods (`SessionOpened`, `CommandFailed`, `EventReceived`, etc.) rather than touching the `Meter` directly, which keeps the OpenTelemetry instrument names and tag conventions in one place. A `lock (_syncRoot)` block guards the scalar fields used by `GetSnapshot`, while per-event maps use `ConcurrentDictionary<string, long>` so the hot event path avoids the lock.
`GatewayMetrics` is a singleton (registered in `GatewayApplication.cs`) that owns a single `Meter` named `ZB.MOM.WW.MxGateway` and a set of synchronised counters, histograms, and observable gauges. Subsystems call typed mutator methods (`SessionOpened`, `CommandFailed`, `EventReceived`, etc.) rather than touching the `Meter` directly, which keeps the OpenTelemetry instrument names and tag conventions in one place. A `lock (_syncRoot)` block guards the scalar fields used by `GetSnapshot`, while per-event maps use `ConcurrentDictionary<string, long>` so the hot event path avoids the lock.
## Meter and OpenTelemetry Compatibility
@@ -13,7 +13,7 @@ The meter name is exposed as a constant so that hosting code can register it wit
```csharp
public sealed class GatewayMetrics : IDisposable
{
public const string MeterName = "ZB.MOM.WW.MxGateway.Server";
public const string MeterName = "ZB.MOM.WW.MxGateway";
public GatewayMetrics()
{
@@ -50,12 +50,12 @@ All counters are `Counter<long>`. Tag values come from the call sites listed und
### Histograms
Histograms record durations in milliseconds (the `unit` argument on `CreateHistogram`):
Histograms record durations in seconds (the `unit` argument on `CreateHistogram`):
```csharp
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "s");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "s");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "s");
```
| Instrument | Tags | What it measures |
@@ -78,6 +78,11 @@ public static class GatewayApplication
{
o.ServiceName = "mxgateway";
o.Meters = [GatewayMetrics.MeterName]; // "MxGateway.Server" — name unchanged
if (Enum.TryParse<ZbExporter>(builder.Configuration["MxGateway:Telemetry:Exporter"], ignoreCase: true, out var exporter))
o.Exporter = exporter;
var otlp = builder.Configuration["MxGateway:Telemetry:OtlpEndpoint"];
if (!string.IsNullOrWhiteSpace(otlp))
o.OtlpEndpoint = otlp;
});
builder.Services.AddSingleton<ILogRedactor, GatewayLogRedactorSeam>();
builder.Services.AddSingleton<MxAccessGrpcMapper>();
@@ -5,7 +5,7 @@ namespace ZB.MOM.WW.MxGateway.Server.Metrics;
public sealed class GatewayMetrics : IDisposable
{
public const string MeterName = "MxGateway.Server";
public const string MeterName = "ZB.MOM.WW.MxGateway";
private readonly object _syncRoot = new();
private readonly Meter _meter;
@@ -68,9 +68,9 @@ public sealed class GatewayMetrics : IDisposable
_heartbeatFailuresCounter = _meter.CreateCounter<long>("mxgateway.heartbeats.failed");
_streamDisconnectsCounter = _meter.CreateCounter<long>("mxgateway.grpc.streams.disconnected");
_retryAttemptsCounter = _meter.CreateCounter<long>("mxgateway.retries.attempted");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "ms");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "ms");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "ms");
_workerStartupLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.workers.startup.duration", "s");
_commandLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.commands.duration", "s");
_eventStreamSendLatencyHistogram = _meter.CreateHistogram<double>("mxgateway.events.stream_send.duration", "s");
_meter.CreateObservableGauge("mxgateway.sessions.open", GetOpenSessions);
_meter.CreateObservableGauge("mxgateway.workers.running", GetWorkersRunning);
@@ -144,7 +144,7 @@ public sealed class GatewayMetrics : IDisposable
_workersRunning++;
}
_workerStartupLatencyHistogram.Record(startupDuration.TotalMilliseconds);
_workerStartupLatencyHistogram.Record(startupDuration.TotalSeconds);
}
/// <summary>
@@ -208,7 +208,7 @@ public sealed class GatewayMetrics : IDisposable
KeyValuePair<string, object?> methodTag = new("method", method);
_commandsSucceededCounter.Add(1, methodTag);
_commandLatencyHistogram.Record(duration.TotalMilliseconds, methodTag);
_commandLatencyHistogram.Record(duration.TotalSeconds, methodTag);
}
/// <summary>
@@ -228,7 +228,7 @@ public sealed class GatewayMetrics : IDisposable
KeyValuePair<string, object?> methodTag = new("method", method);
KeyValuePair<string, object?> categoryTag = new("category", category);
_commandsFailedCounter.Add(1, methodTag, categoryTag);
_commandLatencyHistogram.Record(duration.TotalMilliseconds, methodTag, categoryTag);
_commandLatencyHistogram.Record(duration.TotalSeconds, methodTag, categoryTag);
}
/// <summary>
@@ -255,7 +255,7 @@ public sealed class GatewayMetrics : IDisposable
public void RecordEventStreamSend(string family, TimeSpan duration)
{
_eventStreamSendLatencyHistogram.Record(
duration.TotalMilliseconds,
duration.TotalSeconds,
new KeyValuePair<string, object?>("family", family));
}