feat(lmxproxy): replace subscribe/unsubscribe health probe with persistent subscription

The old probe did a subscribe-read-unsubscribe cycle every 5 seconds to
check connection health. This created unnecessary churn and didn't detect
the failure mode where long-lived subscriptions silently stop receiving
COM callbacks (e.g. stalled STA message pump). The new approach keeps a
persistent subscription on the health check tag and forces reconnect if
no value update arrives within a configurable threshold (ProbeStaleThresholdMs,
default 5s). Also adds STA message pump debug logging (5-min heartbeat with
message counters) and fixes log file path resolution for Windows services.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-03-24 11:57:22 -04:00
parent b3222cf30b
commit 95168253fc
12 changed files with 112 additions and 155 deletions

View File

@@ -34,16 +34,13 @@ namespace ZB.MOM.WW.LmxProxy.Host.Configuration
/// <summary>Health check / probe configuration.</summary>
public class HealthCheckConfiguration
{
/// <summary>Tag address to probe for connection liveness. Default: DevPlatform.Scheduler.ScanTime.</summary>
/// <summary>Tag address to subscribe to for connection liveness. Default: DevPlatform.Scheduler.ScanTime.</summary>
public string TestTagAddress { get; set; } = "DevPlatform.Scheduler.ScanTime";
/// <summary>Probe timeout in milliseconds. Default: 5000.</summary>
public int ProbeTimeoutMs { get; set; } = 5000;
/// <summary>Consecutive transport failures before forced reconnect. Default: 3.</summary>
public int MaxConsecutiveTransportFailures { get; set; } = 3;
/// <summary>Probe interval while in degraded state (ms). Default: 30000 (30s).</summary>
public int DegradedProbeIntervalMs { get; set; } = 30000;
/// <summary>
/// Maximum time (ms) without a value update on the test tag before forcing reconnect.
/// Default: 5000 (5 seconds).
/// </summary>
public int ProbeStaleThresholdMs { get; set; } = 5000;
}
}

View File

@@ -57,12 +57,6 @@ namespace ZB.MOM.WW.LmxProxy.Host.Domain
int pollIntervalMs,
CancellationToken ct = default);
/// <summary>
/// Probes connection health by reading a test tag.
/// Returns a classified result: Healthy, TransportFailure, or DataDegraded.
/// </summary>
Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs, CancellationToken ct = default);
/// <summary>
/// Unsubscribes specific tag addresses. Removes from stored subscriptions
/// and COM state. Safe to call after reconnect -- uses current handle mappings.

View File

@@ -68,9 +68,7 @@ namespace ZB.MOM.WW.LmxProxy.Host
nodeName: _config.Connection.NodeName,
galaxyName: _config.Connection.GalaxyName,
probeTestTagAddress: _config.HealthCheck.TestTagAddress,
probeTimeoutMs: _config.HealthCheck.ProbeTimeoutMs,
maxConsecutiveTransportFailures: _config.HealthCheck.MaxConsecutiveTransportFailures,
degradedProbeIntervalMs: _config.HealthCheck.DegradedProbeIntervalMs,
probeStaleThresholdMs: _config.HealthCheck.ProbeStaleThresholdMs,
clientName: _config.ClientName);
// 5. Connect to MxAccess synchronously (with timeout)

View File

@@ -35,6 +35,9 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
// Recreate any stored subscriptions from a previous connection
await RecreateStoredSubscriptionsAsync();
// Start persistent probe subscription
await StartProbeSubscriptionAsync();
}
catch (Exception ex)
{
@@ -172,87 +175,61 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
}
/// <summary>
/// Probes the connection by reading a test tag with a timeout.
/// Classifies the result as transport failure vs data degraded.
/// Subscribes to the configured probe test tag so that OnDataChange
/// callbacks update <see cref="_lastProbeValueTime"/>. Called after
/// connect (and reconnect). The subscription is stored for reconnect
/// replay like any other subscription.
/// </summary>
public async Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs,
CancellationToken ct = default)
private async Task StartProbeSubscriptionAsync()
{
if (!IsConnected)
return ProbeResult.TransportFailed("Not connected");
if (_probeTestTagAddress == null) return;
try
_lastProbeValueTime = DateTime.UtcNow;
await _staThread.RunAsync(() =>
{
using (var cts = CancellationTokenSource.CreateLinkedTokenSource(ct))
lock (_lock)
{
cts.CancelAfter(timeoutMs);
if (!IsConnected || _lmxProxy == null) return;
Vtq vtq;
try
{
vtq = await ReadAsync(testTagAddress, cts.Token);
}
catch (OperationCanceledException) when (!ct.IsCancellationRequested)
{
// Our timeout fired, not the caller's -- treat as transport failure
return ProbeResult.TransportFailed("Probe read timed out after " + timeoutMs + "ms");
}
// Subscribe (skips if already subscribed from reconnect replay)
SubscribeInternal(_probeTestTagAddress);
if (vtq.Quality == Domain.Quality.Bad_NotConnected ||
vtq.Quality == Domain.Quality.Bad_CommFailure)
{
return ProbeResult.TransportFailed("Probe returned " + vtq.Quality);
}
if (!vtq.Quality.IsGood())
{
return ProbeResult.Degraded(vtq.Quality, vtq.Timestamp,
"Probe quality: " + vtq.Quality);
}
if (DateTime.UtcNow - vtq.Timestamp > TimeSpan.FromMinutes(5))
{
return ProbeResult.Degraded(vtq.Quality, vtq.Timestamp,
"Probe data stale (>" + 5 + "min)");
}
return ProbeResult.Healthy(vtq.Quality, vtq.Timestamp);
// Store a no-op callback — the real work happens in OnProbeDataChange
// which is called from OnDataChange before the stored callback
_storedSubscriptions[_probeTestTagAddress] = (_, __) => { };
}
}
catch (System.Runtime.InteropServices.COMException ex)
{
return ProbeResult.TransportFailed("COM exception: " + ex.Message, ex);
}
catch (InvalidOperationException ex) when (ex.Message.Contains("Not connected"))
{
return ProbeResult.TransportFailed(ex.Message, ex);
}
catch (Exception ex)
{
return ProbeResult.TransportFailed("Probe failed: " + ex.Message, ex);
}
});
Log.Information("Probe subscription started for {Tag} (stale threshold={ThresholdMs}ms)",
_probeTestTagAddress, _probeStaleThresholdMs);
}
/// <summary>
/// Auto-reconnect monitor loop with active health probing.
/// - If IsConnected is false: immediate reconnect (existing behavior).
/// - If IsConnected is true and probe configured: read test tag each interval.
/// - TransportFailure for N consecutive probes -> forced disconnect + reconnect.
/// - DataDegraded -> stay connected, back off probe interval, report degraded.
/// - Healthy -> reset counters and resume normal interval.
/// Called from <see cref="OnDataChange"/> when a value arrives for the probe tag.
/// Updates the last-seen timestamp so the monitor loop can detect staleness.
/// </summary>
internal void OnProbeDataChange(string address, Vtq vtq)
{
_lastProbeValueTime = DateTime.UtcNow;
}
/// <summary>
/// Auto-reconnect monitor loop with persistent subscription probe.
/// - If disconnected: attempt reconnect.
/// - If connected and probe configured: check time since last probe value update.
/// If stale beyond threshold, force disconnect and reconnect.
/// </summary>
private async Task MonitorConnectionAsync(CancellationToken ct)
{
Log.Information("Connection monitor loop started (interval={IntervalMs}ms, probe={ProbeEnabled})",
_monitorIntervalMs, _probeTestTagAddress != null);
Log.Information("Connection monitor loop started (interval={IntervalMs}ms, probe={ProbeEnabled}, staleThreshold={StaleMs}ms)",
_monitorIntervalMs, _probeTestTagAddress != null, _probeStaleThresholdMs);
while (!ct.IsCancellationRequested)
{
var interval = _isDegraded ? _degradedProbeIntervalMs : _monitorIntervalMs;
try
{
await Task.Delay(interval, ct);
await Task.Delay(_monitorIntervalMs, ct);
}
catch (OperationCanceledException)
{
@@ -262,64 +239,31 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
// -- Case 1: Already disconnected --
if (!IsConnected)
{
_isDegraded = false;
_consecutiveTransportFailures = 0;
await AttemptReconnectAsync(ct);
continue;
}
// -- Case 2: Connected, no probe configured -- legacy behavior --
// -- Case 2: Connected, no probe configured --
if (_probeTestTagAddress == null)
continue;
// -- Case 3: Connected, probe configured -- active health check --
var probe = await ProbeConnectionAsync(_probeTestTagAddress, _probeTimeoutMs, ct);
switch (probe.Status)
// -- Case 3: Connected, check probe staleness --
var elapsed = DateTime.UtcNow - _lastProbeValueTime;
if (elapsed.TotalMilliseconds > _probeStaleThresholdMs)
{
case ProbeStatus.Healthy:
if (_isDegraded)
{
Log.Information("Probe healthy -- exiting degraded mode");
_isDegraded = false;
}
_consecutiveTransportFailures = 0;
break;
Log.Warning("Probe tag {Tag} stale for {ElapsedMs}ms (threshold={ThresholdMs}ms) — forcing reconnect",
_probeTestTagAddress, (int)elapsed.TotalMilliseconds, _probeStaleThresholdMs);
case ProbeStatus.DataDegraded:
_consecutiveTransportFailures = 0;
if (!_isDegraded)
{
Log.Warning("Probe degraded: {Message} -- entering degraded mode (probe interval {IntervalMs}ms)",
probe.Message, _degradedProbeIntervalMs);
_isDegraded = true;
}
break;
try
{
await DisconnectAsync(ct);
}
catch (Exception ex)
{
Log.Warning(ex, "Error during forced disconnect before reconnect");
}
case ProbeStatus.TransportFailure:
_isDegraded = false;
_consecutiveTransportFailures++;
Log.Warning("Probe transport failure ({Count}/{Max}): {Message}",
_consecutiveTransportFailures, _maxConsecutiveTransportFailures, probe.Message);
if (_consecutiveTransportFailures >= _maxConsecutiveTransportFailures)
{
Log.Warning("Max consecutive transport failures reached -- forcing reconnect");
_consecutiveTransportFailures = 0;
try
{
await DisconnectAsync(ct);
}
catch (Exception ex)
{
Log.Warning(ex, "Error during forced disconnect before reconnect");
// DisconnectAsync already calls CleanupComObjectsAsync on error path
}
await AttemptReconnectAsync(ct);
}
break;
await AttemptReconnectAsync(ct);
}
}

View File

@@ -66,6 +66,13 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
}
}
// Update probe timestamp if this is the probe tag
if (_probeTestTagAddress != null &&
string.Equals(address, _probeTestTagAddress, StringComparison.OrdinalIgnoreCase))
{
OnProbeDataChange(address, vtq);
}
callback.Invoke(address, vtq);
// Also route to the SubscriptionManager's global handler

View File

@@ -48,13 +48,10 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
// Probe configuration
private readonly string? _probeTestTagAddress;
private readonly int _probeTimeoutMs;
private readonly int _maxConsecutiveTransportFailures;
private readonly int _degradedProbeIntervalMs;
private readonly int _probeStaleThresholdMs;
// Probe state
private int _consecutiveTransportFailures;
private bool _isDegraded;
// Probe state — updated by OnDataChange callback, read by monitor loop
private DateTime _lastProbeValueTime;
// Stored subscriptions for reconnect replay
private readonly Dictionary<string, Action<string, Vtq>> _storedSubscriptions
@@ -80,9 +77,7 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
string? nodeName = null,
string? galaxyName = null,
string? probeTestTagAddress = null,
int probeTimeoutMs = 5000,
int maxConsecutiveTransportFailures = 3,
int degradedProbeIntervalMs = 30000,
int probeStaleThresholdMs = 5000,
string? clientName = null)
{
_maxConcurrentOperations = maxConcurrentOperations;
@@ -93,9 +88,7 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
_nodeName = nodeName;
_galaxyName = galaxyName;
_probeTestTagAddress = probeTestTagAddress;
_probeTimeoutMs = probeTimeoutMs;
_maxConsecutiveTransportFailures = maxConsecutiveTransportFailures;
_degradedProbeIntervalMs = degradedProbeIntervalMs;
_probeStaleThresholdMs = probeStaleThresholdMs;
_clientName = clientName ?? "LmxProxy-" + Guid.NewGuid().ToString("N").Substring(0, 8);
_readSemaphore = new SemaphoreSlim(maxConcurrentOperations, maxConcurrentOperations);

View File

@@ -19,6 +19,7 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
private const uint PM_NOREMOVE = 0x0000;
private static readonly ILogger Log = Serilog.Log.ForContext<StaComThread>();
private static readonly TimeSpan PumpLogInterval = TimeSpan.FromMinutes(5);
private readonly Thread _thread;
private readonly TaskCompletionSource<bool> _ready = new TaskCompletionSource<bool>();
@@ -26,6 +27,12 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
private volatile uint _nativeThreadId;
private bool _disposed;
private long _totalMessages;
private long _appMessages;
private long _dispatchedMessages;
private long _workItemsExecuted;
private DateTime _lastLogTime;
public StaComThread()
{
_thread = new Thread(ThreadEntry)
@@ -125,12 +132,18 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
PeekMessage(out msg, IntPtr.Zero, 0, 0, PM_NOREMOVE);
_ready.TrySetResult(true);
_lastLogTime = DateTime.UtcNow;
Log.Debug("STA message pump entering loop");
// Run the message loop — blocks until WM_QUIT
while (GetMessage(out msg, IntPtr.Zero, 0, 0) > 0)
{
_totalMessages++;
if (msg.message == WM_APP)
{
_appMessages++;
DrainQueue();
}
else if (msg.message == WM_APP + 1)
@@ -141,10 +154,16 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
}
else
{
_dispatchedMessages++;
TranslateMessage(ref msg);
DispatchMessage(ref msg);
}
LogPumpStatsIfDue();
}
Log.Information("STA message pump exited loop (Total={Total}, App={App}, Dispatched={Dispatched}, WorkItems={WorkItems})",
_totalMessages, _appMessages, _dispatchedMessages, _workItemsExecuted);
}
catch (Exception ex)
{
@@ -157,6 +176,7 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
{
while (_workItems.TryDequeue(out var workItem))
{
_workItemsExecuted++;
try
{
workItem();
@@ -168,6 +188,16 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
}
}
private void LogPumpStatsIfDue()
{
var now = DateTime.UtcNow;
if (now - _lastLogTime < PumpLogInterval) return;
Log.Debug("STA pump alive: Total={Total}, App={App}, Dispatched={Dispatched}, WorkItems={WorkItems}, Pending={Pending}",
_totalMessages, _appMessages, _dispatchedMessages, _workItemsExecuted, _workItems.Count);
_lastLogTime = now;
}
#region Win32 PInvoke
[StructLayout(LayoutKind.Sequential)]

View File

@@ -17,7 +17,10 @@ namespace ZB.MOM.WW.LmxProxy.Host
.AddEnvironmentVariables()
.Build();
// 2. Configure Serilog
// 2. Set working directory to exe location so relative log paths resolve correctly
Environment.CurrentDirectory = AppDomain.CurrentDomain.BaseDirectory;
// 3. Configure Serilog
Log.Logger = new LoggerConfiguration()
.ReadFrom.Configuration(configuration)
.Enrich.FromLogContext()
@@ -27,11 +30,11 @@ namespace ZB.MOM.WW.LmxProxy.Host
try
{
// 3. Bind configuration
// 4. Bind configuration
var config = new LmxProxyConfiguration();
configuration.Bind(config);
// 4. Configure Topshelf
// 5. Configure Topshelf
var exitCode = HostFactory.Run(host =>
{
host.UseSerilog();

View File

@@ -34,9 +34,7 @@
"HealthCheck": {
"TestTagAddress": "DevPlatform.Scheduler.ScanTime",
"ProbeTimeoutMs": 5000,
"MaxConsecutiveTransportFailures": 3,
"DegradedProbeIntervalMs": 30000
"ProbeStaleThresholdMs": 5000
},
"ServiceRecovery": {
@@ -58,7 +56,8 @@
"Override": {
"Microsoft": "Warning",
"System": "Warning",
"Grpc": "Information"
"Grpc": "Information",
"ZB.MOM.WW.LmxProxy.Host.MxAccess.StaComThread": "Debug"
}
},
"WriteTo": [

View File

@@ -32,8 +32,6 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Health
IReadOnlyDictionary<string, object> values, string flagTag, object flagValue,
int timeoutMs, int pollIntervalMs, CancellationToken ct = default) =>
Task.FromResult((false, 0));
public Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs, CancellationToken ct = default) =>
Task.FromResult(ProbeResult.Healthy(Quality.Good, DateTime.UtcNow));
public Task UnsubscribeByAddressAsync(IEnumerable<string> addresses) => Task.CompletedTask;
public Task<IAsyncDisposable> SubscribeAsync(IEnumerable<string> addresses, Action<string, Vtq> callback, CancellationToken ct = default) =>
Task.FromResult<IAsyncDisposable>(new FakeHandle());
@@ -158,8 +156,6 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Health
IReadOnlyDictionary<string, object> values, string flagTag, object flagValue,
int timeoutMs, int pollIntervalMs, CancellationToken ct = default) =>
Task.FromResult((false, 0));
public Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs, CancellationToken ct = default) =>
Task.FromResult(ProbeResult.Healthy(Quality.Good, DateTime.UtcNow));
public Task UnsubscribeByAddressAsync(IEnumerable<string> addresses) => Task.CompletedTask;
public Task<IAsyncDisposable> SubscribeAsync(IEnumerable<string> addresses, Action<string, Vtq> callback, CancellationToken ct = default) =>
Task.FromResult<IAsyncDisposable>(new FakeHandle());

View File

@@ -33,8 +33,6 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Status
IReadOnlyDictionary<string, object> values, string flagTag, object flagValue,
int timeoutMs, int pollIntervalMs, CancellationToken ct = default) =>
Task.FromResult((false, 0));
public Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs, CancellationToken ct = default) =>
Task.FromResult(ProbeResult.Healthy(Quality.Good, DateTime.UtcNow));
public Task UnsubscribeByAddressAsync(IEnumerable<string> addresses) => Task.CompletedTask;
public Task<IAsyncDisposable> SubscribeAsync(IEnumerable<string> addresses, Action<string, Vtq> callback, CancellationToken ct = default) =>
Task.FromResult<IAsyncDisposable>(new FakeHandle());

View File

@@ -30,8 +30,6 @@ namespace ZB.MOM.WW.LmxProxy.Host.Tests.Subscriptions
IReadOnlyDictionary<string, object> values, string flagTag, object flagValue,
int timeoutMs, int pollIntervalMs, CancellationToken ct = default) =>
Task.FromResult((false, 0));
public Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs, CancellationToken ct = default) =>
Task.FromResult(ProbeResult.Healthy(Quality.Good, DateTime.UtcNow));
public Task UnsubscribeByAddressAsync(IEnumerable<string> addresses) => Task.CompletedTask;
public Task<IAsyncDisposable> SubscribeAsync(IEnumerable<string> addresses, Action<string, Vtq> callback, CancellationToken ct = default) =>
Task.FromResult<IAsyncDisposable>(new FakeSubscriptionHandle());