feat(lmxproxy): active health probing + address-based subscription cleanup (gap 1 & 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-03-22 06:44:21 -04:00
parent 86a15c0a65
commit a6c01d73e2
12 changed files with 301 additions and 43 deletions

View File

@@ -171,47 +171,180 @@ namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
}
/// <summary>
/// Auto-reconnect monitor loop. Checks connection every monitorInterval.
/// On disconnect, attempts reconnect. On failure, retries at next interval.
/// Probes the connection by reading a test tag with a timeout.
/// Classifies the result as transport failure vs data degraded.
/// </summary>
public async Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs,
CancellationToken ct = default)
{
if (!IsConnected)
return ProbeResult.TransportFailed("Not connected");
try
{
using (var cts = CancellationTokenSource.CreateLinkedTokenSource(ct))
{
cts.CancelAfter(timeoutMs);
Vtq vtq;
try
{
vtq = await ReadAsync(testTagAddress, cts.Token);
}
catch (OperationCanceledException) when (!ct.IsCancellationRequested)
{
// Our timeout fired, not the caller's -- treat as transport failure
return ProbeResult.TransportFailed("Probe read timed out after " + timeoutMs + "ms");
}
if (vtq.Quality == Domain.Quality.Bad_NotConnected ||
vtq.Quality == Domain.Quality.Bad_CommFailure)
{
return ProbeResult.TransportFailed("Probe returned " + vtq.Quality);
}
if (!vtq.Quality.IsGood())
{
return ProbeResult.Degraded(vtq.Quality, vtq.Timestamp,
"Probe quality: " + vtq.Quality);
}
if (DateTime.UtcNow - vtq.Timestamp > TimeSpan.FromMinutes(5))
{
return ProbeResult.Degraded(vtq.Quality, vtq.Timestamp,
"Probe data stale (>" + 5 + "min)");
}
return ProbeResult.Healthy(vtq.Quality, vtq.Timestamp);
}
}
catch (System.Runtime.InteropServices.COMException ex)
{
return ProbeResult.TransportFailed("COM exception: " + ex.Message, ex);
}
catch (InvalidOperationException ex) when (ex.Message.Contains("Not connected"))
{
return ProbeResult.TransportFailed(ex.Message, ex);
}
catch (Exception ex)
{
return ProbeResult.TransportFailed("Probe failed: " + ex.Message, ex);
}
}
/// <summary>
/// Auto-reconnect monitor loop with active health probing.
/// - If IsConnected is false: immediate reconnect (existing behavior).
/// - If IsConnected is true and probe configured: read test tag each interval.
/// - TransportFailure for N consecutive probes -> forced disconnect + reconnect.
/// - DataDegraded -> stay connected, back off probe interval, report degraded.
/// - Healthy -> reset counters and resume normal interval.
/// </summary>
private async Task MonitorConnectionAsync(CancellationToken ct)
{
Log.Information("Connection monitor loop started (interval={IntervalMs}ms)", _monitorIntervalMs);
Log.Information("Connection monitor loop started (interval={IntervalMs}ms, probe={ProbeEnabled})",
_monitorIntervalMs, _probeTestTagAddress != null);
while (!ct.IsCancellationRequested)
{
var interval = _isDegraded ? _degradedProbeIntervalMs : _monitorIntervalMs;
try
{
await Task.Delay(_monitorIntervalMs, ct);
await Task.Delay(interval, ct);
}
catch (OperationCanceledException)
{
break;
}
if (IsConnected) continue;
Log.Information("MxAccess disconnected, attempting reconnect...");
SetState(ConnectionState.Reconnecting);
try
// -- Case 1: Already disconnected --
if (!IsConnected)
{
await ConnectAsync(ct);
Log.Information("Reconnected to MxAccess successfully");
_isDegraded = false;
_consecutiveTransportFailures = 0;
await AttemptReconnectAsync(ct);
continue;
}
catch (OperationCanceledException)
// -- Case 2: Connected, no probe configured -- legacy behavior --
if (_probeTestTagAddress == null)
continue;
// -- Case 3: Connected, probe configured -- active health check --
var probe = await ProbeConnectionAsync(_probeTestTagAddress, _probeTimeoutMs, ct);
switch (probe.Status)
{
break;
}
catch (Exception ex)
{
Log.Warning(ex, "Reconnect attempt failed, will retry in {IntervalMs}ms", _monitorIntervalMs);
case ProbeStatus.Healthy:
if (_isDegraded)
{
Log.Information("Probe healthy -- exiting degraded mode");
_isDegraded = false;
}
_consecutiveTransportFailures = 0;
break;
case ProbeStatus.DataDegraded:
_consecutiveTransportFailures = 0;
if (!_isDegraded)
{
Log.Warning("Probe degraded: {Message} -- entering degraded mode (probe interval {IntervalMs}ms)",
probe.Message, _degradedProbeIntervalMs);
_isDegraded = true;
}
break;
case ProbeStatus.TransportFailure:
_isDegraded = false;
_consecutiveTransportFailures++;
Log.Warning("Probe transport failure ({Count}/{Max}): {Message}",
_consecutiveTransportFailures, _maxConsecutiveTransportFailures, probe.Message);
if (_consecutiveTransportFailures >= _maxConsecutiveTransportFailures)
{
Log.Warning("Max consecutive transport failures reached -- forcing reconnect");
_consecutiveTransportFailures = 0;
try
{
await DisconnectAsync(ct);
}
catch (Exception ex)
{
Log.Warning(ex, "Error during forced disconnect before reconnect");
// DisconnectAsync already calls CleanupComObjectsAsync on error path
}
await AttemptReconnectAsync(ct);
}
break;
}
}
Log.Information("Connection monitor loop exited");
}
private async Task AttemptReconnectAsync(CancellationToken ct)
{
Log.Information("Attempting reconnect...");
SetState(ConnectionState.Reconnecting);
try
{
await ConnectAsync(ct);
Log.Information("Reconnected to MxAccess successfully");
}
catch (OperationCanceledException)
{
// Let the outer loop handle cancellation
}
catch (Exception ex)
{
Log.Warning(ex, "Reconnect attempt failed, will retry at next interval");
}
}
/// <summary>
/// Cleans up COM objects via Task.Run after a failed connection.
/// </summary>