The old probe did a subscribe-read-unsubscribe cycle every 5 seconds to check connection health. This created unnecessary churn and didn't detect the failure mode where long-lived subscriptions silently stop receiving COM callbacks (e.g. stalled STA message pump). The new approach keeps a persistent subscription on the health check tag and forces reconnect if no value update arrives within a configurable threshold (ProbeStaleThresholdMs, default 5s). Also adds STA message pump debug logging (5-min heartbeat with message counters) and fixes log file path resolution for Windows services. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
325 lines
11 KiB
C#
325 lines
11 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Runtime.InteropServices;
|
|
using System.Threading;
|
|
using System.Threading.Tasks;
|
|
using ArchestrA.MxAccess;
|
|
using Serilog;
|
|
using ZB.MOM.WW.LmxProxy.Host.Domain;
|
|
|
|
namespace ZB.MOM.WW.LmxProxy.Host.MxAccess
|
|
{
|
|
public sealed partial class MxAccessClient
|
|
{
|
|
/// <summary>
/// Opens the MxAccess connection. The COM work runs on the dedicated STA thread;
/// afterwards stored subscriptions are replayed and the probe subscription is started.
/// </summary>
/// <param name="ct">Cancellation token. The body of this method does not consult it.</param>
/// <exception cref="ObjectDisposedException">Thrown when the client has been disposed.</exception>
/// <remarks>
/// Does nothing when <c>IsConnected</c> is already true. Any exception raised while
/// connecting is logged, COM state is cleaned up, the state is set to
/// <c>ConnectionState.Error</c>, and the exception is rethrown.
/// </remarks>
public async Task ConnectAsync(CancellationToken ct = default)
{
    if (_disposed)
    {
        throw new ObjectDisposedException(nameof(MxAccessClient));
    }

    if (IsConnected)
    {
        return;
    }

    SetState(ConnectionState.Connecting);

    try
    {
        // Create the COM object and register on the STA thread.
        await _staThread.RunAsync(() => ConnectInternal());

        // Record the connect time under the same lock the ConnectedSince getter uses.
        lock (_lock)
        {
            _connectedSince = DateTime.UtcNow;
        }

        SetState(ConnectionState.Connected);
        Log.Information("Connected to MxAccess (handle={Handle})", _connectionHandle);

        // Replay subscriptions remembered from an earlier connection.
        await RecreateStoredSubscriptionsAsync();

        // Bring up the persistent probe subscription.
        await StartProbeSubscriptionAsync();
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Failed to connect to MxAccess");

        await CleanupComObjectsAsync();
        SetState(ConnectionState.Error, ex.Message);

        throw;
    }
}
|
|
|
|
/// <summary>
/// Closes the MxAccess connection by running the disconnect routine on the
/// dedicated STA thread.
/// </summary>
/// <param name="ct">Cancellation token. The body of this method does not consult it.</param>
/// <remarks>
/// Returns immediately when not connected. Exceptions are logged and reflected as
/// <c>ConnectionState.Error</c>; they are not rethrown.
/// </remarks>
public async Task DisconnectAsync(CancellationToken ct = default)
{
    if (!IsConnected)
    {
        return;
    }

    SetState(ConnectionState.Disconnecting);

    try
    {
        await _staThread.RunAsync(() => DisconnectInternal());
        SetState(ConnectionState.Disconnected);
        Log.Information("Disconnected from MxAccess");
    }
    catch (Exception ex)
    {
        // Failure is recorded in the log and in the state; the caller sees no exception.
        Log.Error(ex, "Error during disconnect");
        SetState(ConnectionState.Error, ex.Message);
    }
}
|
|
|
|
/// <summary>
/// Starts the auto-reconnect monitor loop on the thread pool.
/// Call this after the initial <see cref="ConnectAsync"/> succeeds.
/// </summary>
/// <remarks>
/// Does nothing when auto-reconnect is disabled. If a loop from an earlier call is
/// still running, it is cancelled first so that only one monitor is active.
/// </remarks>
public void StartMonitorLoop()
{
    if (!_autoReconnect) return;

    // Cancel a loop left over from a previous call; otherwise two monitors would
    // run side by side and both attempt reconnects.
    try
    {
        _reconnectCts?.Cancel();
    }
    catch (ObjectDisposedException)
    {
        // The previous source was already disposed; there is nothing left to cancel.
    }

    var cts = new CancellationTokenSource();
    _reconnectCts = cts;

    // Capture the token now. Reading the field inside the lambda would use whatever
    // instance happens to be stored there when the work item eventually runs.
    var token = cts.Token;
    Task.Run(() => MonitorConnectionAsync(token));
}
|
|
|
|
/// <summary>
/// Requests cancellation of the auto-reconnect monitor loop, if one was started.
/// </summary>
public void StopMonitorLoop()
{
    var cts = _reconnectCts;
    if (cts != null)
    {
        cts.Cancel();
    }
}
|
|
|
|
/// <summary>
/// UTC timestamp recorded when the connection was established. Read under the client lock.
/// </summary>
public DateTime ConnectedSince
{
    get
    {
        lock (_lock)
        {
            return _connectedSince;
        }
    }
}
|
|
|
|
// ── Internal synchronous methods ──────────
|
|
|
|
/// <summary>
/// Creates the MxAccess COM object, wires its events and registers this client.
/// Invoked through the STA thread by <see cref="ConnectAsync"/>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Registration returned a handle that is zero or negative.
/// </exception>
private void ConnectInternal()
{
    lock (_lock)
    {
        // Create COM object
        _lmxProxy = new LMXProxyServer();

        // Wire event handlers
        _lmxProxy.OnDataChange += OnDataChange;
        _lmxProxy.OnWriteComplete += OnWriteComplete;

        // Register with MxAccess using unique client name
        _connectionHandle = _lmxProxy.Register(_clientName);

        if (_connectionHandle <= 0)
        {
            throw new InvalidOperationException("Failed to register with MxAccess - invalid handle returned");
        }

        // Log success only after the handle has been validated, so a failed
        // registration does not leave a misleading "Registered" entry in the log.
        Log.Information("Registered with MxAccess as '{ClientName}'", _clientName);
    }
}
|
|
|
|
/// <summary>
/// Tears down the MxAccess session: removes every tracked subscription, detaches the
/// event handlers, unregisters the connection handle and releases the COM object.
/// Invoked through the STA thread by <see cref="DisconnectAsync"/>.
/// </summary>
/// <remarks>
/// Returns without doing anything when there is no COM object or no valid handle.
/// Failures are logged rather than thrown. The COM reference, the handle and the
/// handle/pending-write tracking are always reset in the <c>finally</c> block.
/// </remarks>
private void DisconnectInternal()
{
    lock (_lock)
    {
        if (_lmxProxy == null || _connectionHandle <= 0) return;

        try
        {
            // Unadvise all active subscriptions before unregistering.
            // The loop walks a copy of the address-to-handle map, not the live map.
            foreach (var kvp in new Dictionary<string, int>(_addressToHandle))
            {
                try
                {
                    _lmxProxy.UnAdvise(_connectionHandle, kvp.Value);
                    _lmxProxy.RemoveItem(_connectionHandle, kvp.Value);
                }
                catch (Exception ex)
                {
                    // One failing item must not stop the remaining ones from being removed.
                    Log.Debug(ex, "Error removing subscription for {Address} during disconnect", kvp.Key);
                }
            }

            // Remove event handlers
            _lmxProxy.OnDataChange -= OnDataChange;
            _lmxProxy.OnWriteComplete -= OnWriteComplete;

            // Unregister
            _lmxProxy.Unregister(_connectionHandle);
        }
        catch (Exception ex)
        {
            Log.Warning(ex, "Error during MxAccess unregister");
        }
        finally
        {
            // Force-release COM object. A failure here is still best-effort,
            // but it is now recorded instead of being swallowed silently.
            try
            {
                Marshal.ReleaseComObject(_lmxProxy);
            }
            catch (Exception ex)
            {
                Log.Debug(ex, "Error releasing MxAccess COM object during disconnect");
            }

            _lmxProxy = null;
            _connectionHandle = 0;

            // Clear handle tracking (but keep _storedSubscriptions for reconnect)
            _handleToAddress.Clear();
            _addressToHandle.Clear();
            _pendingWrites.Clear();
        }
    }
}
|
|
|
|
/// <summary>
/// Subscribes to the configured probe test tag so that OnDataChange
/// callbacks update <see cref="_lastProbeValueTime"/>. Called after
/// connect (and reconnect). The subscription is stored for reconnect
/// replay like any other subscription.
/// </summary>
/// <remarks>
/// Does nothing when no probe tag is configured. A callback already stored for the
/// probe address is left in place; the no-op placeholder is added only when the
/// address has no stored callback yet. The "started" message is logged only when
/// the subscribe call was actually made.
/// </remarks>
private async Task StartProbeSubscriptionAsync()
{
    if (_probeTestTagAddress == null) return;

    // Restart the staleness clock so the monitor loop measures from this point.
    _lastProbeValueTime = DateTime.UtcNow;

    // Set inside the STA work item once the subscribe call has been made.
    var subscribed = false;

    await _staThread.RunAsync(() =>
    {
        lock (_lock)
        {
            if (!IsConnected || _lmxProxy == null) return;

            // Subscribe (skips if already subscribed from reconnect replay)
            SubscribeInternal(_probeTestTagAddress);

            // Store a no-op callback — the real work happens in OnProbeDataChange
            // which is called from OnDataChange before the stored callback.
            // Do not replace a callback that is already stored for this address.
            if (!_storedSubscriptions.ContainsKey(_probeTestTagAddress))
            {
                _storedSubscriptions[_probeTestTagAddress] = (_, __) => { };
            }

            subscribed = true;
        }
    });

    if (subscribed)
    {
        Log.Information("Probe subscription started for {Tag} (stale threshold={ThresholdMs}ms)",
            _probeTestTagAddress, _probeStaleThresholdMs);
    }
    else
    {
        Log.Warning("Probe subscription for {Tag} was skipped because the connection was not available",
            _probeTestTagAddress);
    }
}
|
|
|
|
/// <summary>
/// Records the arrival time of a probe-tag value. Invoked from <see cref="OnDataChange"/>;
/// the monitor loop compares this timestamp against the stale threshold.
/// </summary>
/// <param name="address">Address of the tag that changed (not used here).</param>
/// <param name="vtq">The received value (not used here).</param>
internal void OnProbeDataChange(string address, Vtq vtq)
{
    var receivedAtUtc = DateTime.UtcNow;
    _lastProbeValueTime = receivedAtUtc;
}
|
|
|
|
/// <summary>
/// Auto-reconnect monitor loop with persistent subscription probe.
/// - If disconnected: attempt reconnect.
/// - If connected and probe configured: check time since last probe value update.
///   If stale beyond threshold, force disconnect and reconnect.
/// </summary>
/// <param name="ct">Token that ends the loop; checked at each delay and again after a forced disconnect.</param>
/// <remarks>
/// The loop is started fire-and-forget, so an exception escaping an iteration would end
/// monitoring without any trace. Each iteration is therefore guarded: unexpected errors
/// are logged and the loop carries on at the next interval.
/// </remarks>
private async Task MonitorConnectionAsync(CancellationToken ct)
{
    Log.Information("Connection monitor loop started (interval={IntervalMs}ms, probe={ProbeEnabled}, staleThreshold={StaleMs}ms)",
        _monitorIntervalMs, _probeTestTagAddress != null, _probeStaleThresholdMs);

    while (!ct.IsCancellationRequested)
    {
        try
        {
            await Task.Delay(_monitorIntervalMs, ct);
        }
        catch (OperationCanceledException)
        {
            break;
        }

        try
        {
            // -- Case 1: Already disconnected --
            if (!IsConnected)
            {
                await AttemptReconnectAsync(ct);
                continue;
            }

            // -- Case 2: Connected, no probe configured --
            if (_probeTestTagAddress == null)
            {
                continue;
            }

            // -- Case 3: Connected, check probe staleness --
            var elapsed = DateTime.UtcNow - _lastProbeValueTime;
            if (elapsed.TotalMilliseconds <= _probeStaleThresholdMs)
            {
                continue;
            }

            Log.Warning("Probe tag {Tag} stale for {ElapsedMs}ms (threshold={ThresholdMs}ms) — forcing reconnect",
                _probeTestTagAddress, (int)elapsed.TotalMilliseconds, _probeStaleThresholdMs);

            try
            {
                await DisconnectAsync(ct);
            }
            catch (Exception ex)
            {
                Log.Warning(ex, "Error during forced disconnect before reconnect");
            }

            // A stop request may have arrived while disconnecting; do not reconnect after it.
            if (ct.IsCancellationRequested)
            {
                break;
            }

            await AttemptReconnectAsync(ct);
        }
        catch (Exception ex)
        {
            // Keep the monitor alive; the next interval retries.
            Log.Error(ex, "Unexpected error in connection monitor loop, will retry at next interval");
        }
    }

    Log.Information("Connection monitor loop exited");
}
|
|
|
|
/// <summary>
/// Performs one reconnect attempt: marks the state as reconnecting and calls
/// <see cref="ConnectAsync"/>. No exception leaves this method; cancellation is
/// ignored silently and any other failure is logged as a warning.
/// </summary>
/// <param name="ct">Token passed through to <see cref="ConnectAsync"/>.</param>
private async Task AttemptReconnectAsync(CancellationToken ct)
{
    Log.Information("Attempting reconnect...");
    SetState(ConnectionState.Reconnecting);

    try
    {
        await ConnectAsync(ct);
        Log.Information("Reconnected to MxAccess successfully");
    }
    catch (Exception ex) when (!(ex is OperationCanceledException))
    {
        Log.Warning(ex, "Reconnect attempt failed, will retry at next interval");
    }
    catch (OperationCanceledException)
    {
        // Cancellation is left for the monitor loop to notice.
    }
}
|
|
|
|
/// <summary>
/// Cleans up COM objects on the dedicated STA thread after a failed connection.
/// </summary>
/// <remarks>
/// Detaches the event handlers and releases the COM object when one exists, then
/// resets the connection handle and clears handle/pending-write tracking. Each step
/// is best-effort: a failure is logged at debug level and the remaining steps still
/// run. This method does not throw.
/// </remarks>
private async Task CleanupComObjectsAsync()
{
    try
    {
        await _staThread.RunAsync(() =>
        {
            lock (_lock)
            {
                if (_lmxProxy != null)
                {
                    try
                    {
                        _lmxProxy.OnDataChange -= OnDataChange;
                    }
                    catch (Exception ex)
                    {
                        Log.Debug(ex, "Error detaching OnDataChange handler during COM cleanup");
                    }

                    try
                    {
                        _lmxProxy.OnWriteComplete -= OnWriteComplete;
                    }
                    catch (Exception ex)
                    {
                        Log.Debug(ex, "Error detaching OnWriteComplete handler during COM cleanup");
                    }

                    try
                    {
                        Marshal.ReleaseComObject(_lmxProxy);
                    }
                    catch (Exception ex)
                    {
                        Log.Debug(ex, "Error releasing MxAccess COM object during COM cleanup");
                    }

                    _lmxProxy = null;
                }

                // Reset bookkeeping even when no COM object was created.
                _connectionHandle = 0;
                _handleToAddress.Clear();
                _addressToHandle.Clear();
                _pendingWrites.Clear();
            }
        });
    }
    catch (Exception ex)
    {
        Log.Warning(ex, "Error during COM object cleanup");
    }
}
|
|
}
|
|
}
|