|
|
|
|
|
|
|
|
|
# Gap 1 & Gap 2: Active Health Probing + Subscription Handle Cleanup
|
|
|
|
|
|
|
|
|
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
|
|
|
|
|
|
|
|
|
|
**Goal:** Fix two reconnect-related gaps: (1) the monitor loop cannot detect a silently-dead MxAccess connection, and (2) SubscriptionManager holds stale IAsyncDisposable handles after reconnect.
|
|
|
|
|
|
|
|
|
|
**Architecture:** Add a domain-level connection probe to `MxAccessClient` that classifies results as Healthy/TransportFailure/DataDegraded. The monitor loop uses this to decide reconnect vs degrade-and-backoff. Separately, remove `SubscriptionManager._mxAccessHandles` entirely and switch to address-based unsubscribe through `IScadaClient`, making `MxAccessClient` the sole owner of COM subscription lifecycle.
|
|
|
|
|
|
|
|
|
|
**Tech Stack:** .NET Framework 4.8, C#, MxAccess COM interop, Serilog
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 0: Add `ProbeResult` domain type
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Create: `src/ZB.MOM.WW.LmxProxy.Host/Domain/ProbeResult.cs`
|
|
|
|
|
|
|
|
|
|
**Step 1: Create the ProbeResult type**
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
using System;

namespace ZB.MOM.WW.LmxProxy.Host.Domain
{
    /// <summary>
    /// Classification of a single connection-probe outcome.
    /// </summary>
    public enum ProbeStatus
    {
        /// <summary>Probe read succeeded with acceptable quality and freshness.</summary>
        Healthy,

        /// <summary>The transport itself is unusable (not connected, COM failure, timeout).</summary>
        TransportFailure,

        /// <summary>Transport is alive but the data is bad or stale.</summary>
        DataDegraded
    }

    /// <summary>
    /// Immutable result of one connection health probe. Construct only through the
    /// static factory methods, which enforce which fields are populated per status.
    /// NOTE(review): nullable reference annotations ('string?') require LangVersion 8+
    /// — confirm the project enables this under .NET Framework 4.8.
    /// </summary>
    public sealed class ProbeResult
    {
        // Classification of the probe outcome; drives monitor-loop behavior.
        public ProbeStatus Status { get; }
        // Quality of the probe read; null for transport failures (no read completed).
        public Quality? Quality { get; }
        // Timestamp of the probe read; null for transport failures.
        public DateTime? Timestamp { get; }
        // Human-readable detail; null for Healthy results.
        public string? Message { get; }
        // Underlying exception, when the failure originated from one.
        public Exception? Exception { get; }

        private ProbeResult(ProbeStatus probeStatus, Quality? probeQuality,
            DateTime? probeTimestamp, string? probeMessage, Exception? probeException)
        {
            Status = probeStatus;
            Quality = probeQuality;
            Timestamp = probeTimestamp;
            Message = probeMessage;
            Exception = probeException;
        }

        /// <summary>Probe succeeded: good quality, fresh data.</summary>
        public static ProbeResult Healthy(Quality quality, DateTime timestamp)
        {
            return new ProbeResult(ProbeStatus.Healthy, quality, timestamp, null, null);
        }

        /// <summary>Transport is alive but data quality or freshness is unacceptable.</summary>
        public static ProbeResult Degraded(Quality quality, DateTime timestamp, string message)
        {
            return new ProbeResult(ProbeStatus.DataDegraded, quality, timestamp, message, null);
        }

        /// <summary>The connection itself failed (not connected, COM error, timeout).</summary>
        public static ProbeResult TransportFailed(string message, Exception? ex = null)
        {
            return new ProbeResult(ProbeStatus.TransportFailure, null, null, message, ex);
        }
    }
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/Domain/ProbeResult.cs
|
|
|
|
|
git commit -m "feat: add ProbeResult domain type for connection health classification"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 1: Add `ProbeConnectionAsync` to `MxAccessClient`
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/Domain/IScadaClient.cs` — add `ProbeConnectionAsync` to interface
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.Connection.cs` — implement probe method
|
|
|
|
|
|
|
|
|
|
**Step 1: Add to IScadaClient interface**
|
|
|
|
|
|
|
|
|
|
In `IScadaClient.cs`, add after the `DisconnectAsync` method:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
|
|
|
|
|
/// Probes connection health by reading a test tag.
|
|
|
|
|
/// Returns a classified result: Healthy, TransportFailure, or DataDegraded.
|
|
|
|
|
/// </summary>
|
|
|
|
|
Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs, CancellationToken ct = default);
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Implement in MxAccessClient.Connection.cs**
|
|
|
|
|
|
|
|
|
|
Add before `MonitorConnectionAsync`:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
/// Probes the connection by reading a test tag with a timeout.
/// Classifies the result: transport failure (connection dead) vs data degraded
/// (connection alive, data bad or stale) vs healthy.
/// </summary>
/// <param name="testTagAddress">Tag address read as a liveness check.</param>
/// <param name="timeoutMs">Upper bound on the probe read, in milliseconds.</param>
/// <param name="ct">Caller cancellation; caller-initiated cancellation propagates as OperationCanceledException.</param>
public async Task<ProbeResult> ProbeConnectionAsync(string testTagAddress, int timeoutMs,
    CancellationToken ct = default)
{
    // Data older than this is reported as degraded even when quality is Good.
    // Named once so the check and the message cannot drift apart.
    TimeSpan staleThreshold = TimeSpan.FromMinutes(5);

    if (!IsConnected)
        return ProbeResult.TransportFailed("Not connected");

    try
    {
        using (var cts = CancellationTokenSource.CreateLinkedTokenSource(ct))
        {
            cts.CancelAfter(timeoutMs);

            Vtq vtq;
            try
            {
                vtq = await ReadAsync(testTagAddress, cts.Token);
            }
            catch (OperationCanceledException) when (!ct.IsCancellationRequested)
            {
                // Our timeout fired, not the caller's — treat as transport failure.
                return ProbeResult.TransportFailed("Probe read timed out after " + timeoutMs + "ms");
            }

            // Connection-level bad qualities mean the transport is effectively dead.
            if (vtq.Quality == Domain.Quality.Bad_NotConnected ||
                vtq.Quality == Domain.Quality.Bad_CommFailure)
            {
                return ProbeResult.TransportFailed("Probe returned " + vtq.Quality);
            }

            // Any other non-good quality: platform objects stopped, bad source, etc.
            // Stay connected; reconnecting would not help.
            if (!vtq.Quality.IsGood())
            {
                return ProbeResult.Degraded(vtq.Quality, vtq.Timestamp,
                    "Probe quality: " + vtq.Quality);
            }

            // Good quality but an old timestamp means data flow has silently stopped.
            // Assumes vtq.Timestamp is UTC — TODO confirm against ReadAsync.
            if (DateTime.UtcNow - vtq.Timestamp > staleThreshold)
            {
                return ProbeResult.Degraded(vtq.Quality, vtq.Timestamp,
                    "Probe data stale (>" + staleThreshold.TotalMinutes + "min)");
            }

            return ProbeResult.Healthy(vtq.Quality, vtq.Timestamp);
        }
    }
    catch (System.Runtime.InteropServices.COMException ex)
    {
        return ProbeResult.TransportFailed("COM exception: " + ex.Message, ex);
    }
    catch (InvalidOperationException ex) when (ex.Message.Contains("Not connected"))
    {
        // NOTE(review): fragile — filters on exception message text. Prefer a dedicated
        // exception type from ReadAsync if that API can be changed.
        return ProbeResult.TransportFailed(ex.Message, ex);
    }
    catch (Exception ex)
    {
        return ProbeResult.TransportFailed("Probe failed: " + ex.Message, ex);
    }
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 3: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/Domain/IScadaClient.cs
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.Connection.cs
|
|
|
|
|
git commit -m "feat: add ProbeConnectionAsync to MxAccessClient for active health probing"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 2: Add health check configuration
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/Configuration/LmxProxyConfiguration.cs` — add HealthCheckConfiguration class and property
|
|
|
|
|
|
|
|
|
|
**Step 1: Add HealthCheckConfiguration**
|
|
|
|
|
|
|
|
|
|
Add a new class in the Configuration namespace (can be in the same file or a new file — keep it simple, same file):
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
/// Settings for the active connection-liveness probe (bound from the
/// "HealthCheck" section of appsettings.json).
/// </summary>
public class HealthCheckConfiguration
{
    /// <summary>Address of the tag read on each probe cycle.</summary>
    public string TestTagAddress { get; set; } = "TestChildObject.TestBool";

    /// <summary>Upper bound on a single probe read, in milliseconds.</summary>
    public int ProbeTimeoutMs { get; set; } = 5000;

    /// <summary>Back-to-back transport failures that trigger a forced reconnect.</summary>
    public int MaxConsecutiveTransportFailures { get; set; } = 3;

    /// <summary>Probe interval while the connection is degraded, in milliseconds.</summary>
    public int DegradedProbeIntervalMs { get; set; } = 30000;
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Add to `LmxProxyConfiguration`:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>Health check / active probe settings.</summary>
|
|
|
|
|
public HealthCheckConfiguration HealthCheck { get; set; } = new HealthCheckConfiguration();
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Add to appsettings.json**
|
|
|
|
|
|
|
|
|
|
In the existing `appsettings.json`, add the `HealthCheck` section:
|
|
|
|
|
|
|
|
|
|
```json
|
|
|
|
|
"HealthCheck": {
|
|
|
|
|
"TestTagAddress": "TestChildObject.TestBool",
|
|
|
|
|
"ProbeTimeoutMs": 5000,
|
|
|
|
|
"MaxConsecutiveTransportFailures": 3,
|
|
|
|
|
"DegradedProbeIntervalMs": 30000
|
|
|
|
|
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 3: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/Configuration/LmxProxyConfiguration.cs
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/appsettings.json
|
|
|
|
|
git commit -m "feat: add HealthCheck configuration section for active connection probing"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 3: Rewrite `MonitorConnectionAsync` with active probing
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.cs` — add probe state fields
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.Connection.cs` — rewrite monitor loop
|
|
|
|
|
|
|
|
|
|
The monitor needs configuration passed in. The simplest approach: add constructor parameters for the probe settings alongside the existing ones.
|
|
|
|
|
|
|
|
|
|
**Step 1: Add probe fields to MxAccessClient.cs**
|
|
|
|
|
|
|
|
|
|
Add fields after the existing reconnect fields (around line 42):
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
// Probe configuration
|
|
|
|
|
private readonly string? _probeTestTagAddress;
|
|
|
|
|
private readonly int _probeTimeoutMs;
|
|
|
|
|
private readonly int _maxConsecutiveTransportFailures;
|
|
|
|
|
private readonly int _degradedProbeIntervalMs;
|
|
|
|
|
|
|
|
|
|
// Probe state
|
|
|
|
|
private int _consecutiveTransportFailures;
|
|
|
|
|
private bool _isDegraded;
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Add constructor parameters and assignments. After the existing `_galaxyName = galaxyName;` line:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
public MxAccessClient(
|
|
|
|
|
int maxConcurrentOperations = 10,
|
|
|
|
|
int readTimeoutSeconds = 5,
|
|
|
|
|
int writeTimeoutSeconds = 5,
|
|
|
|
|
int monitorIntervalSeconds = 5,
|
|
|
|
|
bool autoReconnect = true,
|
|
|
|
|
string? nodeName = null,
|
|
|
|
|
string? galaxyName = null,
|
|
|
|
|
string? probeTestTagAddress = null,
|
|
|
|
|
int probeTimeoutMs = 5000,
|
|
|
|
|
int maxConsecutiveTransportFailures = 3,
|
|
|
|
|
int degradedProbeIntervalMs = 30000)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
And in the body:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
_probeTestTagAddress = probeTestTagAddress;
|
|
|
|
|
_probeTimeoutMs = probeTimeoutMs;
|
|
|
|
|
_maxConsecutiveTransportFailures = maxConsecutiveTransportFailures;
|
|
|
|
|
_degradedProbeIntervalMs = degradedProbeIntervalMs;
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Rewrite MonitorConnectionAsync in MxAccessClient.Connection.cs**
|
|
|
|
|
|
|
|
|
|
Replace the existing `MonitorConnectionAsync` (lines 177-213) with:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
/// Auto-reconnect monitor loop with active health probing.
/// - If IsConnected is false: immediate reconnect (existing behavior).
/// - If IsConnected is true and probe configured: read test tag each interval.
/// - TransportFailure for N consecutive probes → forced disconnect + reconnect.
/// - DataDegraded → stay connected, back off probe interval, report degraded.
/// - Healthy → reset counters and resume normal interval.
/// </summary>
private async Task MonitorConnectionAsync(CancellationToken ct)
{
    Log.Information("Connection monitor loop started (interval={IntervalMs}ms, probe={ProbeEnabled})",
        _monitorIntervalMs, _probeTestTagAddress != null);

    while (!ct.IsCancellationRequested)
    {
        // Back off while degraded so we don't hammer a platform whose objects are stopped.
        var interval = _isDegraded ? _degradedProbeIntervalMs : _monitorIntervalMs;

        try
        {
            await Task.Delay(interval, ct);
        }
        catch (OperationCanceledException)
        {
            break;
        }

        // ── Case 1: Already disconnected ──
        if (!IsConnected)
        {
            _isDegraded = false;
            _consecutiveTransportFailures = 0;
            await AttemptReconnectAsync(ct);
            continue;
        }

        // ── Case 2: Connected, no probe configured — legacy behavior ──
        if (_probeTestTagAddress == null)
            continue;

        // ── Case 3: Connected, probe configured — active health check ──
        // FIX: ProbeConnectionAsync rethrows OperationCanceledException when the
        // caller's token is cancelled mid-read; exit the loop cleanly instead of
        // letting the exception fault the monitor task.
        ProbeResult probe;
        try
        {
            probe = await ProbeConnectionAsync(_probeTestTagAddress, _probeTimeoutMs, ct);
        }
        catch (OperationCanceledException)
        {
            break;
        }

        switch (probe.Status)
        {
            case ProbeStatus.Healthy:
                if (_isDegraded)
                {
                    Log.Information("Probe healthy — exiting degraded mode");
                    _isDegraded = false;
                }
                _consecutiveTransportFailures = 0;
                break;

            case ProbeStatus.DataDegraded:
                // Degraded resets the transport-failure streak: the transport works.
                _consecutiveTransportFailures = 0;
                if (!_isDegraded)
                {
                    Log.Warning("Probe degraded: {Message} — entering degraded mode (probe interval {IntervalMs}ms)",
                        probe.Message, _degradedProbeIntervalMs);
                    _isDegraded = true;
                }
                break;

            case ProbeStatus.TransportFailure:
                _isDegraded = false;
                _consecutiveTransportFailures++;
                Log.Warning("Probe transport failure ({Count}/{Max}): {Message}",
                    _consecutiveTransportFailures, _maxConsecutiveTransportFailures, probe.Message);

                if (_consecutiveTransportFailures >= _maxConsecutiveTransportFailures)
                {
                    Log.Warning("Max consecutive transport failures reached — forcing reconnect");
                    _consecutiveTransportFailures = 0;

                    try
                    {
                        await DisconnectAsync(ct);
                    }
                    catch (Exception ex)
                    {
                        Log.Warning(ex, "Error during forced disconnect before reconnect");
                        // DisconnectAsync already calls CleanupComObjectsAsync on error path
                    }

                    await AttemptReconnectAsync(ct);
                }
                break;
        }
    }

    Log.Information("Connection monitor loop exited");
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Single reconnect attempt: transitions to Reconnecting, then calls ConnectAsync.
/// Failures are logged and swallowed so the monitor loop retries at the next interval.
/// </summary>
private async Task AttemptReconnectAsync(CancellationToken ct)
{
    Log.Information("Attempting reconnect...");
    SetState(ConnectionState.Reconnecting);

    try
    {
        await ConnectAsync(ct);
        Log.Information("Reconnected to MxAccess successfully");
    }
    catch (OperationCanceledException)
    {
        // Let the outer loop handle cancellation
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: the monitor loop re-invokes this on the next tick.
        Log.Warning(ex, "Reconnect attempt failed, will retry at next interval");
    }
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 3: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.cs
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.Connection.cs
|
|
|
|
|
git commit -m "feat: rewrite monitor loop with active probing, transport vs degraded classification"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 4: Wire probe config through `LmxProxyService.Start()`
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/LmxProxyService.cs` — pass HealthCheck config to MxAccessClient constructor
|
|
|
|
|
|
|
|
|
|
**Step 1: Update MxAccessClient construction**
|
|
|
|
|
|
|
|
|
|
In `LmxProxyService.Start()`, update the MxAccessClient creation (around line 62) to pass the new parameters:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
_mxAccessClient = new MxAccessClient(
|
|
|
|
|
maxConcurrentOperations: _config.Connection.MaxConcurrentOperations,
|
|
|
|
|
readTimeoutSeconds: _config.Connection.ReadTimeoutSeconds,
|
|
|
|
|
writeTimeoutSeconds: _config.Connection.WriteTimeoutSeconds,
|
|
|
|
|
monitorIntervalSeconds: _config.Connection.MonitorIntervalSeconds,
|
|
|
|
|
autoReconnect: _config.Connection.AutoReconnect,
|
|
|
|
|
nodeName: _config.Connection.NodeName,
|
|
|
|
|
galaxyName: _config.Connection.GalaxyName,
|
|
|
|
|
probeTestTagAddress: _config.HealthCheck.TestTagAddress,
|
|
|
|
|
probeTimeoutMs: _config.HealthCheck.ProbeTimeoutMs,
|
|
|
|
|
maxConsecutiveTransportFailures: _config.HealthCheck.MaxConsecutiveTransportFailures,
|
|
|
|
|
degradedProbeIntervalMs: _config.HealthCheck.DegradedProbeIntervalMs);
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Update DetailedHealthCheckService to use shared probe**
|
|
|
|
|
|
|
|
|
|
In `LmxProxyService.Start()`, update the DetailedHealthCheckService construction (around line 114) to pass the test tag address from config:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
_detailedHealthCheckService = new DetailedHealthCheckService(
|
|
|
|
|
_mxAccessClient, _config.HealthCheck.TestTagAddress);
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 3: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/LmxProxyService.cs
|
|
|
|
|
git commit -m "feat: wire HealthCheck config to MxAccessClient and DetailedHealthCheckService"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 5: Add `UnsubscribeByAddressAsync` to `IScadaClient` and `MxAccessClient`
|
|
|
|
|
|
|
|
|
|
This is the foundation for removing handle-based unsubscribe from SubscriptionManager.
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/Domain/IScadaClient.cs` — add `UnsubscribeByAddressAsync`
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.Subscription.cs` — implement, change `UnsubscribeAsync` visibility
|
|
|
|
|
|
|
|
|
|
**Step 1: Add to IScadaClient**
|
|
|
|
|
|
|
|
|
|
After `SubscribeAsync`:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
|
|
|
|
|
/// Unsubscribes specific tag addresses. Removes from stored subscriptions
|
|
|
|
|
/// and COM state. Safe to call after reconnect — uses current handle mappings.
|
|
|
|
|
/// </summary>
|
|
|
|
|
Task UnsubscribeByAddressAsync(IEnumerable<string> addresses);
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Implement in MxAccessClient.Subscription.cs**
|
|
|
|
|
|
|
|
|
|
The existing `UnsubscribeAsync` (line 53) already does exactly this — it's just `internal`. Rename it or add a public wrapper:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
/// Unsubscribes specific addresses by address name.
/// Removes from both COM state and stored subscriptions (no reconnect replay).
/// </summary>
/// <param name="addresses">Tag addresses to unsubscribe; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="addresses"/> is null.</exception>
public async Task UnsubscribeByAddressAsync(IEnumerable<string> addresses)
{
    // Fail fast at the public boundary rather than deep inside the internal path.
    if (addresses == null)
        throw new ArgumentNullException(nameof(addresses));

    await UnsubscribeAsync(addresses);
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
This keeps the existing `internal UnsubscribeAsync` unchanged (it's still used by `SubscriptionHandle.DisposeAsync`).
|
|
|
|
|
|
|
|
|
|
**Step 3: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/Domain/IScadaClient.cs
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/MxAccess/MxAccessClient.Subscription.cs
|
|
|
|
|
git commit -m "feat: add UnsubscribeByAddressAsync to IScadaClient for address-based unsubscribe"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 6: Remove `_mxAccessHandles` from `SubscriptionManager`
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/Subscriptions/SubscriptionManager.cs`
|
|
|
|
|
|
|
|
|
|
**Step 1: Remove `_mxAccessHandles` field**
|
|
|
|
|
|
|
|
|
|
Delete lines 34-35:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
// REMOVE:
|
|
|
|
|
private readonly ConcurrentDictionary<string, IAsyncDisposable> _mxAccessHandles
|
|
|
|
|
= new ConcurrentDictionary<string, IAsyncDisposable>(StringComparer.OrdinalIgnoreCase);
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Rewrite `CreateMxAccessSubscriptionsAsync`**
|
|
|
|
|
|
|
|
|
|
The method no longer stores handles. It just calls `SubscribeAsync` to create the COM subscriptions. `MxAccessClient` stores them in `_storedSubscriptions` internally.
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
/// Creates COM-side subscriptions for the given addresses via IScadaClient.
/// MxAccessClient stores them internally for reconnect replay; no handles are kept here.
/// Failures are logged, not thrown — subscription setup is best-effort.
/// </summary>
private async Task CreateMxAccessSubscriptionsAsync(List<string> addresses)
{
    // Guard: nothing to subscribe — avoids a pointless call into the SCADA client.
    if (addresses == null || addresses.Count == 0)
        return;

    try
    {
        await _scadaClient.SubscribeAsync(
            addresses,
            (address, vtq) => OnTagValueChanged(address, vtq));
    }
    catch (Exception ex)
    {
        // Deliberate swallow: a failed batch must not tear down the manager.
        Log.Error(ex, "Failed to create MxAccess subscriptions for {Count} tags", addresses.Count);
    }
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 3: Rewrite unsubscribe logic in `UnsubscribeClient`**
|
|
|
|
|
|
|
|
|
|
Replace the handle disposal section (lines 198-212) with address-based unsubscribe:
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
// Unsubscribe tags with no remaining clients via address-based API
if (tagsToDispose.Count > 0)
{
    try
    {
        // NOTE(review): sync-over-async (.GetAwaiter().GetResult()) blocks a thread
        // and risks deadlock if this ever runs on a synchronization context —
        // TODO confirm UnsubscribeClient cannot be made async instead.
        _scadaClient.UnsubscribeByAddressAsync(tagsToDispose).GetAwaiter().GetResult();
    }
    catch (Exception ex)
    {
        // Best-effort: a failed COM unsubscribe must not break client disconnect handling.
        Log.Warning(ex, "Error unsubscribing {Count} tags from MxAccess", tagsToDispose.Count);
    }
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 4: Verify build**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
dotnet build src/ZB.MOM.WW.LmxProxy.Host
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Expected: Build succeeds. No references to `_mxAccessHandles` remain.
|
|
|
|
|
|
|
|
|
|
**Step 5: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/Subscriptions/SubscriptionManager.cs
|
|
|
|
|
git commit -m "fix: remove _mxAccessHandles from SubscriptionManager, use address-based unsubscribe"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 7: Wire `ConnectionStateChanged` for reconnect notification in `SubscriptionManager`
|
|
|
|
|
|
|
|
|
|
After reconnect, `RecreateStoredSubscriptionsAsync` recreates COM subscriptions, and `SubscriptionManager` continues to receive `OnTagValueChanged` callbacks because the callback references are preserved in `_storedSubscriptions`. However, we should notify subscribed clients that quality has been restored.
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/Subscriptions/SubscriptionManager.cs` — add `NotifyReconnection` method
|
|
|
|
|
- Modify: `src/ZB.MOM.WW.LmxProxy.Host/LmxProxyService.cs` — wire Connected state to SubscriptionManager
|
|
|
|
|
|
|
|
|
|
**Step 1: Add `NotifyReconnection` to SubscriptionManager**
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
/// <summary>
/// Logs reconnection for observability. Data flow resumes automatically
/// via MxAccessClient.RecreateStoredSubscriptionsAsync callbacks.
/// </summary>
public void NotifyReconnection()
{
    // NOTE(review): counts are read without synchronization — fine for a log line,
    // but may be momentarily stale if subscriptions change concurrently.
    Log.Information("MxAccess reconnected — subscriptions recreated, " +
        "data flow will resume via OnDataChange callbacks " +
        "({ClientCount} clients, {TagCount} tags)",
        _clientSubscriptions.Count, _tagSubscriptions.Count);
}
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 2: Wire in LmxProxyService.Start()**
|
|
|
|
|
|
|
|
|
|
Extend the existing `ConnectionStateChanged` handler (around line 97):
|
|
|
|
|
|
|
|
|
|
```csharp
|
|
|
|
|
// Route connection-state transitions to the SubscriptionManager.
_mxAccessClient.ConnectionStateChanged += (sender, e) =>
{
    if (e.CurrentState == Domain.ConnectionState.Disconnected ||
        e.CurrentState == Domain.ConnectionState.Error)
    {
        _subscriptionManager.NotifyDisconnection();
    }
    else if (e.CurrentState == Domain.ConnectionState.Connected &&
             e.PreviousState == Domain.ConnectionState.Reconnecting)
    {
        // Only fires for recoveries (Reconnecting → Connected); the initial
        // connect does not pass through Reconnecting, so it is not reported here.
        _subscriptionManager.NotifyReconnection();
    }
};
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
**Step 3: Commit**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/Subscriptions/SubscriptionManager.cs
|
|
|
|
|
git add src/ZB.MOM.WW.LmxProxy.Host/LmxProxyService.cs
|
|
|
|
|
git commit -m "feat: wire reconnection notification to SubscriptionManager for observability"
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Task 8: Build, deploy to windev, test
|
|
|
|
|
|
|
|
|
|
**Files:**
|
|
|
|
|
- No code changes — build and deployment task.
|
|
|
|
|
|
|
|
|
|
**Step 1: Build the solution**
|
|
|
|
|
|
|
|
|
|
```bash
|
|
|
|
|
dotnet build ZB.MOM.WW.LmxProxy.slnx
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
Expected: Clean build, no errors.
|
|
|
|
|
|
|
|
|
|
**Step 2: Deploy to windev**
|
|
|
|
|
|
|
|
|
|
Follow existing deployment procedure per `docker/README.md` or manual copy to windev.
|
|
|
|
|
|
|
|
|
|
**Step 3: Manual test — Gap 1 (active probing)**
|
|
|
|
|
|
|
|
|
|
1. Start the v2 service on windev. Verify logs show: `Connection monitor loop started (interval=5000ms, probe=True)`.
|
|
|
|
|
2. Verify probe runs: logs should show no warnings while platform is healthy.
|
|
|
|
|
3. Kill aaBootstrap on windev. Within 15-20s (3 probe failures at 5s intervals), logs should show:
|
|
|
|
|
- `Probe transport failure (1/3): Probe returned Bad_CommFailure` (or similar)
|
|
|
|
|
- `Probe transport failure (2/3): ...`
|
|
|
|
|
- `Probe transport failure (3/3): ...`
|
|
|
|
|
- `Max consecutive transport failures reached — forcing reconnect`
|
|
|
|
|
- `Attempting reconnect...`
|
|
|
|
|
4. After platform restart (but objects still stopped): Logs should show `Probe degraded` and `entering degraded mode`, then probe backs off to 30s interval. No reconnect churn.
|
|
|
|
|
5. After objects restart via SMC: Logs should show `Probe healthy — exiting degraded mode`.
|
|
|
|
|
|
|
|
|
|
**Step 4: Manual test — Gap 2 (subscription cleanup)**
|
|
|
|
|
|
|
|
|
|
1. Connect a gRPC client, subscribe to tags.
|
|
|
|
|
2. Kill aaBootstrap → client receives `Bad_NotConnected` quality.
|
|
|
|
|
3. Restart platform + objects. Verify client starts receiving Good quality updates again (via `RecreateStoredSubscriptionsAsync`).
|
|
|
|
|
4. Disconnect the client. Verify logs show `Unsubscribed from N tags` (address-based) with no handle disposal errors.
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
## Design Rationale
|
|
|
|
|
|
|
|
|
|
### Why two failure modes in the probe?
|
|
|
|
|
|
|
|
|
|
| Failure Mode | Cause | Correct Response |
|
|
|
|
|
|---|---|---|
|
|
|
|
|
| **Transport failure** | COM object dead, platform process crashed, MxAccess unreachable | Force disconnect + reconnect |
|
|
|
|
|
| **Data degraded** | COM session alive, AVEVA objects stopped, all reads return Bad quality | Stay connected, report degraded, back off probes |
|
|
|
|
|
|
|
|
|
|
Reconnecting on DataDegraded would churn COM objects with no benefit — the platform objects are stopped regardless of connection state. Observed: 40+ minutes of Bad quality after aaBootstrap crash until manual SMC restart.
|
|
|
|
|
|
|
|
|
|
### Why remove `_mxAccessHandles`?
|
|
|
|
|
|
|
|
|
|
1. **Batch handle bug**: `CreateMxAccessSubscriptionsAsync` stored the same `IAsyncDisposable` handle for every address in a batch. Disposing any one address disposed the entire batch, silently removing unrelated subscriptions from `_storedSubscriptions`.
|
|
|
|
|
2. **Stale after reconnect**: `RecreateStoredSubscriptionsAsync` recreates COM subscriptions but doesn't produce new `SubscriptionManager` handles. Old handles point to disposed COM state.
|
|
|
|
|
3. **Ownership violation**: `MxAccessClient` already owns subscription lifecycle via `_storedSubscriptions` and `_addressToHandle`. Duplicating ownership in `SubscriptionManager._mxAccessHandles` is a leaky abstraction.
|
|
|
|
|
|
|
|
|
|
The fix: `SubscriptionManager` owns client routing and ref counts only. `MxAccessClient` owns COM subscription lifecycle. Unsubscribe is by address, not by opaque handle.
|