fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-16 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `9c60592` |
|
||||
| Open findings | 8 |
|
||||
| Open findings | 3 |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -187,7 +187,7 @@ fail against the pre-fix logic and pass after.
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Akka.NET conventions |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:42`, `src/ScadaLink.Communication/Actors/SiteCommunicationActor.cs:22` |
|
||||
|
||||
**Description**
|
||||
@@ -212,7 +212,17 @@ strategy), matching the documented decision and other coordinator actors.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed: neither
|
||||
`CentralCommunicationActor` nor `SiteCommunicationActor` overrode `SupervisorStrategy`,
|
||||
so child faults fell back to the Akka default (`Restart`). Note that an actor's own
|
||||
`SupervisorStrategy` governs its *children* — a transient child fault would `Restart`
|
||||
the child and discard its in-memory state, contrary to the CLAUDE.md "Resume for
|
||||
coordinator actors" decision. Fix: both actors now override `SupervisorStrategy()` to
|
||||
return a `OneForOneStrategy` with an unbounded `Decider` resolving to `Directive.Resume`
|
||||
(mirroring `DataConnectionManagerActor`). Regression tests
|
||||
`CoordinatorSupervisionTests.CentralCommunicationActor_SupervisorStrategy_IsResume` and
|
||||
`CoordinatorSupervisionTests.SiteCommunicationActor_SupervisorStrategy_IsResume` fail
|
||||
against the pre-fix code (decider yields `Restart`) and pass after.
|
||||
|
||||
### Communication-005 — gRPC keepalive and max-stream-lifetime options are defined but never applied
|
||||
|
||||
@@ -220,7 +230,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Design-document adherence |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClient.cs:25`, `src/ScadaLink.Communication/CommunicationOptions.cs:36` |
|
||||
|
||||
**Description**
|
||||
@@ -249,7 +259,22 @@ option and update the design doc.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed: `SiteStreamGrpcClient`
|
||||
hard-coded the keepalive values, `GrpcMaxStreamLifetime` was referenced nowhere, and
|
||||
`GrpcMaxConcurrentStreams` was never bound to the server. Fix (scoped to
|
||||
`src/ScadaLink.Communication`): `SiteStreamGrpcClient` gained a constructor taking
|
||||
`CommunicationOptions` and now applies `GrpcKeepAlivePingDelay`/`GrpcKeepAlivePingTimeout`
|
||||
to its `SocketsHttpHandler`; `SiteStreamGrpcClientFactory` gained an
|
||||
`IOptions<CommunicationOptions>` DI constructor and flows the options into every client
|
||||
it creates; `SiteStreamGrpcServer` gained an `IOptions<CommunicationOptions>` DI
|
||||
constructor that binds `GrpcMaxConcurrentStreams` and implements the documented 4-hour
|
||||
session timeout via `CancellationTokenSource.CancelAfter(GrpcMaxStreamLifetime)` on the
|
||||
per-stream CTS. The Host's existing `AddSingleton<SiteStreamGrpcServer>()` registration
|
||||
resolves the new DI constructor via greedy resolution — no Host change required.
|
||||
Regression tests `GrpcOptionsWiringTests.SiteStreamGrpcClient_AppliesKeepAliveFromOptions`,
|
||||
`GrpcOptionsWiringTests.SiteStreamGrpcClientFactory_FlowsOptionsToCreatedClients`, and
|
||||
`GrpcOptionsWiringTests.SiteStreamGrpcServer_BindsMaxConcurrentStreamsAndLifetimeFromOptions`
|
||||
exercise the wiring (they require the new members to even compile).
|
||||
|
||||
### Communication-006 — Site address load failures are silently swallowed, leaving a stale cache
|
||||
|
||||
@@ -257,7 +282,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs:204` |
|
||||
|
||||
**Description**
|
||||
@@ -279,7 +304,16 @@ is down". Optionally surface a health metric for repeated load failures.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed: a faulted
|
||||
`LoadSiteAddressesFromDb` task is piped to `Self` as a `Status.Failure`, but the actor
|
||||
had no handler for it — the failure became an unhandled message (debug-level only) and
|
||||
the periodic refresh failed silently. Fix: added a `Receive<Status.Failure>` handler
|
||||
that logs the load failure at `Warning` with the underlying exception as the cause, so
|
||||
operators can distinguish a missing-addresses configuration from a database outage.
|
||||
Regression test
|
||||
`CentralCommunicationActorTests.LoadSiteAddressesFailure_IsLoggedNotSilentlySwallowed`
|
||||
(repository query throws) asserts the Warning is emitted — it produces no warning
|
||||
against the pre-fix code and passes after.
|
||||
|
||||
### Communication-007 — `SiteStreamGrpcClientFactory.Dispose` blocks on async work (sync-over-async)
|
||||
|
||||
@@ -287,7 +321,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Performance & resource management |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.Communication/Grpc/SiteStreamGrpcClientFactory.cs:53` |
|
||||
|
||||
**Description**
|
||||
@@ -308,7 +342,17 @@ path, or document why blocking is safe here.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed: `Dispose()` called
|
||||
`DisposeAsync().AsTask().GetAwaiter().GetResult()`, the classic sync-over-async pattern.
|
||||
Fix: `SiteStreamGrpcClient` now also implements `IDisposable` with a synchronous
|
||||
`Dispose()` that releases its CancellationTokenSources and underlying `GrpcChannel`
|
||||
directly (all of that teardown is inherently synchronous); `SiteStreamGrpcClientFactory.Dispose()`
|
||||
now disposes each cached client via that synchronous path with no blocking on the async
|
||||
path. A `CreateClient` seam was extracted so the test can substitute a tracking client
|
||||
while still exercising the factory's real caching/disposal machinery. Regression test
|
||||
`SiteStreamGrpcClientFactoryDisposeTests.Dispose_DisposesClientsSynchronously_NotViaAsyncPath`
|
||||
fails against the pre-fix code (clients disposed via `DisposeAsync`) and passes after;
|
||||
`Dispose_DoesNotDeadlock_UnderSingleThreadedSynchronizationContext` guards the stall path.
|
||||
|
||||
### Communication-008 — Reconnect retry-count reset can mask a flapping stream indefinitely
|
||||
|
||||
@@ -316,7 +360,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:71`, `src/ScadaLink.Communication/Actors/DebugStreamBridgeActor.cs:174` |
|
||||
|
||||
**Description**
|
||||
@@ -339,7 +383,21 @@ reconnects regardless of intervening events.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed: `_retryCount` was reset to
|
||||
0 on every received `AttributeValueChanged`/`AlarmStateChanged`, so a stream that
|
||||
connected, delivered one event, then failed — repeatedly — never tripped `MaxRetries`.
|
||||
Fix (recommendation option a): the per-event reset was removed; instead `OpenGrpcStream`
|
||||
arms a single `StabilityWindow` timer (60s default, internal-settable for tests), and
|
||||
only when it fires (`GrpcStreamStable`) — i.e. the stream stayed up long enough to be
|
||||
considered recovered — is `_retryCount` reset. `HandleGrpcError` cancels that timer, so
|
||||
a stream that fails before the window elapses does not recover its retry budget. A
|
||||
flapping stream therefore terminates after `MaxRetries` regardless of intervening
|
||||
events. Regression test
|
||||
`DebugStreamBridgeActorTests.FlappingStream_DeliveringEventsBetweenFailures_StillTerminatesAfterMaxRetries`
|
||||
fails against the pre-fix code (actor never terminates) and passes after;
|
||||
`RetryCount_RecoveredOnlyAfterStreamStaysStableForStabilityWindow` verifies the budget
|
||||
is recovered after a stable interval. The pre-existing test that codified the buggy
|
||||
per-event reset (`Grpc_Error_Resets_RetryCount_On_Successful_Event`) was replaced.
|
||||
|
||||
### Communication-009 — `_siteClients` field is mutable and reassignable; cache update is not atomic on failure
|
||||
|
||||
|
||||
@@ -84,6 +84,15 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
// Periodic refresh trigger
|
||||
Receive<RefreshSiteAddresses>(_ => LoadSiteAddressesFromDb());
|
||||
|
||||
// Communication-006: a faulted LoadSiteAddressesFromDb task is piped here as a
|
||||
// Status.Failure. Without this handler the failure was an unhandled message
|
||||
// (debug-level only) and the refresh failed silently — operators could not
|
||||
// distinguish "no sites configured" from "database is down". Log at Warning.
|
||||
Receive<Status.Failure>(failure =>
|
||||
_log.Warning(failure.Cause,
|
||||
"Failed to load site addresses from the database; the site ClusterClient "
|
||||
+ "cache was not refreshed and may be stale or empty"));
|
||||
|
||||
// Health monitoring: heartbeats and health reports from sites
|
||||
Receive<HeartbeatMessage>(HandleHeartbeat);
|
||||
Receive<SiteHealthReport>(HandleSiteHealthReport);
|
||||
@@ -296,6 +305,25 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Coordinator supervision strategy (CLAUDE.md: "Resume for coordinator actors").
|
||||
/// CentralCommunicationActor is a long-lived coordinator that owns the per-site
|
||||
/// ClusterClient map; a transient fault in a child (e.g. a ClusterClient child)
|
||||
/// must Resume so the child's connection state is preserved rather than wiped by
|
||||
/// a Restart.
|
||||
/// </summary>
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
{
|
||||
return new OneForOneStrategy(
|
||||
maxNrOfRetries: -1,
|
||||
withinTimeRange: Timeout.InfiniteTimeSpan,
|
||||
decider: Decider.From(ex =>
|
||||
{
|
||||
_log.Warning(ex, "Child actor of CentralCommunicationActor faulted, resuming (state preserved)");
|
||||
return Directive.Resume;
|
||||
}));
|
||||
}
|
||||
|
||||
protected override void PreStart()
|
||||
{
|
||||
_log.Info("CentralCommunicationActor started");
|
||||
|
||||
@@ -28,7 +28,19 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
|
||||
private const int MaxRetries = 3;
|
||||
private const string ReconnectTimerKey = "grpc-reconnect";
|
||||
private const string StabilityTimerKey = "grpc-stability";
|
||||
internal static TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <summary>
|
||||
/// How long a freshly-opened gRPC stream must stay up before its retry budget
|
||||
/// is considered "recovered" and <see cref="_retryCount"/> is reset to 0.
|
||||
/// Communication-008: the retry count must NOT be reset by individual events —
|
||||
/// a stream that connects, delivers one event, then fails repeatedly would
|
||||
/// otherwise reconnect forever and never trip <see cref="MaxRetries"/>. Resetting
|
||||
/// only after a stable interval bounds a flapping stream.
|
||||
/// </summary>
|
||||
internal static TimeSpan StabilityWindow { get; set; } = TimeSpan.FromSeconds(60);
|
||||
|
||||
private int _retryCount;
|
||||
private bool _useNodeA = true;
|
||||
private bool _stopped;
|
||||
@@ -66,16 +78,21 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
OpenGrpcStream();
|
||||
});
|
||||
|
||||
// Domain events arriving via Self.Tell from gRPC callback
|
||||
Receive<AttributeValueChanged>(changed =>
|
||||
{
|
||||
_retryCount = 0; // Successful event resets retry count
|
||||
_onEvent(changed);
|
||||
});
|
||||
Receive<AlarmStateChanged>(changed =>
|
||||
// Domain events arriving via Self.Tell from gRPC callback.
|
||||
// Communication-008: receiving an event must NOT reset _retryCount — a
|
||||
// flapping stream that delivers a single event between failures would
|
||||
// otherwise never trip MaxRetries. The retry budget is recovered only by
|
||||
// GrpcStreamStable (a stream that has stayed up for StabilityWindow).
|
||||
Receive<AttributeValueChanged>(changed => _onEvent(changed));
|
||||
Receive<AlarmStateChanged>(changed => _onEvent(changed));
|
||||
|
||||
// Stream has been stably connected for StabilityWindow — recover the
|
||||
// retry budget so a future transient fault gets a fresh set of retries.
|
||||
Receive<GrpcStreamStable>(_ =>
|
||||
{
|
||||
if (_stopped) return;
|
||||
_retryCount = 0;
|
||||
_onEvent(changed);
|
||||
_log.Debug("gRPC stream for {0} stable, retry count reset", _instanceUniqueName);
|
||||
});
|
||||
|
||||
// gRPC stream error — attempt reconnection
|
||||
@@ -151,6 +168,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
_grpcCts?.Dispose();
|
||||
_grpcCts = new CancellationTokenSource();
|
||||
|
||||
// Arm the stability timer: if the stream stays up for StabilityWindow the
|
||||
// retry budget is recovered (Communication-008). Cancelled by HandleGrpcError.
|
||||
Timers.StartSingleTimer(StabilityTimerKey, new GrpcStreamStable(), StabilityWindow);
|
||||
|
||||
var client = _grpcFactory.GetOrCreate(_siteIdentifier, endpoint);
|
||||
var self = Self;
|
||||
var ct = _grpcCts.Token;
|
||||
@@ -171,6 +192,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
if (_stopped) return;
|
||||
|
||||
// The stream failed before reaching the stability window — its retry
|
||||
// budget is NOT recovered (Communication-008).
|
||||
Timers.Cancel(StabilityTimerKey);
|
||||
|
||||
_retryCount++;
|
||||
|
||||
if (_retryCount > MaxRetries)
|
||||
@@ -239,3 +264,10 @@ internal record GrpcStreamError(Exception Exception);
|
||||
/// Internal message to trigger gRPC stream reconnection.
|
||||
/// </summary>
|
||||
internal record ReconnectGrpcStream;
|
||||
|
||||
/// <summary>
|
||||
/// Internal message indicating the current gRPC stream has been connected long
|
||||
/// enough (<see cref="DebugStreamBridgeActor.StabilityWindow"/>) to be considered
|
||||
/// stable, so the reconnect retry budget can be recovered.
|
||||
/// </summary>
|
||||
internal record GrpcStreamStable;
|
||||
|
||||
@@ -177,6 +177,24 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Coordinator supervision strategy (CLAUDE.md: "Resume for coordinator actors").
|
||||
/// SiteCommunicationActor is a long-lived coordinator routing all message
|
||||
/// patterns to local handlers; a transient fault in a child must Resume so the
|
||||
/// child's in-memory state is preserved rather than discarded by a Restart.
|
||||
/// </summary>
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
{
|
||||
return new OneForOneStrategy(
|
||||
maxNrOfRetries: -1,
|
||||
withinTimeRange: Timeout.InfiniteTimeSpan,
|
||||
decider: Decider.From(ex =>
|
||||
{
|
||||
_log.Warning(ex, "Child actor of SiteCommunicationActor faulted, resuming (state preserved)");
|
||||
return Directive.Resume;
|
||||
}));
|
||||
}
|
||||
|
||||
protected override void PreStart()
|
||||
{
|
||||
_log.Info("SiteCommunicationActor started for site {0}", _siteId);
|
||||
|
||||
@@ -13,21 +13,45 @@ namespace ScadaLink.Communication.Grpc;
|
||||
/// SiteStreamGrpcServer. The central-side DebugStreamBridgeActor uses this
|
||||
/// to open server-streaming calls for individual instances.
|
||||
/// </summary>
|
||||
public class SiteStreamGrpcClient : IAsyncDisposable
|
||||
public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
|
||||
{
|
||||
private readonly GrpcChannel? _channel;
|
||||
private readonly SiteStreamService.SiteStreamServiceClient? _client;
|
||||
private readonly ILogger? _logger;
|
||||
private readonly ConcurrentDictionary<string, CancellationTokenSource> _subscriptions = new();
|
||||
|
||||
/// <summary>
|
||||
/// The HTTP/2 keepalive ping delay actually applied to this client's channel.
|
||||
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
|
||||
/// </summary>
|
||||
internal TimeSpan KeepAlivePingDelay { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The HTTP/2 keepalive ping timeout actually applied to this client's channel.
|
||||
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
|
||||
/// </summary>
|
||||
internal TimeSpan KeepAlivePingTimeout { get; }
|
||||
|
||||
public SiteStreamGrpcClient(string endpoint, ILogger logger)
|
||||
: this(endpoint, logger, new CommunicationOptions())
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a client whose HTTP/2 keepalive is taken from <see cref="CommunicationOptions"/>
|
||||
/// rather than hard-coded, satisfying the design doc's "gRPC Connection Keepalive"
|
||||
/// section which states these values are configurable.
|
||||
/// </summary>
|
||||
public SiteStreamGrpcClient(string endpoint, ILogger logger, CommunicationOptions options)
|
||||
{
|
||||
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay;
|
||||
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout;
|
||||
_channel = GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
|
||||
{
|
||||
HttpHandler = new SocketsHttpHandler
|
||||
{
|
||||
KeepAlivePingDelay = TimeSpan.FromSeconds(15),
|
||||
KeepAlivePingTimeout = TimeSpan.FromSeconds(10),
|
||||
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay,
|
||||
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout,
|
||||
KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always
|
||||
}
|
||||
});
|
||||
@@ -205,7 +229,13 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
||||
_ => AlarmLevel.None
|
||||
};
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
/// <summary>
|
||||
/// Releases all subscription CancellationTokenSources and the underlying
|
||||
/// gRPC channel. All teardown here is synchronous (CTS disposal and
|
||||
/// <see cref="GrpcChannel.Dispose"/>), so a synchronous <see cref="Dispose"/>
|
||||
/// can release everything without sync-over-async blocking.
|
||||
/// </summary>
|
||||
private void ReleaseResources()
|
||||
{
|
||||
foreach (var cts in _subscriptions.Values)
|
||||
{
|
||||
@@ -214,9 +244,22 @@ public class SiteStreamGrpcClient : IAsyncDisposable
|
||||
}
|
||||
_subscriptions.Clear();
|
||||
|
||||
if (_channel is not null)
|
||||
_channel.Dispose();
|
||||
_channel?.Dispose();
|
||||
}
|
||||
|
||||
await ValueTask.CompletedTask;
|
||||
public virtual ValueTask DisposeAsync()
|
||||
{
|
||||
ReleaseResources();
|
||||
return ValueTask.CompletedTask;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Synchronous disposal. All resources held by this client are released
|
||||
/// synchronously, so callers (e.g. <see cref="SiteStreamGrpcClientFactory.Dispose"/>)
|
||||
/// need not block on the async disposal path.
|
||||
/// </summary>
|
||||
public virtual void Dispose()
|
||||
{
|
||||
ReleaseResources();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ScadaLink.Communication.Grpc;
|
||||
|
||||
@@ -12,22 +13,43 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, SiteStreamGrpcClient> _clients = new();
|
||||
private readonly ILoggerFactory _loggerFactory;
|
||||
private readonly CommunicationOptions _options;
|
||||
|
||||
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory)
|
||||
: this(loggerFactory, Options.Create(new CommunicationOptions()))
|
||||
{
|
||||
_loggerFactory = loggerFactory;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns an existing client for the site or creates a new one.
|
||||
/// DI constructor — flows <see cref="CommunicationOptions"/> into every created
|
||||
/// <see cref="SiteStreamGrpcClient"/> so the configured gRPC keepalive settings
|
||||
/// are applied rather than hard-coded defaults.
|
||||
/// </summary>
|
||||
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory, IOptions<CommunicationOptions> options)
|
||||
{
|
||||
_loggerFactory = loggerFactory;
|
||||
_options = options.Value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns an existing client for the site or creates a new one. The new
|
||||
/// client is created via <see cref="CreateClient"/> and tracked so the
|
||||
/// factory's <see cref="Dispose"/> / <see cref="DisposeAsync"/> release it.
|
||||
/// </summary>
|
||||
public virtual SiteStreamGrpcClient GetOrCreate(string siteIdentifier, string grpcEndpoint)
|
||||
{
|
||||
return _clients.GetOrAdd(siteIdentifier, _ =>
|
||||
{
|
||||
var logger = _loggerFactory.CreateLogger<SiteStreamGrpcClient>();
|
||||
return new SiteStreamGrpcClient(grpcEndpoint, logger);
|
||||
});
|
||||
return _clients.GetOrAdd(siteIdentifier, _ => CreateClient(grpcEndpoint));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a single <see cref="SiteStreamGrpcClient"/>. Overridable so tests
|
||||
/// can substitute a tracking client while still exercising the factory's real
|
||||
/// caching and disposal machinery.
|
||||
/// </summary>
|
||||
protected virtual SiteStreamGrpcClient CreateClient(string grpcEndpoint)
|
||||
{
|
||||
var logger = _loggerFactory.CreateLogger<SiteStreamGrpcClient>();
|
||||
return new SiteStreamGrpcClient(grpcEndpoint, logger, _options);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -50,8 +72,19 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
|
||||
_clients.Clear();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Synchronous disposal. Communication-007: this used to block on
|
||||
/// <c>DisposeAsync().AsTask().GetAwaiter().GetResult()</c> (sync-over-async,
|
||||
/// a stall/deadlock risk during host shutdown). Each
|
||||
/// <see cref="SiteStreamGrpcClient"/> releases all of its resources
|
||||
/// synchronously, so we dispose them directly with no async path.
|
||||
/// </summary>
|
||||
public void Dispose()
|
||||
{
|
||||
DisposeAsync().AsTask().GetAwaiter().GetResult();
|
||||
foreach (var client in _clients.Values)
|
||||
{
|
||||
client.Dispose();
|
||||
}
|
||||
_clients.Clear();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ using System.Threading.Channels;
|
||||
using Akka.Actor;
|
||||
using Grpc.Core;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using GrpcStatus = Grpc.Core.Status;
|
||||
|
||||
namespace ScadaLink.Communication.Grpc;
|
||||
@@ -19,6 +20,7 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
private readonly ILogger<SiteStreamGrpcServer> _logger;
|
||||
private readonly ConcurrentDictionary<string, StreamEntry> _activeStreams = new();
|
||||
private readonly int _maxConcurrentStreams;
|
||||
private readonly TimeSpan _maxStreamLifetime;
|
||||
private volatile bool _ready;
|
||||
private long _actorCounter;
|
||||
|
||||
@@ -26,10 +28,36 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
ISiteStreamSubscriber streamSubscriber,
|
||||
ILogger<SiteStreamGrpcServer> logger,
|
||||
int maxConcurrentStreams = 100)
|
||||
: this(streamSubscriber, logger, maxConcurrentStreams, TimeSpan.FromHours(4))
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// DI constructor — binds <see cref="CommunicationOptions.GrpcMaxConcurrentStreams"/>
|
||||
/// and <see cref="CommunicationOptions.GrpcMaxStreamLifetime"/> so the documented
|
||||
/// concurrency limit and the 4-hour zombie-stream session timeout are honoured
|
||||
/// rather than hard-coded.
|
||||
/// </summary>
|
||||
public SiteStreamGrpcServer(
|
||||
ISiteStreamSubscriber streamSubscriber,
|
||||
ILogger<SiteStreamGrpcServer> logger,
|
||||
IOptions<CommunicationOptions> options)
|
||||
: this(streamSubscriber, logger,
|
||||
options.Value.GrpcMaxConcurrentStreams,
|
||||
options.Value.GrpcMaxStreamLifetime)
|
||||
{
|
||||
}
|
||||
|
||||
private SiteStreamGrpcServer(
|
||||
ISiteStreamSubscriber streamSubscriber,
|
||||
ILogger<SiteStreamGrpcServer> logger,
|
||||
int maxConcurrentStreams,
|
||||
TimeSpan maxStreamLifetime)
|
||||
{
|
||||
_streamSubscriber = streamSubscriber;
|
||||
_logger = logger;
|
||||
_maxConcurrentStreams = maxConcurrentStreams;
|
||||
_maxStreamLifetime = maxStreamLifetime;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -49,6 +77,12 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
/// </summary>
|
||||
public int ActiveStreamCount => _activeStreams.Count;
|
||||
|
||||
/// <summary>Effective max concurrent stream limit. Exposed for tests.</summary>
|
||||
internal int MaxConcurrentStreams => _maxConcurrentStreams;
|
||||
|
||||
/// <summary>Effective per-stream session lifetime. Exposed for tests.</summary>
|
||||
internal TimeSpan MaxStreamLifetime => _maxStreamLifetime;
|
||||
|
||||
public override async Task SubscribeInstance(
|
||||
InstanceStreamRequest request,
|
||||
IServerStreamWriter<SiteStreamEvent> responseStream,
|
||||
@@ -69,6 +103,11 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
throw new RpcException(new GrpcStatus(StatusCode.ResourceExhausted, "Max concurrent streams reached"));
|
||||
|
||||
using var streamCts = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken);
|
||||
// Session timeout (design doc "gRPC Connection Keepalive": 4-hour third layer
|
||||
// of dead-client detection) — forces a long-lived zombie stream to terminate
|
||||
// even if keepalive PINGs never detect the loss.
|
||||
if (_maxStreamLifetime > TimeSpan.Zero && _maxStreamLifetime != Timeout.InfiniteTimeSpan)
|
||||
streamCts.CancelAfter(_maxStreamLifetime);
|
||||
var entry = new StreamEntry(streamCts);
|
||||
_activeStreams[request.CorrelationId] = entry;
|
||||
|
||||
|
||||
@@ -197,6 +197,31 @@ public class CentralCommunicationActorTests : TestKit
|
||||
Assert.Equal("dep2", ((DeployInstanceCommand)msg2.Message).DeploymentId);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void LoadSiteAddressesFailure_IsLoggedNotSilentlySwallowed()
|
||||
{
|
||||
// Regression test for Communication-006. When the repository query throws,
|
||||
// PipeTo delivers a Status.Failure to the actor. Without a Receive<Status.Failure>
|
||||
// handler the failure becomes an unhandled message (debug-level only) and the
|
||||
// periodic refresh fails silently — operators cannot tell "no addresses
|
||||
// configured" from "database is down". The fix logs the failure at Warning.
|
||||
var mockRepo = Substitute.For<ISiteRepository>();
|
||||
mockRepo.GetAllSitesAsync(Arg.Any<CancellationToken>())
|
||||
.Returns<Task<IReadOnlyList<Site>>>(_ => throw new InvalidOperationException("database is down"));
|
||||
|
||||
var services = new ServiceCollection();
|
||||
services.AddScoped(_ => mockRepo);
|
||||
var sp = services.BuildServiceProvider();
|
||||
|
||||
var mockFactory = Substitute.For<ISiteClientFactory>();
|
||||
|
||||
// The fix logs a Warning carrying the InvalidOperationException as the cause.
|
||||
EventFilter.Warning(contains: "Failed to load site addresses from the database").ExpectOne(() =>
|
||||
{
|
||||
Sys.ActorOf(Props.Create(() => new CentralCommunicationActor(sp, mockFactory)));
|
||||
});
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void BothContactPoints_UsedInSingleClient()
|
||||
{
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
using Akka.Actor;
|
||||
using Akka.TestKit.Xunit2;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using NSubstitute;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Communication.Actors;
|
||||
|
||||
namespace ScadaLink.Communication.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Regression tests for Communication-004 — coordinator actors must declare an
|
||||
/// explicit <c>Resume</c> supervision strategy per the CLAUDE.md decision
|
||||
/// ("Resume for coordinator actors"). A child fault under the default
|
||||
/// (Restart) strategy would wipe a child's in-memory state; the long-lived
|
||||
/// coordinators own per-site ClusterClients and must not silently discard
|
||||
/// their children on a transient fault.
|
||||
/// </summary>
|
||||
public class CoordinatorSupervisionTests : TestKit
|
||||
{
|
||||
/// <summary>
|
||||
/// Test-only subclass that exposes the protected <see cref="SupervisorStrategy"/>
|
||||
/// so the configured directive can be asserted directly.
|
||||
/// </summary>
|
||||
private sealed class CentralCommunicationActorProbe : CentralCommunicationActor
|
||||
{
|
||||
public CentralCommunicationActorProbe(IServiceProvider sp, ISiteClientFactory factory)
|
||||
: base(sp, factory) { }
|
||||
|
||||
public SupervisorStrategy GetSupervisorStrategy() => SupervisorStrategy();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test-only subclass that exposes the protected <see cref="SupervisorStrategy"/>.
|
||||
/// </summary>
|
||||
private sealed class SiteCommunicationActorProbe : SiteCommunicationActor
|
||||
{
|
||||
public SiteCommunicationActorProbe(string siteId, CommunicationOptions options, IActorRef dm)
|
||||
: base(siteId, options, dm) { }
|
||||
|
||||
public SupervisorStrategy GetSupervisorStrategy() => SupervisorStrategy();
|
||||
}
|
||||
|
||||
private static IServiceProvider EmptyServiceProvider()
|
||||
{
|
||||
var mockRepo = Substitute.For<ISiteRepository>();
|
||||
mockRepo.GetAllSitesAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<Commons.Entities.Sites.Site>());
|
||||
var services = new ServiceCollection();
|
||||
services.AddScoped(_ => mockRepo);
|
||||
return services.BuildServiceProvider();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CentralCommunicationActor_SupervisorStrategy_IsResume()
|
||||
{
|
||||
var sp = EmptyServiceProvider();
|
||||
var factory = Substitute.For<ISiteClientFactory>();
|
||||
|
||||
var actorRef = new Akka.TestKit.TestActorRef<CentralCommunicationActorProbe>(
|
||||
Sys, Props.Create(() => new CentralCommunicationActorProbe(sp, factory)));
|
||||
|
||||
var strategy = actorRef.UnderlyingActor.GetSupervisorStrategy();
|
||||
|
||||
var oneForOne = Assert.IsType<OneForOneStrategy>(strategy);
|
||||
var directive = oneForOne.Decider.Decide(new InvalidOperationException("transient child fault"));
|
||||
Assert.Equal(Directive.Resume, directive);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SiteCommunicationActor_SupervisorStrategy_IsResume()
|
||||
{
|
||||
var dmProbe = CreateTestProbe();
|
||||
|
||||
var actorRef = new Akka.TestKit.TestActorRef<SiteCommunicationActorProbe>(
|
||||
Sys, Props.Create(() => new SiteCommunicationActorProbe("site1", new CommunicationOptions(), dmProbe.Ref)));
|
||||
|
||||
var strategy = actorRef.UnderlyingActor.GetSupervisorStrategy();
|
||||
|
||||
var oneForOne = Assert.IsType<OneForOneStrategy>(strategy);
|
||||
var directive = oneForOne.Decider.Decide(new InvalidOperationException("transient child fault"));
|
||||
Assert.Equal(Directive.Resume, directive);
|
||||
}
|
||||
}
|
||||
@@ -22,6 +22,9 @@ public class DebugStreamBridgeActorTests : TestKit
|
||||
{
|
||||
// Use a very short reconnect delay for testing
|
||||
DebugStreamBridgeActor.ReconnectDelay = TimeSpan.FromMilliseconds(100);
|
||||
// Long stability window so streams are never considered "stable" mid-test
|
||||
// unless a test deliberately waits it out.
|
||||
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
|
||||
}
|
||||
|
||||
private record TestContext(
|
||||
@@ -264,8 +267,13 @@ public class DebugStreamBridgeActorTests : TestKit
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Grpc_Error_Resets_RetryCount_On_Successful_Event()
|
||||
public void FlappingStream_DeliveringEventsBetweenFailures_StillTerminatesAfterMaxRetries()
|
||||
{
|
||||
// Communication-008 regression: a stream that connects, delivers an event,
|
||||
// then fails — repeatedly — must still trip MaxRetries. The retry count is
|
||||
// NO LONGER reset by a received event (only by the stability window). The
|
||||
// previous behaviour reset _retryCount on every event, so a flapping site
|
||||
// reconnected forever and the debug session lived on indefinitely.
|
||||
var ctx = CreateBridgeActor();
|
||||
ctx.CommProbe.ExpectMsg<SiteEnvelope>();
|
||||
|
||||
@@ -275,30 +283,72 @@ public class DebugStreamBridgeActorTests : TestKit
|
||||
new List<AlarmStateChanged>(),
|
||||
DateTimeOffset.UtcNow);
|
||||
|
||||
Watch(ctx.BridgeActor);
|
||||
ctx.BridgeActor.Tell(snapshot);
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
|
||||
|
||||
// First error → retry 1
|
||||
ctx.MockGrpcClient.SubscribeCalls[0].OnError(new Exception("Error 1"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 2, TimeSpan.FromSeconds(5));
|
||||
|
||||
// Simulate successful event (resets retry count)
|
||||
var attrChange = new AttributeValueChanged(InstanceName, "IO", "Temp", 42.5, "Good", DateTimeOffset.UtcNow);
|
||||
ctx.MockGrpcClient.SubscribeCalls[1].OnEvent(attrChange);
|
||||
AwaitCondition(() => { lock (ctx.ReceivedEvents) { return ctx.ReceivedEvents.Count == 2; } },
|
||||
TimeSpan.FromSeconds(3));
|
||||
|
||||
// Now another 3 errors should be tolerated (retry count was reset)
|
||||
ctx.MockGrpcClient.SubscribeCalls[1].OnError(new Exception("Error 2"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 3, TimeSpan.FromSeconds(5));
|
||||
// Flap: deliver one event then fail, three times. Each event would, under
|
||||
// the old buggy logic, reset the retry budget and prevent termination.
|
||||
for (var i = 0; i < 3; i++)
|
||||
{
|
||||
var call = ctx.MockGrpcClient.SubscribeCalls[i];
|
||||
call.OnEvent(attrChange);
|
||||
call.OnError(new Exception($"Flap {i + 1}"));
|
||||
var expected = i + 2;
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == expected, TimeSpan.FromSeconds(5));
|
||||
}
|
||||
|
||||
ctx.MockGrpcClient.SubscribeCalls[2].OnError(new Exception("Error 3"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 4, TimeSpan.FromSeconds(5));
|
||||
// Fourth error (after the 3 retries) must exceed MaxRetries and terminate.
|
||||
ctx.MockGrpcClient.SubscribeCalls[3].OnEvent(attrChange);
|
||||
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Flap 4"));
|
||||
|
||||
ctx.MockGrpcClient.SubscribeCalls[3].OnError(new Exception("Error 4"));
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 5, TimeSpan.FromSeconds(5));
|
||||
ExpectTerminated(ctx.BridgeActor, TimeSpan.FromSeconds(5));
|
||||
Assert.True(ctx.TerminatedFlag[0]);
|
||||
}
|
||||
|
||||
// Still alive — 3 retries from the second failure point succeeded
|
||||
[Fact]
|
||||
public void RetryCount_RecoveredOnlyAfterStreamStaysStableForStabilityWindow()
|
||||
{
|
||||
// Communication-008: after a stream has been connected for the stability
|
||||
// window, the retry budget is recovered — a later transient fault then gets
|
||||
// a fresh set of retries rather than being counted against the old budget.
|
||||
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromMilliseconds(300);
|
||||
try
|
||||
{
|
||||
var ctx = CreateBridgeActor();
|
||||
ctx.CommProbe.ExpectMsg<SiteEnvelope>();
|
||||
|
||||
var snapshot = new DebugViewSnapshot(
|
||||
InstanceName,
|
||||
new List<AttributeValueChanged>(),
|
||||
new List<AlarmStateChanged>(),
|
||||
DateTimeOffset.UtcNow);
|
||||
|
||||
Watch(ctx.BridgeActor);
|
||||
ctx.BridgeActor.Tell(snapshot);
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
|
||||
|
||||
// Two failures — but each new stream stays up long enough (the mock
|
||||
// stream only completes on cancel) for the stability window to elapse
|
||||
// and reset the retry budget before the next failure.
|
||||
for (var i = 0; i < 5; i++)
|
||||
{
|
||||
Thread.Sleep(450); // exceed the 300ms stability window
|
||||
ctx.MockGrpcClient.SubscribeCalls[i].OnError(new Exception($"Error {i + 1}"));
|
||||
var expected = i + 2;
|
||||
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == expected, TimeSpan.FromSeconds(5));
|
||||
}
|
||||
|
||||
// Five well-spaced failures did NOT terminate the actor because each
|
||||
// reconnect recovered its retry budget after the stability window.
|
||||
Assert.False(ctx.TerminatedFlag[0]);
|
||||
}
|
||||
finally
|
||||
{
|
||||
DebugStreamBridgeActor.StabilityWindow = TimeSpan.FromSeconds(30);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using NSubstitute;
|
||||
using ScadaLink.Communication;
|
||||
using ScadaLink.Communication.Grpc;
|
||||
|
||||
namespace ScadaLink.Communication.Tests.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// Regression tests for Communication-005 — the gRPC keepalive and
|
||||
/// max-stream-lifetime / max-concurrent-stream options defined on
|
||||
/// <see cref="CommunicationOptions"/> must actually be applied to the
|
||||
/// gRPC client and server rather than hard-coded.
|
||||
/// </summary>
|
||||
public class GrpcOptionsWiringTests
|
||||
{
|
||||
[Fact]
|
||||
public void SiteStreamGrpcClient_AppliesKeepAliveFromOptions()
|
||||
{
|
||||
var options = new CommunicationOptions
|
||||
{
|
||||
GrpcKeepAlivePingDelay = TimeSpan.FromSeconds(42),
|
||||
GrpcKeepAlivePingTimeout = TimeSpan.FromSeconds(7)
|
||||
};
|
||||
|
||||
var client = new SiteStreamGrpcClient(
|
||||
"http://localhost:9999", NullLogger<SiteStreamGrpcClient>.Instance, options);
|
||||
|
||||
Assert.Equal(TimeSpan.FromSeconds(42), client.KeepAlivePingDelay);
|
||||
Assert.Equal(TimeSpan.FromSeconds(7), client.KeepAlivePingTimeout);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SiteStreamGrpcClientFactory_FlowsOptionsToCreatedClients()
|
||||
{
|
||||
var options = new CommunicationOptions
|
||||
{
|
||||
GrpcKeepAlivePingDelay = TimeSpan.FromSeconds(33),
|
||||
GrpcKeepAlivePingTimeout = TimeSpan.FromSeconds(11)
|
||||
};
|
||||
|
||||
using var factory = new SiteStreamGrpcClientFactory(
|
||||
NullLoggerFactory.Instance, Options.Create(options));
|
||||
|
||||
var client = factory.GetOrCreate("site1", "http://localhost:9999");
|
||||
|
||||
Assert.Equal(TimeSpan.FromSeconds(33), client.KeepAlivePingDelay);
|
||||
Assert.Equal(TimeSpan.FromSeconds(11), client.KeepAlivePingTimeout);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SiteStreamGrpcServer_BindsMaxConcurrentStreamsAndLifetimeFromOptions()
|
||||
{
|
||||
var options = new CommunicationOptions
|
||||
{
|
||||
GrpcMaxConcurrentStreams = 250,
|
||||
GrpcMaxStreamLifetime = TimeSpan.FromHours(2)
|
||||
};
|
||||
|
||||
var subscriber = Substitute.For<ISiteStreamSubscriber>();
|
||||
var server = new SiteStreamGrpcServer(
|
||||
subscriber, NullLogger<SiteStreamGrpcServer>.Instance, Options.Create(options));
|
||||
|
||||
Assert.Equal(250, server.MaxConcurrentStreams);
|
||||
Assert.Equal(TimeSpan.FromHours(2), server.MaxStreamLifetime);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ScadaLink.Communication.Grpc;
|
||||
|
||||
namespace ScadaLink.Communication.Tests.Grpc;
|
||||
|
||||
/// <summary>
|
||||
/// Regression tests for Communication-007 — the factory's synchronous
|
||||
/// <see cref="SiteStreamGrpcClientFactory.Dispose"/> must not block on the
|
||||
/// async disposal path (sync-over-async). It must dispose each client through
|
||||
/// the client's synchronous <see cref="SiteStreamGrpcClient.Dispose"/>.
|
||||
/// </summary>
|
||||
public class SiteStreamGrpcClientFactoryDisposeTests
|
||||
{
|
||||
/// <summary>
|
||||
/// Test client that records whether it was disposed via the sync or async path.
|
||||
/// </summary>
|
||||
private sealed class TrackingClient : SiteStreamGrpcClient
|
||||
{
|
||||
public bool SyncDisposeCalled { get; private set; }
|
||||
public bool AsyncDisposeCalled { get; private set; }
|
||||
|
||||
public override void Dispose() => SyncDisposeCalled = true;
|
||||
|
||||
public override ValueTask DisposeAsync()
|
||||
{
|
||||
AsyncDisposeCalled = true;
|
||||
return ValueTask.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test factory that hands out <see cref="TrackingClient"/> instances while
|
||||
/// still exercising the base factory's real caching and disposal machinery.
|
||||
/// </summary>
|
||||
private sealed class TrackingFactory : SiteStreamGrpcClientFactory
|
||||
{
|
||||
private readonly ConcurrentBag<TrackingClient> _created = new();
|
||||
|
||||
public TrackingFactory() : base(NullLoggerFactory.Instance) { }
|
||||
|
||||
public IReadOnlyCollection<TrackingClient> Created => _created.ToList();
|
||||
|
||||
protected override SiteStreamGrpcClient CreateClient(string grpcEndpoint)
|
||||
{
|
||||
var client = new TrackingClient();
|
||||
_created.Add(client);
|
||||
return client;
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Dispose_DisposesClientsSynchronously_NotViaAsyncPath()
|
||||
{
|
||||
var factory = new TrackingFactory();
|
||||
factory.GetOrCreate("site-a", "http://localhost:5100");
|
||||
factory.GetOrCreate("site-b", "http://localhost:5200");
|
||||
|
||||
factory.Dispose();
|
||||
|
||||
Assert.NotEmpty(factory.Created);
|
||||
Assert.All(factory.Created, c =>
|
||||
{
|
||||
Assert.True(c.SyncDisposeCalled, "client should be disposed via synchronous Dispose()");
|
||||
Assert.False(c.AsyncDisposeCalled, "synchronous Dispose() must not route through DisposeAsync()");
|
||||
});
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Dispose_DoesNotDeadlock_UnderSingleThreadedSynchronizationContext()
|
||||
{
|
||||
// A strict single-threaded SynchronizationContext: continuations posted to
|
||||
// it are only pumped by the worker loop. Sync-over-async (blocking the only
|
||||
// thread on an async continuation that needs that same thread) deadlocks here.
|
||||
using var ctx = new SingleThreadSyncContext();
|
||||
Exception? captured = null;
|
||||
var done = new ManualResetEventSlim();
|
||||
|
||||
ctx.Post(_ =>
|
||||
{
|
||||
try
|
||||
{
|
||||
var factory = new SiteStreamGrpcClientFactory(NullLoggerFactory.Instance);
|
||||
factory.GetOrCreate("site-a", "http://localhost:5100");
|
||||
factory.Dispose();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
captured = ex;
|
||||
}
|
||||
finally
|
||||
{
|
||||
done.Set();
|
||||
}
|
||||
}, null);
|
||||
|
||||
Assert.True(done.Wait(TimeSpan.FromSeconds(5)),
|
||||
"factory.Dispose() did not complete — likely a sync-over-async deadlock");
|
||||
Assert.Null(captured);
|
||||
}
|
||||
|
||||
/// <summary>Minimal single-threaded synchronization context for the deadlock test.</summary>
|
||||
private sealed class SingleThreadSyncContext : SynchronizationContext, IDisposable
|
||||
{
|
||||
private readonly BlockingCollection<(SendOrPostCallback cb, object? state)> _queue = new();
|
||||
private readonly Thread _thread;
|
||||
|
||||
public SingleThreadSyncContext()
|
||||
{
|
||||
_thread = new Thread(Run) { IsBackground = true };
|
||||
_thread.Start();
|
||||
}
|
||||
|
||||
private void Run()
|
||||
{
|
||||
SetSynchronizationContext(this);
|
||||
foreach (var (cb, state) in _queue.GetConsumingEnumerable())
|
||||
cb(state);
|
||||
}
|
||||
|
||||
public override void Post(SendOrPostCallback d, object? state) => _queue.Add((d, state));
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_queue.CompleteAdding();
|
||||
_thread.Join(TimeSpan.FromSeconds(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user