fix(communication): resolve Communication-001 — early stream termination handling
DebugStreamService.StartStreamAsync awaited the initial debug snapshot inside a try block whose only handler was catch (OperationCanceledException). When the stream terminated before the snapshot arrived, onTerminatedWrapper completed the awaited task with an InvalidOperationException, which bypassed that handler: the caller got a raw, untranslated exception, and the service did no teardown of its own on that path. The handler is now catch (Exception). It removes the session entry, sends StopDebugStream to the bridge actor via the local reference (deterministic, idempotent teardown), and throws a descriptive exception: a TimeoutException when the wait is cancelled (the 30s timeout; cancellation of the caller's token is linked into the same source and is reported the same way), otherwise an InvalidOperationException that names the instance and site and wraps the cause. Re-triaged Critical -> Medium: the originally claimed multi-minute site-side resource leak does not occur, because the bridge actor self-terminates on every onTerminated path. Adds the first DebugStreamService test, which fails against the pre-fix code.
This commit is contained in:
@@ -127,20 +127,36 @@ public class DebugStreamService
|
||||
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
timeoutCts.CancelAfter(TimeSpan.FromSeconds(30));
|
||||
|
||||
DebugViewSnapshot snapshot;
|
||||
try
|
||||
{
|
||||
var snapshot = await snapshotTcs.Task.WaitAsync(timeoutCts.Token);
|
||||
|
||||
_logger.LogInformation("Debug stream {SessionId} started for {Instance} on site {Site}",
|
||||
sessionId, instanceUniqueName, siteIdentifier);
|
||||
|
||||
return new DebugStreamSession(sessionId, snapshot);
|
||||
snapshot = await snapshotTcs.Task.WaitAsync(timeoutCts.Token);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
catch (Exception ex)
|
||||
{
|
||||
StopStream(sessionId);
|
||||
throw new TimeoutException($"Timed out waiting for debug snapshot from {instanceUniqueName} on site {siteIdentifier}.");
|
||||
// Any failure before the snapshot arrives — the 30s timeout, or the stream
|
||||
// terminating early (site disconnect / gRPC failure, surfaced by
|
||||
// onTerminatedWrapper as an InvalidOperationException) — must deterministically
|
||||
// tear down the bridge actor and its site-side subscription. Use the local
|
||||
// actor reference: a racing onTerminatedWrapper may already have removed the
|
||||
// session, which would make StopStream a no-op. StopDebugStream is idempotent
|
||||
// (the actor may already be stopping itself).
|
||||
_sessions.TryRemove(sessionId, out _);
|
||||
bridgeActor.Tell(new StopDebugStream());
|
||||
|
||||
if (ex is OperationCanceledException)
|
||||
throw new TimeoutException(
|
||||
$"Timed out waiting for debug snapshot from {instanceUniqueName} on site {siteIdentifier}.");
|
||||
|
||||
throw new InvalidOperationException(
|
||||
$"Debug stream for {instanceUniqueName} on site {siteIdentifier} terminated before a snapshot was received.",
|
||||
ex);
|
||||
}
|
||||
|
||||
_logger.LogInformation("Debug stream {SessionId} started for {Instance} on site {Site}",
|
||||
sessionId, instanceUniqueName, siteIdentifier);
|
||||
|
||||
return new DebugStreamSession(sessionId, snapshot);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
Reference in New Issue
Block a user