fix(external-system-gateway): resolve ExternalSystemGateway-002/003 — apply HTTP call timeout, confirm CachedCall no double-dispatch
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-16 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `9c60592` |
|
||||
| Open findings | 13 |
|
||||
| Open findings | 11 |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -109,7 +109,7 @@ transient-retry paths. Fixed by the commit whose message references
|
||||
|--|--|
|
||||
| Severity | High |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:130`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:13` |
|
||||
|
||||
**Description**
|
||||
@@ -142,7 +142,24 @@ is classified as transient.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit `<pending>`). `InvokeHttpAsync` now enforces a call
|
||||
timeout: `ExternalSystemClient` takes an `IOptions<ExternalSystemGatewayOptions>` and
|
||||
links a `CancellationTokenSource(DefaultHttpTimeout)` with the caller's token before
|
||||
`SendAsync` and the response-body read, so the design's "timeout applies to the HTTP
|
||||
request round-trip" guarantee now holds within the configured window (default 30s)
|
||||
instead of `HttpClient`'s default 100s. A timeout is reclassified as a
|
||||
`TransientExternalSystemException`; a caller-initiated cancellation is distinguished
|
||||
from a timeout and propagated as `OperationCanceledException` rather than being
|
||||
swallowed as transient. Regression tests:
|
||||
`Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow` and
|
||||
`Call_CallerCancellation_IsNotMisreportedAsTimeout`.
|
||||
|
||||
Note (partial scope): the per-*system* `Timeout` field on `ExternalSystemDefinition`
|
||||
remains unimplemented — adding it requires a change to `ScadaLink.Commons`, which is
|
||||
outside this module's edit scope. Until that entity field exists, the configured
|
||||
`DefaultHttpTimeout` is the effective per-call limit for every system. A follow-up
|
||||
against the Commons module should add the `Timeout` field and have `InvokeHttpAsync`
|
||||
prefer it over the default. This is a tracked follow-up, not a regression.
|
||||
|
||||
### ExternalSystemGateway-003 — `CachedCall` double-dispatches the HTTP request
|
||||
|
||||
@@ -150,7 +167,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | High |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:84-117` |
|
||||
|
||||
**Description**
|
||||
@@ -179,7 +196,18 @@ the duplicated logic.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit `<pending>`). Re-triage: this finding was already fixed in
|
||||
the codebase as a side effect of the `ExternalSystemGateway-001` fix and is no longer
|
||||
reproducible against the current source. `StoreAndForwardService.EnqueueAsync` gained an
|
||||
`attemptImmediateDelivery` parameter (recommendation approach (b)), and
|
||||
`CachedCallAsync` passes `attemptImmediateDelivery: false` after its own first HTTP
|
||||
attempt — so `EnqueueAsync` buffers the message for the background retry sweep without
|
||||
re-invoking the registered delivery handler, eliminating the duplicate dispatch. A
|
||||
dedicated regression test, `CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler`,
|
||||
was added in this module's test suite: it registers a counting delivery handler, drives
|
||||
a `CachedCall` whose HTTP attempt fails transiently, and asserts the handler is invoked
|
||||
zero times during enqueue. The test was verified to fail if `attemptImmediateDelivery`
|
||||
is flipped back to `true`.
|
||||
|
||||
### ExternalSystemGateway-004 — System retry settings are not honoured for cached calls/writes
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ using System.Net.Http.Headers;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Entities.ExternalSystems;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
@@ -22,17 +23,20 @@ public class ExternalSystemClient : IExternalSystemClient
|
||||
private readonly IExternalSystemRepository _repository;
|
||||
private readonly StoreAndForwardService? _storeAndForward;
|
||||
private readonly ILogger<ExternalSystemClient> _logger;
|
||||
private readonly ExternalSystemGatewayOptions _options;
|
||||
|
||||
public ExternalSystemClient(
|
||||
IHttpClientFactory httpClientFactory,
|
||||
IExternalSystemRepository repository,
|
||||
ILogger<ExternalSystemClient> logger,
|
||||
StoreAndForwardService? storeAndForward = null)
|
||||
StoreAndForwardService? storeAndForward = null,
|
||||
IOptions<ExternalSystemGatewayOptions>? options = null)
|
||||
{
|
||||
_httpClientFactory = httpClientFactory;
|
||||
_repository = repository;
|
||||
_logger = logger;
|
||||
_storeAndForward = storeAndForward;
|
||||
_options = options?.Value ?? new ExternalSystemGatewayOptions();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -198,22 +202,59 @@ public class ExternalSystemClient : IExternalSystemClient
|
||||
}
|
||||
}
|
||||
|
||||
// Enforce the per-call timeout. ExternalSystemDefinition has no per-system
|
||||
// Timeout field yet, so the configured DefaultHttpTimeout is the effective
|
||||
// round-trip limit (the design's "timeout applies to the HTTP request
|
||||
// round-trip" guarantee). A linked CTS lets us distinguish a timeout from a
|
||||
// caller-initiated cancellation: only the timeout is reclassified as transient.
|
||||
using var timeoutCts = new CancellationTokenSource(_options.DefaultHttpTimeout);
|
||||
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
|
||||
cancellationToken, timeoutCts.Token);
|
||||
|
||||
HttpResponseMessage response;
|
||||
try
|
||||
{
|
||||
response = await client.SendAsync(request, cancellationToken);
|
||||
response = await client.SendAsync(request, linkedCts.Token);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
// The caller asked to abandon the work — do not reclassify as transient.
|
||||
throw;
|
||||
}
|
||||
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
|
||||
{
|
||||
// Our own timeout elapsed — a transient failure per the design.
|
||||
throw ErrorClassifier.AsTransient(
|
||||
$"Timeout calling {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
|
||||
}
|
||||
catch (Exception ex) when (ErrorClassifier.IsTransient(ex))
|
||||
{
|
||||
throw ErrorClassifier.AsTransient($"Connection error to {system.Name}: {ex.Message}", ex);
|
||||
}
|
||||
|
||||
if (response.IsSuccessStatusCode)
|
||||
// The timeout also covers reading the response body (the design's
|
||||
// "round-trip" guarantee), so the linked token is used for the read too.
|
||||
string body;
|
||||
try
|
||||
{
|
||||
return await response.Content.ReadAsStringAsync(cancellationToken);
|
||||
body = await response.Content.ReadAsStringAsync(linkedCts.Token);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
|
||||
{
|
||||
throw ErrorClassifier.AsTransient(
|
||||
$"Timeout reading response from {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
|
||||
}
|
||||
|
||||
var errorBody = await response.Content.ReadAsStringAsync(cancellationToken);
|
||||
if (response.IsSuccessStatusCode)
|
||||
{
|
||||
return body;
|
||||
}
|
||||
|
||||
var errorBody = body;
|
||||
|
||||
if (ErrorClassifier.IsTransient(response.StatusCode))
|
||||
{
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
using System.Net;
|
||||
using Microsoft.Data.Sqlite;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using NSubstitute;
|
||||
using ScadaLink.Commons.Entities.ExternalSystems;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.StoreAndForward;
|
||||
|
||||
namespace ScadaLink.ExternalSystemGateway.Tests;
|
||||
|
||||
@@ -216,6 +218,116 @@ public class ExternalSystemClientTests
|
||||
() => client.DeliverBufferedAsync(BufferedCall("TestAPI", "failMethod")));
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-003: CachedCall must not double-dispatch ──
|
||||
|
||||
[Fact]
|
||||
public async Task CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("postData", "POST", "/post") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
// The HTTP layer always fails transiently (500).
|
||||
var httpClient = new HttpClient(new MockHttpMessageHandler(HttpStatusCode.InternalServerError, "boom"));
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
// A real S&F service with a registered delivery handler that counts invocations.
|
||||
var dbName = $"EsgDoubleDispatch_{Guid.NewGuid():N}";
|
||||
var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";
|
||||
using var keepAlive = new SqliteConnection(connStr);
|
||||
keepAlive.Open();
|
||||
var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
|
||||
await storage.InitializeAsync();
|
||||
var sfOptions = new StoreAndForwardOptions
|
||||
{
|
||||
DefaultRetryInterval = TimeSpan.FromMinutes(10),
|
||||
RetryTimerInterval = TimeSpan.FromMinutes(10),
|
||||
};
|
||||
var sf = new StoreAndForwardService(storage, sfOptions, NullLogger<StoreAndForwardService>.Instance);
|
||||
|
||||
var handlerInvocations = 0;
|
||||
sf.RegisterDeliveryHandler(
|
||||
ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem,
|
||||
_ => { Interlocked.Increment(ref handlerInvocations); return Task.FromResult(false); });
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository,
|
||||
NullLogger<ExternalSystemClient>.Instance,
|
||||
storeAndForward: sf);
|
||||
|
||||
var result = await client.CachedCallAsync("TestAPI", "postData");
|
||||
|
||||
// The call already made one HTTP attempt; EnqueueAsync must NOT invoke the
|
||||
// registered handler again synchronously (which would dispatch a 2nd request).
|
||||
Assert.True(result.WasBuffered);
|
||||
Assert.Equal(0, handlerInvocations);
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-002: per-system call timeout ──
|
||||
|
||||
[Fact]
|
||||
public async Task Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
// Handler that hangs far longer than the configured timeout and the test budget.
|
||||
var httpClient = new HttpClient(new HangingHttpMessageHandler(TimeSpan.FromMinutes(10)));
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
// Configure a short timeout so the call must fail quickly.
|
||||
var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMilliseconds(200) };
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository,
|
||||
NullLogger<ExternalSystemClient>.Instance,
|
||||
options: Microsoft.Extensions.Options.Options.Create(options));
|
||||
|
||||
var sw = System.Diagnostics.Stopwatch.StartNew();
|
||||
var result = await client.CallAsync("TestAPI", "getData");
|
||||
sw.Stop();
|
||||
|
||||
Assert.False(result.Success);
|
||||
Assert.Contains("Transient error", result.ErrorMessage);
|
||||
Assert.Contains("Timeout", result.ErrorMessage);
|
||||
// Must fail near the configured 200ms, well before HttpClient's default 100s.
|
||||
Assert.True(sw.Elapsed < TimeSpan.FromSeconds(10),
|
||||
$"Call took {sw.Elapsed}, expected to time out near the configured 200ms window");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Call_CallerCancellation_IsNotMisreportedAsTimeout()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var httpClient = new HttpClient(new HangingHttpMessageHandler(TimeSpan.FromMinutes(10)));
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMinutes(5) };
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository,
|
||||
NullLogger<ExternalSystemClient>.Instance,
|
||||
options: Microsoft.Extensions.Options.Options.Create(options));
|
||||
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(200));
|
||||
|
||||
// Caller-initiated cancellation must surface as OperationCanceledException,
|
||||
// not be swallowed as a transient timeout error.
|
||||
await Assert.ThrowsAnyAsync<OperationCanceledException>(
|
||||
() => client.CallAsync("TestAPI", "getData", cancellationToken: cts.Token));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test helper: mock HTTP message handler.
|
||||
/// </summary>
|
||||
@@ -238,4 +350,20 @@ public class ExternalSystemClientTests
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test helper: an HTTP handler that hangs until cancelled (simulates a slow/hung system).
|
||||
/// </summary>
|
||||
private class HangingHttpMessageHandler : HttpMessageHandler
|
||||
{
|
||||
private readonly TimeSpan _delay;
|
||||
|
||||
public HangingHttpMessageHandler(TimeSpan delay) => _delay = delay;
|
||||
|
||||
protected override async Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
|
||||
{
|
||||
await Task.Delay(_delay, cancellationToken);
|
||||
return new HttpResponseMessage(HttpStatusCode.OK) { Content = new StringContent("{}") };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../../src/ScadaLink.ExternalSystemGateway/ScadaLink.ExternalSystemGateway.csproj" />
|
||||
<ProjectReference Include="../../src/ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
||||
<ProjectReference Include="../../src/ScadaLink.StoreAndForward/ScadaLink.StoreAndForward.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
||||
Reference in New Issue
Block a user