fix(external-system-gateway): resolve ExternalSystemGateway-002/003 — apply HTTP call timeout, confirm CachedCall no double-dispatch

This commit is contained in:
Joseph Doherty
2026-05-16 19:40:40 -04:00
parent ab098bf6c8
commit 340a70f0e6
4 changed files with 208 additions and 10 deletions

View File

@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
| Open findings | 13 |
| Open findings | 11 |
## Summary
@@ -109,7 +109,7 @@ transient-retry paths. Fixed by the commit whose message references
|--|--|
| Severity | High |
| Category | Error handling & resilience |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:130`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:13` |
**Description**
@@ -142,7 +142,24 @@ is classified as transient.
**Resolution**
_Unresolved._
Resolved 2026-05-16 (commit `<pending>`). `InvokeHttpAsync` now enforces a call
timeout: `ExternalSystemClient` takes an `IOptions<ExternalSystemGatewayOptions>` and
links a `CancellationTokenSource(DefaultHttpTimeout)` with the caller's token before
`SendAsync` and the response-body read, so the design's "timeout applies to the HTTP
request round-trip" guarantee now holds within the configured window (default 30s)
instead of `HttpClient`'s default 100s. A timeout is reclassified as a
`TransientExternalSystemException`; a caller-initiated cancellation is distinguished
from a timeout and propagated as `OperationCanceledException` rather than being
swallowed as transient. Regression tests:
`Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow` and
`Call_CallerCancellation_IsNotMisreportedAsTimeout`.
Note (partial scope): the per-*system* `Timeout` field on `ExternalSystemDefinition`
remains unimplemented — adding it requires a change to `ScadaLink.Commons`, which is
outside this module's edit scope. Until that entity field exists, the configured
`DefaultHttpTimeout` is the effective per-call limit for every system. A follow-up
against the Commons module should add the `Timeout` field and have `InvokeHttpAsync`
prefer it over the default. This is a tracked follow-up, not a regression.
### ExternalSystemGateway-003 — `CachedCall` double-dispatches the HTTP request
@@ -150,7 +167,7 @@ _Unresolved._
|--|--|
| Severity | High |
| Category | Correctness & logic bugs |
| Status | Open |
| Status | Resolved |
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:84-117` |
**Description**
@@ -179,7 +196,18 @@ the duplicated logic.
**Resolution**
_Unresolved._
Resolved 2026-05-16 (commit `<pending>`). Re-triage: this finding was already fixed in
the codebase as a side effect of the `ExternalSystemGateway-001` fix and is no longer
reproducible against the current source. `StoreAndForwardService.EnqueueAsync` gained an
`attemptImmediateDelivery` parameter (recommendation approach (b)), and
`CachedCallAsync` passes `attemptImmediateDelivery: false` after its own first HTTP
attempt — so `EnqueueAsync` buffers the message for the background retry sweep without
re-invoking the registered delivery handler, eliminating the duplicate dispatch. A
dedicated regression test, `CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler`,
was added in this module's test suite: it registers a counting delivery handler, drives
a `CachedCall` whose HTTP attempt fails transiently, and asserts the handler is invoked
zero times during enqueue. The test was verified to fail if `attemptImmediateDelivery`
is flipped back to `true`.
### ExternalSystemGateway-004 — System retry settings are not honoured for cached calls/writes

View File

@@ -3,6 +3,7 @@ using System.Net.Http.Headers;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Entities.ExternalSystems;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Interfaces.Services;
@@ -22,17 +23,20 @@ public class ExternalSystemClient : IExternalSystemClient
private readonly IExternalSystemRepository _repository;
private readonly StoreAndForwardService? _storeAndForward;
private readonly ILogger<ExternalSystemClient> _logger;
private readonly ExternalSystemGatewayOptions _options;
/// <summary>
/// Creates the client and stores its collaborators in the instance fields.
/// </summary>
/// <param name="httpClientFactory">Factory used to obtain <see cref="HttpClient"/> instances.</param>
/// <param name="repository">Repository for external-system definitions and their methods.</param>
/// <param name="logger">Logger for this client.</param>
/// <param name="storeAndForward">Optional store-and-forward service; stored as-is and may be <c>null</c>.</param>
/// <param name="options">
/// Optional gateway options. When omitted (or <c>null</c>), a default-constructed
/// <see cref="ExternalSystemGatewayOptions"/> is used instead.
/// </param>
public ExternalSystemClient(
IHttpClientFactory httpClientFactory,
IExternalSystemRepository repository,
ILogger<ExternalSystemClient> logger,
StoreAndForwardService? storeAndForward = null)
StoreAndForwardService? storeAndForward = null,
IOptions<ExternalSystemGatewayOptions>? options = null)
{
_httpClientFactory = httpClientFactory;
_repository = repository;
_logger = logger;
_storeAndForward = storeAndForward;
// Fall back to default options so callers that do not supply any still get a usable configuration.
_options = options?.Value ?? new ExternalSystemGatewayOptions();
}
/// <summary>
@@ -198,22 +202,59 @@ public class ExternalSystemClient : IExternalSystemClient
}
}
// Enforce the per-call timeout. ExternalSystemDefinition has no per-system
// Timeout field yet, so the configured DefaultHttpTimeout is the effective
// round-trip limit (the design's "timeout applies to the HTTP request
// round-trip" guarantee). A linked CTS lets us distinguish a timeout from a
// caller-initiated cancellation: only the timeout is reclassified as transient.
using var timeoutCts = new CancellationTokenSource(_options.DefaultHttpTimeout);
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
cancellationToken, timeoutCts.Token);
HttpResponseMessage response;
try
{
response = await client.SendAsync(request, cancellationToken);
response = await client.SendAsync(request, linkedCts.Token);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// The caller asked to abandon the work — do not reclassify as transient.
throw;
}
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
{
// Our own timeout elapsed — a transient failure per the design.
throw ErrorClassifier.AsTransient(
$"Timeout calling {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
}
catch (Exception ex) when (ErrorClassifier.IsTransient(ex))
{
throw ErrorClassifier.AsTransient($"Connection error to {system.Name}: {ex.Message}", ex);
}
if (response.IsSuccessStatusCode)
// The timeout also covers reading the response body (the design's
// "round-trip" guarantee), so the linked token is used for the read too.
string body;
try
{
return await response.Content.ReadAsStringAsync(cancellationToken);
body = await response.Content.ReadAsStringAsync(linkedCts.Token);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
throw;
}
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
{
throw ErrorClassifier.AsTransient(
$"Timeout reading response from {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
}
var errorBody = await response.Content.ReadAsStringAsync(cancellationToken);
if (response.IsSuccessStatusCode)
{
return body;
}
var errorBody = body;
if (ErrorClassifier.IsTransient(response.StatusCode))
{

View File

@@ -1,8 +1,10 @@
using System.Net;
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging.Abstractions;
using NSubstitute;
using ScadaLink.Commons.Entities.ExternalSystems;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.StoreAndForward;
namespace ScadaLink.ExternalSystemGateway.Tests;
@@ -216,6 +218,116 @@ public class ExternalSystemClientTests
() => client.DeliverBufferedAsync(BufferedCall("TestAPI", "failMethod")));
}
// ── ExternalSystemGateway-003: CachedCall must not double-dispatch ──
/// <summary>
/// Regression test for ExternalSystemGateway-003: when a cached call's own HTTP attempt
/// fails with HTTP 500, the call must be buffered without the registered store-and-forward
/// delivery handler being invoked during enqueue.
/// </summary>
[Fact]
public async Task CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler()
{
    // Arrange: one external system exposing a single POST method.
    var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
    var method = new ExternalSystemMethod("postData", "POST", "/post") { Id = 1, ExternalSystemDefinitionId = 1 };
    _repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
        .Returns(new List<ExternalSystemDefinition> { system });
    _repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
        .Returns(new List<ExternalSystemMethod> { method });

    // The HTTP layer always fails transiently (500).
    // HttpClient is IDisposable; scope it to the test so it (and its handler) are released.
    using var httpClient = new HttpClient(new MockHttpMessageHandler(HttpStatusCode.InternalServerError, "boom"));
    _httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);

    // A real S&F service with a registered delivery handler that counts invocations.
    // The database name is unique per run; keepAlive holds a connection open for the
    // duration of the test.
    var dbName = $"EsgDoubleDispatch_{Guid.NewGuid():N}";
    var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";
    using var keepAlive = new SqliteConnection(connStr);
    keepAlive.Open();
    var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
    await storage.InitializeAsync();

    // Ten-minute intervals keep any background retry activity well outside the test's lifetime.
    var sfOptions = new StoreAndForwardOptions
    {
        DefaultRetryInterval = TimeSpan.FromMinutes(10),
        RetryTimerInterval = TimeSpan.FromMinutes(10),
    };
    var sf = new StoreAndForwardService(storage, sfOptions, NullLogger<StoreAndForwardService>.Instance);
    var handlerInvocations = 0;
    sf.RegisterDeliveryHandler(
        ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem,
        _ => { Interlocked.Increment(ref handlerInvocations); return Task.FromResult(false); });

    var client = new ExternalSystemClient(
        _httpClientFactory, _repository,
        NullLogger<ExternalSystemClient>.Instance,
        storeAndForward: sf);

    // Act
    var result = await client.CachedCallAsync("TestAPI", "postData");

    // Assert: the call already made one HTTP attempt; EnqueueAsync must NOT invoke the
    // registered handler again synchronously (which would dispatch a 2nd request).
    Assert.True(result.WasBuffered);
    Assert.Equal(0, handlerInvocations);
}
// ── ExternalSystemGateway-002: HTTP call timeout (gateway-wide DefaultHttpTimeout) ──
/// <summary>
/// Regression test for ExternalSystemGateway-002: a call to a system that never responds
/// must fail with a transient "Timeout" error once the configured
/// <c>DefaultHttpTimeout</c> elapses, rather than waiting for the handler to finish.
/// </summary>
[Fact]
public async Task Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow()
{
    // Arrange: one external system exposing a single GET method.
    var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
    var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 };
    _repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
        .Returns(new List<ExternalSystemDefinition> { system });
    _repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
        .Returns(new List<ExternalSystemMethod> { method });

    // Handler that hangs far longer than the configured timeout and the test budget.
    // HttpClient is IDisposable; scope it to the test so it (and its handler) are released.
    using var httpClient = new HttpClient(new HangingHttpMessageHandler(TimeSpan.FromMinutes(10)));
    _httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);

    // Configure a short timeout so the call must fail quickly.
    var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMilliseconds(200) };
    var client = new ExternalSystemClient(
        _httpClientFactory, _repository,
        NullLogger<ExternalSystemClient>.Instance,
        options: Microsoft.Extensions.Options.Options.Create(options));

    // Act
    var sw = System.Diagnostics.Stopwatch.StartNew();
    var result = await client.CallAsync("TestAPI", "getData");
    sw.Stop();

    // Assert: the failure is reported as a transient timeout.
    Assert.False(result.Success);
    Assert.Contains("Transient error", result.ErrorMessage);
    Assert.Contains("Timeout", result.ErrorMessage);

    // The call should end shortly after the configured 200ms. The 10s bound is deliberately
    // generous to tolerate slow test machines while still being far below both the handler's
    // 10-minute hang and HttpClient's default 100s timeout.
    Assert.True(sw.Elapsed < TimeSpan.FromSeconds(10),
        $"Call took {sw.Elapsed}, expected to time out near the configured 200ms window");
}
/// <summary>
/// Regression test for ExternalSystemGateway-002: when the caller's own token is cancelled
/// while the request is in flight, the call must surface an
/// <see cref="OperationCanceledException"/> instead of returning a transient timeout result.
/// </summary>
[Fact]
public async Task Call_CallerCancellation_IsNotMisreportedAsTimeout()
{
    // Arrange: one external system exposing a single GET method.
    var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
    var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 };
    _repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
        .Returns(new List<ExternalSystemDefinition> { system });
    _repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
        .Returns(new List<ExternalSystemMethod> { method });

    // The handler hangs until cancelled, so only a cancellation can end the request.
    // HttpClient is IDisposable; scope it to the test so it (and its handler) are released.
    using var httpClient = new HttpClient(new HangingHttpMessageHandler(TimeSpan.FromMinutes(10)));
    _httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);

    // The configured timeout (5 minutes) is far longer than the caller's 200ms token below,
    // so the caller's cancellation is the one that fires.
    var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMinutes(5) };
    var client = new ExternalSystemClient(
        _httpClientFactory, _repository,
        NullLogger<ExternalSystemClient>.Instance,
        options: Microsoft.Extensions.Options.Options.Create(options));

    using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(200));

    // Act + Assert: caller-initiated cancellation must surface as OperationCanceledException,
    // not be swallowed as a transient timeout error.
    await Assert.ThrowsAnyAsync<OperationCanceledException>(
        () => client.CallAsync("TestAPI", "getData", cancellationToken: cts.Token));
}
/// <summary>
/// Test helper: mock HTTP message handler.
/// </summary>
@@ -238,4 +350,20 @@ public class ExternalSystemClientTests
});
}
}
/// <summary>
/// Test helper: an HTTP handler that waits for a fixed duration (or until the request is
/// cancelled) before answering, to simulate a slow or hung external system.
/// </summary>
private class HangingHttpMessageHandler : HttpMessageHandler
{
    private readonly TimeSpan _hangDuration;

    public HangingHttpMessageHandler(TimeSpan delay)
    {
        _hangDuration = delay;
    }

    /// <summary>
    /// Waits for the configured duration, observing <paramref name="cancellationToken"/>,
    /// then returns a 200 OK response whose body is an empty JSON object.
    /// </summary>
    protected override async Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        // Task.Delay throws if the token is cancelled first, so a cancelled request never
        // reaches the response below.
        await Task.Delay(_hangDuration, cancellationToken);

        var okResponse = new HttpResponseMessage(HttpStatusCode.OK)
        {
            Content = new StringContent("{}"),
        };
        return okResponse;
    }
}
}

View File

@@ -23,6 +23,7 @@
<ItemGroup>
<ProjectReference Include="../../src/ScadaLink.ExternalSystemGateway/ScadaLink.ExternalSystemGateway.csproj" />
<ProjectReference Include="../../src/ScadaLink.Commons/ScadaLink.Commons.csproj" />
<ProjectReference Include="../../src/ScadaLink.StoreAndForward/ScadaLink.StoreAndForward.csproj" />
</ItemGroup>
</Project>