diff --git a/code-reviews/ExternalSystemGateway/findings.md b/code-reviews/ExternalSystemGateway/findings.md index 71744a1..cc7c050 100644 --- a/code-reviews/ExternalSystemGateway/findings.md +++ b/code-reviews/ExternalSystemGateway/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 13 | +| Open findings | 11 | ## Summary @@ -109,7 +109,7 @@ transient-retry paths. Fixed by the commit whose message references |--|--| | Severity | High | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:130`, `src/ScadaLink.ExternalSystemGateway/ServiceCollectionExtensions.cs:13` | **Description** @@ -142,7 +142,24 @@ is classified as transient. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit ``). `InvokeHttpAsync` now enforces a call +timeout: `ExternalSystemClient` takes an `IOptions` and +links a `CancellationTokenSource(DefaultHttpTimeout)` with the caller's token before +`SendAsync` and the response-body read, so the design's "timeout applies to the HTTP +request round-trip" guarantee now holds within the configured window (default 30s) +instead of `HttpClient`'s default 100s. A timeout is reclassified as a +`TransientExternalSystemException`; a caller-initiated cancellation is distinguished +from a timeout and propagated as `OperationCanceledException` rather than being +swallowed as transient. Regression tests: +`Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow` and +`Call_CallerCancellation_IsNotMisreportedAsTimeout`. + +Note (partial scope): the per-*system* `Timeout` field on `ExternalSystemDefinition` +remains unimplemented — adding it requires a change to `ScadaLink.Commons`, which is +outside this module's edit scope. Until that entity field exists, the configured +`DefaultHttpTimeout` is the effective per-call limit for every system. A follow-up +against the Commons module should add the `Timeout` field and have `InvokeHttpAsync` +prefer it over the default. This is a tracked follow-up, not a regression. ### ExternalSystemGateway-003 — `CachedCall` double-dispatches the HTTP request @@ -150,7 +167,7 @@ _Unresolved._ |--|--| | Severity | High | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:84-117` | **Description** @@ -179,7 +196,18 @@ the duplicated logic. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit ``). Re-triage: this finding was already fixed in +the codebase as a side effect of the `ExternalSystemGateway-001` fix and is no longer +reproducible against the current source. `StoreAndForwardService.EnqueueAsync` gained an +`attemptImmediateDelivery` parameter (recommendation approach (b)), and +`CachedCallAsync` passes `attemptImmediateDelivery: false` after its own first HTTP +attempt — so `EnqueueAsync` buffers the message for the background retry sweep without +re-invoking the registered delivery handler, eliminating the duplicate dispatch. A +dedicated regression test, `CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler`, +was added in this module's test suite: it registers a counting delivery handler, drives +a `CachedCall` whose HTTP attempt fails transiently, and asserts the handler is invoked +zero times during enqueue. The test was verified to fail if `attemptImmediateDelivery` +is flipped back to `true`. ### ExternalSystemGateway-004 — System retry settings are not honoured for cached calls/writes diff --git a/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs b/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs index d9c7dfd..2408927 100644 --- a/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs +++ b/src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs @@ -3,6 +3,7 @@ using System.Net.Http.Headers; using System.Text; using System.Text.Json; using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; using ScadaLink.Commons.Entities.ExternalSystems; using ScadaLink.Commons.Interfaces.Repositories; using ScadaLink.Commons.Interfaces.Services; @@ -22,17 +23,20 @@ public class ExternalSystemClient : IExternalSystemClient private readonly IExternalSystemRepository _repository; private readonly StoreAndForwardService? _storeAndForward; private readonly ILogger _logger; + private readonly ExternalSystemGatewayOptions _options; public ExternalSystemClient( IHttpClientFactory httpClientFactory, IExternalSystemRepository repository, ILogger logger, - StoreAndForwardService? storeAndForward = null) + StoreAndForwardService? storeAndForward = null, + IOptions? options = null) { _httpClientFactory = httpClientFactory; _repository = repository; _logger = logger; _storeAndForward = storeAndForward; + _options = options?.Value ?? new ExternalSystemGatewayOptions(); } /// @@ -198,22 +202,59 @@ public class ExternalSystemClient : IExternalSystemClient } } + // Enforce the per-call timeout. ExternalSystemDefinition has no per-system + // Timeout field yet, so the configured DefaultHttpTimeout is the effective + // round-trip limit (the design's "timeout applies to the HTTP request + // round-trip" guarantee). A linked CTS lets us distinguish a timeout from a + // caller-initiated cancellation: only the timeout is reclassified as transient. + using var timeoutCts = new CancellationTokenSource(_options.DefaultHttpTimeout); + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource( + cancellationToken, timeoutCts.Token); + HttpResponseMessage response; try { - response = await client.SendAsync(request, cancellationToken); + response = await client.SendAsync(request, linkedCts.Token); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // The caller asked to abandon the work — do not reclassify as transient. + throw; + } + catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested) + { + // Our own timeout elapsed — a transient failure per the design. + throw ErrorClassifier.AsTransient( + $"Timeout calling {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex); } catch (Exception ex) when (ErrorClassifier.IsTransient(ex)) { throw ErrorClassifier.AsTransient($"Connection error to {system.Name}: {ex.Message}", ex); } - if (response.IsSuccessStatusCode) + // The timeout also covers reading the response body (the design's + // "round-trip" guarantee), so the linked token is used for the read too. + string body; + try { - return await response.Content.ReadAsStringAsync(cancellationToken); + body = await response.Content.ReadAsStringAsync(linkedCts.Token); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested) + { + throw ErrorClassifier.AsTransient( + $"Timeout reading response from {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex); } - var errorBody = await response.Content.ReadAsStringAsync(cancellationToken); + if (response.IsSuccessStatusCode) + { + return body; + } + + var errorBody = body; if (ErrorClassifier.IsTransient(response.StatusCode)) { diff --git a/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs b/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs index 7735613..7d8f23b 100644 --- a/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs +++ b/tests/ScadaLink.ExternalSystemGateway.Tests/ExternalSystemClientTests.cs @@ -1,8 +1,10 @@ using System.Net; +using Microsoft.Data.Sqlite; using Microsoft.Extensions.Logging.Abstractions; using NSubstitute; using ScadaLink.Commons.Entities.ExternalSystems; using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.StoreAndForward; namespace ScadaLink.ExternalSystemGateway.Tests; @@ -216,6 +218,116 @@ public class ExternalSystemClientTests () => client.DeliverBufferedAsync(BufferedCall("TestAPI", "failMethod"))); } + // ── ExternalSystemGateway-003: CachedCall must not double-dispatch ── + + [Fact] + public async Task CachedCall_TransientFailure_DoesNotImmediatelyRedispatchViaRegisteredHandler() + { + var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 }; + var method = new ExternalSystemMethod("postData", "POST", "/post") { Id = 1, ExternalSystemDefinitionId = 1 }; + _repository.GetAllExternalSystemsAsync(Arg.Any()) + .Returns(new List { system }); + _repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any()) + .Returns(new List { method }); + + // The HTTP layer always fails transiently (500). + var httpClient = new HttpClient(new MockHttpMessageHandler(HttpStatusCode.InternalServerError, "boom")); + _httpClientFactory.CreateClient(Arg.Any()).Returns(httpClient); + + // A real S&F service with a registered delivery handler that counts invocations. + var dbName = $"EsgDoubleDispatch_{Guid.NewGuid():N}"; + var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared"; + using var keepAlive = new SqliteConnection(connStr); + keepAlive.Open(); + var storage = new StoreAndForwardStorage(connStr, NullLogger.Instance); + await storage.InitializeAsync(); + var sfOptions = new StoreAndForwardOptions + { + DefaultRetryInterval = TimeSpan.FromMinutes(10), + RetryTimerInterval = TimeSpan.FromMinutes(10), + }; + var sf = new StoreAndForwardService(storage, sfOptions, NullLogger.Instance); + + var handlerInvocations = 0; + sf.RegisterDeliveryHandler( + ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem, + _ => { Interlocked.Increment(ref handlerInvocations); return Task.FromResult(false); }); + + var client = new ExternalSystemClient( + _httpClientFactory, _repository, + NullLogger.Instance, + storeAndForward: sf); + + var result = await client.CachedCallAsync("TestAPI", "postData"); + + // The call already made one HTTP attempt; EnqueueAsync must NOT invoke the + // registered handler again synchronously (which would dispatch a 2nd request). + Assert.True(result.WasBuffered); + Assert.Equal(0, handlerInvocations); + } + + // ── ExternalSystemGateway-002: per-system call timeout ── + + [Fact] + public async Task Call_SlowSystem_TimesOutAsTransientErrorWithinConfiguredWindow() + { + var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 }; + var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 }; + _repository.GetAllExternalSystemsAsync(Arg.Any()) + .Returns(new List { system }); + _repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any()) + .Returns(new List { method }); + + // Handler that hangs far longer than the configured timeout and the test budget. + var httpClient = new HttpClient(new HangingHttpMessageHandler(TimeSpan.FromMinutes(10))); + _httpClientFactory.CreateClient(Arg.Any()).Returns(httpClient); + + // Configure a short timeout so the call must fail quickly. + var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMilliseconds(200) }; + var client = new ExternalSystemClient( + _httpClientFactory, _repository, + NullLogger.Instance, + options: Microsoft.Extensions.Options.Options.Create(options)); + + var sw = System.Diagnostics.Stopwatch.StartNew(); + var result = await client.CallAsync("TestAPI", "getData"); + sw.Stop(); + + Assert.False(result.Success); + Assert.Contains("Transient error", result.ErrorMessage); + Assert.Contains("Timeout", result.ErrorMessage); + // Must fail near the configured 200ms, well before HttpClient's default 100s. + Assert.True(sw.Elapsed < TimeSpan.FromSeconds(10), + $"Call took {sw.Elapsed}, expected to time out near the configured 200ms window"); + } + + [Fact] + public async Task Call_CallerCancellation_IsNotMisreportedAsTimeout() + { + var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 }; + var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 }; + _repository.GetAllExternalSystemsAsync(Arg.Any()) + .Returns(new List { system }); + _repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any()) + .Returns(new List { method }); + + var httpClient = new HttpClient(new HangingHttpMessageHandler(TimeSpan.FromMinutes(10))); + _httpClientFactory.CreateClient(Arg.Any()).Returns(httpClient); + + var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMinutes(5) }; + var client = new ExternalSystemClient( + _httpClientFactory, _repository, + NullLogger.Instance, + options: Microsoft.Extensions.Options.Options.Create(options)); + + using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(200)); + + // Caller-initiated cancellation must surface as OperationCanceledException, + // not be swallowed as a transient timeout error. + await Assert.ThrowsAnyAsync( + () => client.CallAsync("TestAPI", "getData", cancellationToken: cts.Token)); + } + /// /// Test helper: mock HTTP message handler. /// @@ -238,4 +350,20 @@ public class ExternalSystemClientTests }); } } + + /// + /// Test helper: an HTTP handler that hangs until cancelled (simulates a slow/hung system). + /// + private class HangingHttpMessageHandler : HttpMessageHandler + { + private readonly TimeSpan _delay; + + public HangingHttpMessageHandler(TimeSpan delay) => _delay = delay; + + protected override async Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + { + await Task.Delay(_delay, cancellationToken); + return new HttpResponseMessage(HttpStatusCode.OK) { Content = new StringContent("{}") }; + } + } } diff --git a/tests/ScadaLink.ExternalSystemGateway.Tests/ScadaLink.ExternalSystemGateway.Tests.csproj b/tests/ScadaLink.ExternalSystemGateway.Tests/ScadaLink.ExternalSystemGateway.Tests.csproj index d610a02..ce9ca80 100644 --- a/tests/ScadaLink.ExternalSystemGateway.Tests/ScadaLink.ExternalSystemGateway.Tests.csproj +++ b/tests/ScadaLink.ExternalSystemGateway.Tests/ScadaLink.ExternalSystemGateway.Tests.csproj @@ -23,6 +23,7 @@ +