fix(transport): robust failure-audit when rollback throws + doc clarifications

Address one Blocker and three Important findings from code review of
2c34f12 (BundleImporter.ApplyAsync):

- BLOCKER: wrap RollbackAsync in nested try/catch so a rollback fault
  does not swallow the BundleImportFailed audit row. Dispose the
  failed transaction before the audit-write so the new SaveChangesAsync
  uses a fresh implicit transaction instead of enlisting in the broken
  one. Surface the rollback exception's message on the failure row
  alongside the original cause, and swallow audit-write faults per the
  design's best-effort-audit invariant. Add regression integration
  test using a SQLite transaction interceptor that throws on rollback.

- Document re-entrancy assumption on IAuditCorrelationContext: scoped
  lifetime, single circuit, concurrent imports within a shared scope
  must serialize externally.

- Document repository audit responsibility on BundleImporter: repos
  are thin EF wrappers; ApplyAsync writes audit rows explicitly. If
  repos ever start emitting audit rows, the explicit calls here must
  be removed to avoid double-logging.

- Document BundleSessionStore thread-safety: ConcurrentDictionary
  primitives are safe under concurrent callers; BundleSession itself
  is not thread-safe.
This commit is contained in:
Joseph Doherty
2026-05-24 05:06:04 -04:00
parent 2c34f12a6f
commit cda80cf821
4 changed files with 397 additions and 16 deletions

View File

@@ -4,6 +4,17 @@ namespace ScadaLink.Commons.Interfaces.Transport;
/// Scoped service the bundle importer sets to thread a BundleImportId through to
/// the audit log entries emitted by the audited repository methods invoked during
/// ApplyAsync. AuditService reads this and stamps every AuditLogEntry it writes.
/// <para>
/// Re-entrancy / thread-safety: mutating <see cref="BundleImportId"/> is NOT
/// thread-safe. The service is registered scoped, and the assumed usage is a
/// single Blazor Server circuit (or single API request) at a time — within that
/// scope <see cref="BundleImporter.ApplyAsync"/> is the sole writer, and the
/// audit service is the sole reader, in a strictly sequential await chain.
/// Callers that perform concurrent imports within a shared scope (e.g. two
/// <c>ApplyAsync</c> calls awaited via <c>Task.WhenAll</c> on the same circuit)
/// MUST serialize access externally — there is no internal lock and the last
/// writer wins, which would cross-contaminate audit rows between imports.
/// </para>
/// </summary>
public interface IAuditCorrelationContext
{

View File

@@ -25,6 +25,19 @@ namespace ScadaLink.Transport.Import;
/// resolutions through the audited repositories. Only LoadAsync is
/// implemented in this slice — the other two are wired into DI now so
/// follow-up tasks can fill them in without churning the constructor.
/// <para>
/// Audit-row responsibility: repository mutation methods in
/// <c>ScadaLink.ConfigurationDatabase.Repositories</c> are thin EF wrappers
/// and do NOT emit audit rows. <see cref="ApplyAsync"/> therefore writes
/// each per-entity audit row explicitly via <see cref="IAuditService.LogAsync"/>;
/// the scoped <see cref="IAuditCorrelationContext.BundleImportId"/> is
/// automatically stamped on each row by the audit service.
/// </para>
/// <para>
/// If repository methods are ever changed to emit audit rows themselves,
/// the explicit <c>LogAsync</c> calls in this class must be removed to
/// avoid double-logging.
/// </para>
/// </summary>
public sealed class BundleImporter : IBundleImporter
{
@@ -562,11 +575,41 @@ public sealed class BundleImporter : IBundleImporter
}
catch (Exception ex)
{
await tx.RollbackAsync(ct).ConfigureAwait(false);
// Rollback can itself throw (connection drop mid-rollback, provider
// bug, etc). If it does, we must STILL write the BundleImportFailed
// audit row — otherwise a rollback-failure path silently swallows
// the import's audit trail. Capture the rollback exception (if any)
// and surface it on the failure row alongside the original cause.
Exception? rollbackFailure = null;
try
{
await tx.RollbackAsync(ct).ConfigureAwait(false);
}
catch (Exception rbEx)
{
rollbackFailure = rbEx;
}
// If rollback threw the IDbContextTransaction is in an indeterminate
// state and still associated with the DbContext — a subsequent
// SaveChangesAsync would attempt to enlist in (or commit to) that
// broken transaction, and the failure-row would itself be rolled
// back when the transaction is finally disposed. Dispose it now so
// the audit-row write below uses a fresh implicit transaction. On
// the happy rollback path this early dispose is skipped — the using
// disposes the transaction on scope exit as usual.
if (rollbackFailure is not null)
{
try { await tx.DisposeAsync().ConfigureAwait(false); }
catch { /* dispose-after-throw must not mask the original cause */ }
}
// Clear the change tracker before writing the failure row — on the
// in-memory provider the rollback is a no-op and the staged adds
// would otherwise persist when the next SaveChangesAsync runs.
// would otherwise persist when the next SaveChangesAsync runs. This
// also matters when rollback threw: the change tracker is in an
// ambiguous state and we don't want the failure-write to sweep up
// any of the staged apply mutations.
_dbContext.ChangeTracker.Clear();
// Clear correlation FIRST so the failure row doesn't carry the now-
@@ -574,20 +617,32 @@ public sealed class BundleImporter : IBundleImporter
// exists at top level (no correlation) so audit consumers can see
// imports that aborted before any rows landed.
_correlationContext.BundleImportId = null;
await _auditService.LogAsync(
user: user,
action: "BundleImportFailed",
entityType: "Bundle",
entityId: bundleImportId.ToString(),
entityName: session.Manifest.SourceEnvironment,
afterState: new
{
BundleImportId = bundleImportId,
Reason = ex.Message,
ExceptionType = ex.GetType().FullName,
},
cancellationToken: ct).ConfigureAwait(false);
await _dbContext.SaveChangesAsync(ct).ConfigureAwait(false);
try
{
await _auditService.LogAsync(
user: user,
action: "BundleImportFailed",
entityType: "Bundle",
entityId: bundleImportId.ToString(),
entityName: session.Manifest.SourceEnvironment,
afterState: new
{
BundleImportId = bundleImportId,
Reason = ex.Message,
ExceptionType = ex.GetType().FullName,
RollbackException = rollbackFailure?.Message,
},
cancellationToken: ct).ConfigureAwait(false);
await _dbContext.SaveChangesAsync(ct).ConfigureAwait(false);
}
catch
{
// Audit-write is best-effort per design §10 ("Audit-write failure
// NEVER aborts the user-facing action — audit is best-effort, the
// action's own success/failure path is authoritative"). Swallow
// any failure here so the original exception below propagates
// unchanged rather than being masked by an audit-layer fault.
}
throw;
}
finally

View File

@@ -11,6 +11,17 @@ namespace ScadaLink.Transport.Import;
/// at read time (<see cref="Get"/>) and on-demand via <see cref="EvictExpired"/>;
/// there is no background timer.
/// <para>
/// Thread-safety: backed by <see cref="ConcurrentDictionary{TKey,TValue}"/> of
/// <see cref="Guid"/> to <see cref="BundleSession"/>. All store operations
/// (<see cref="Get"/> / <see cref="Open"/> / <see cref="Remove"/> /
/// <see cref="EvictExpired"/>) use the concurrent dictionary's safe primitives
/// (<c>TryGetValue</c>, indexer assignment, <c>TryRemove</c>) and are safe
/// under concurrent callers. The <see cref="BundleSession"/> instance itself
/// is NOT thread-safe — callers that share a session reference (e.g. two
/// importers mutating <c>FailedUnlockAttempts</c> on the same session) MUST
/// serialize their mutations on that shared reference.
/// </para>
/// <para>
/// TTL is supplied by the importer via <see cref="BundleSession.ExpiresAt"/>;
/// this store does not impose its own. The injected <see cref="TimeProvider"/>
/// is used purely to determine <c>now</c> when checking <c>ExpiresAt</c>, which

View File

@@ -0,0 +1,304 @@
using System.Data.Common;
using Microsoft.AspNetCore.DataProtection;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Diagnostics;
using Microsoft.EntityFrameworkCore.Storage;
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using ScadaLink.Commons.Entities.Deployment;
using ScadaLink.Commons.Entities.Scripts;
using ScadaLink.Commons.Entities.Templates;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.Commons.Interfaces.Transport;
using ScadaLink.Commons.Types.Transport;
using ScadaLink.ConfigurationDatabase;
using ScadaLink.ConfigurationDatabase.Repositories;
using ScadaLink.ConfigurationDatabase.Services;
using ScadaLink.Transport;
using ScadaLink.Transport.Import;
namespace ScadaLink.Transport.IntegrationTests.Import;
/// <summary>
/// Covers the catch-path invariant in <see cref="BundleImporter.ApplyAsync"/>:
/// even when the EF <c>RollbackAsync</c> itself throws (connection drop mid-
/// rollback, provider bug, etc.) the <c>BundleImportFailed</c> audit row MUST
/// still land, and the ORIGINAL exception (not the rollback failure) MUST
/// propagate to the caller.
/// <para>
/// Uses SQLite rather than the in-memory provider because the in-memory
/// provider's transaction is a no-op — its <c>RollbackAsync</c> never invokes
/// the interceptor, so the throw-on-rollback path can't be exercised. SQLite
/// :memory: is keyed per-connection, so the fixture pins a single open
/// connection across the whole test.
/// </para>
/// <para>
/// The interceptor is wired to throw on <see cref="IDbTransactionInterceptor.TransactionRollingBack"/>
/// and the async equivalent — this is the hook EF invokes synchronously inside
/// <c>IDbContextTransaction.RollbackAsync</c>, so a throw there surfaces as
/// the <c>RollbackAsync</c> call itself throwing, which is exactly the
/// scenario the catch block must survive.
/// </para>
/// </summary>
public sealed class BundleImporterRollbackFailureTests : IDisposable
{
    private readonly ServiceProvider _provider;
    private readonly DbConnection _sharedConnection;
    private readonly ThrowingRollbackInterceptor _interceptor = new();

    /// <summary>
    /// Builds the DI container over a pinned SQLite :memory: connection, with
    /// the (initially disarmed) rollback interceptor attached, and creates the
    /// schema once on that connection.
    /// </summary>
    public BundleImporterRollbackFailureTests()
    {
        var services = new ServiceCollection();
        services.AddSingleton<IConfiguration>(
            new ConfigurationBuilder().AddInMemoryCollection().Build());

        // Pin a single SQLite :memory: connection for the lifetime of the
        // fixture — :memory: is per-connection so the schema would otherwise
        // vanish between DbContext instances.
        _sharedConnection = new Microsoft.Data.Sqlite.SqliteConnection("DataSource=:memory:");
        _sharedConnection.Open();

        services.AddSingleton<IDataProtectionProvider>(new EphemeralDataProtectionProvider());

        // Register options once under the BASE DbContextOptions key, then
        // register the subclass as the scoped service used by repositories +
        // AuditService + BundleImporter. The subclass's ctor accepts
        // DbContextOptions<ScadaLinkDbContext> (the base type's options) so the
        // single options registration serves both. This avoids the multi-options
        // pitfall of AddDbContext<TBase, TImpl> which keys options on TImpl.
        services.AddSingleton(sp =>
        {
            var builder = new DbContextOptionsBuilder<ScadaLinkDbContext>();
            builder.UseSqlite(_sharedConnection);
            builder.ConfigureWarnings(w => w.Ignore(RelationalEventId.PendingModelChangesWarning));
            builder.AddInterceptors(_interceptor);
            return builder.Options;
        });
        services.AddScoped<ScadaLinkDbContext>(sp => new SqliteCompatibleScadaLinkDbContext(
            sp.GetRequiredService<DbContextOptions<ScadaLinkDbContext>>(),
            sp.GetRequiredService<IDataProtectionProvider>()));

        services.AddScoped<ITemplateEngineRepository, TemplateEngineRepository>();
        services.AddScoped<IExternalSystemRepository, ExternalSystemRepository>();
        services.AddScoped<INotificationRepository, NotificationRepository>();
        services.AddScoped<IInboundApiRepository, InboundApiRepository>();
        services.AddScoped<IAuditCorrelationContext, AuditCorrelationContext>();
        services.AddScoped<IAuditService, AuditService>();
        services.AddTransport();

        _provider = services.BuildServiceProvider();

        // Build schema once on the shared connection.
        using var scope = _provider.CreateScope();
        var ctx = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
        ctx.Database.EnsureCreated();
    }

    /// <summary>
    /// Tears down the container first, then the pinned connection it was
    /// configured against.
    /// </summary>
    public void Dispose()
    {
        _provider.Dispose();
        _sharedConnection.Dispose();
    }

    [Fact]
    public async Task ApplyAsync_writes_BundleImportFailed_even_when_RollbackAsync_throws()
    {
        // Arrange: seed a template whose script body references MissingHelper()
        // so semantic validation will reject the apply (same broken-bundle shape
        // as BundleImporterApplyTests.ApplyAsync_rolls_back_all_changes_…). Then
        // arm the interceptor to throw on rollback so the catch path has to
        // survive a rollback failure.
        await using (var scope = _provider.CreateAsyncScope())
        {
            var ctx = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
            var t = new Template("BrokenPump") { Description = "broken" };
            t.Scripts.Add(new TemplateScript("init", "var x = MissingHelper();"));
            ctx.Templates.Add(t);
            await ctx.SaveChangesAsync();
        }

        var sessionId = await ExportAndLoadAsync();
        await WipeContentAsync();
        _interceptor.ThrowOnRollback = true;

        // Act: ApplyAsync must propagate the ORIGINAL exception
        // (SemanticValidationException) — NOT the InvalidOperationException
        // that the interceptor raises from inside RollbackAsync.
        // Assert.ThrowsAsync fails the test itself when the expected type is
        // not thrown, so no separate null check on the result is needed.
        await using (var scope = _provider.CreateAsyncScope())
        {
            var importer = scope.ServiceProvider.GetRequiredService<IBundleImporter>();
            await Assert.ThrowsAsync<SemanticValidationException>(() =>
                importer.ApplyAsync(sessionId,
                    new List<ImportResolution> { new("Template", "BrokenPump", ResolutionAction.Add, null) },
                    user: "bob"));
        }

        // Assert: even with a rollback failure, the BundleImportFailed audit row
        // must have landed — that's the whole point of the fix. The row should
        // also carry the rollback failure's message in its AfterStateJson so
        // post-mortem readers can see both faults.
        _interceptor.ThrowOnRollback = false; // let post-condition reads roll back cleanly
        await using (var scope = _provider.CreateAsyncScope())
        {
            var ctx = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
            var failed = await ctx.AuditLogEntries
                .SingleOrDefaultAsync(a => a.Action == "BundleImportFailed");
            Assert.NotNull(failed);
            Assert.Equal("Bundle", failed!.EntityType);

            // Correlation MUST be null on the failure row — the rolled-back
            // BundleImportId is intentionally disowned (same contract as the
            // happy-path rollback test in BundleImporterApplyTests).
            Assert.Null(failed.BundleImportId);
            Assert.NotNull(failed.AfterStateJson);

            // The rollback exception message must be surfaced in the failure
            // row so operators can see both the cause and the rollback fault.
            Assert.Contains(
                ThrowingRollbackInterceptor.RollbackErrorMarker,
                failed.AfterStateJson!,
                StringComparison.Ordinal);
        }
    }

    // ---- helpers (copies of the patterns from BundleImporterApplyTests) ----

    /// <summary>
    /// Exports every template currently in the database into a bundle with no
    /// passphrase, buffers the bundle in memory, and loads it back through
    /// <see cref="IBundleImporter"/>.
    /// </summary>
    /// <returns>The session id of the loaded bundle.</returns>
    private async Task<Guid> ExportAndLoadAsync()
    {
        Stream bundleStream;
        await using (var scope = _provider.CreateAsyncScope())
        {
            var exporter = scope.ServiceProvider.GetRequiredService<IBundleExporter>();
            var ctx = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
            var templateIds = await ctx.Templates.Select(t => t.Id).ToListAsync();
            var selection = new ExportSelection(
                TemplateIds: templateIds,
                SharedScriptIds: Array.Empty<int>(),
                ExternalSystemIds: Array.Empty<int>(),
                DatabaseConnectionIds: Array.Empty<int>(),
                NotificationListIds: Array.Empty<int>(),
                SmtpConfigurationIds: Array.Empty<int>(),
                ApiKeyIds: Array.Empty<int>(),
                ApiMethodIds: Array.Empty<int>(),
                IncludeDependencies: false);
            bundleStream = await exporter.ExportAsync(selection, user: "alice", sourceEnvironment: "dev",
                passphrase: null, cancellationToken: CancellationToken.None);
        }

        using var buffer = new MemoryStream();

        // The exporter hands ownership of the stream to the caller; dispose it
        // as soon as its bytes have been copied so it is not leaked.
        await using (bundleStream)
        {
            await bundleStream.CopyToAsync(buffer);
        }
        buffer.Position = 0;

        await using var loadScope = _provider.CreateAsyncScope();
        var importer = loadScope.ServiceProvider.GetRequiredService<IBundleImporter>();
        var session = await importer.LoadAsync(buffer, passphrase: null);
        return session.SessionId;
    }

    /// <summary>
    /// Deletes all templates, shared scripts and template folders so the
    /// subsequent import starts from an empty target.
    /// </summary>
    private async Task WipeContentAsync()
    {
        await using var scope = _provider.CreateAsyncScope();
        var ctx = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
        ctx.Templates.RemoveRange(ctx.Templates);
        ctx.SharedScripts.RemoveRange(ctx.SharedScripts);
        ctx.TemplateFolders.RemoveRange(ctx.TemplateFolders);
        await ctx.SaveChangesAsync();
    }

    /// <summary>
    /// EF transaction interceptor that throws on rollback when armed. Used by
    /// <see cref="ApplyAsync_writes_BundleImportFailed_even_when_RollbackAsync_throws"/>
    /// to simulate the connection-dropped-during-rollback scenario. EF calls
    /// the async hook from inside <c>IDbContextTransaction.RollbackAsync</c>,
    /// so a throw here surfaces as <c>RollbackAsync</c> itself throwing —
    /// exactly the contract the catch block must survive.
    /// </summary>
    private sealed class ThrowingRollbackInterceptor : DbTransactionInterceptor
    {
        /// <summary>Message of the simulated fault; the test searches the audit row for it.</summary>
        public const string RollbackErrorMarker = "simulated rollback failure";

        /// <summary>
        /// When <c>true</c>, both rollback hooks throw; when <c>false</c> they
        /// defer to the base interceptor.
        /// </summary>
        public bool ThrowOnRollback { get; set; }

        /// <inheritdoc />
        public override ValueTask<InterceptionResult> TransactionRollingBackAsync(
            DbTransaction transaction,
            TransactionEventData eventData,
            InterceptionResult result,
            CancellationToken cancellationToken = default)
        {
            if (ThrowOnRollback)
            {
                throw new InvalidOperationException(RollbackErrorMarker);
            }

            return base.TransactionRollingBackAsync(transaction, eventData, result, cancellationToken);
        }

        /// <inheritdoc />
        public override InterceptionResult TransactionRollingBack(
            DbTransaction transaction,
            TransactionEventData eventData,
            InterceptionResult result)
        {
            if (ThrowOnRollback)
            {
                throw new InvalidOperationException(RollbackErrorMarker);
            }

            return base.TransactionRollingBack(transaction, eventData, result);
        }
    }
}
/// <summary>
/// SQLite-compatible variant of <see cref="ScadaLinkDbContext"/> used by
/// <see cref="BundleImporterRollbackFailureTests"/>. Mirrors the adaptations in
/// <c>SqliteTestDbContext</c> over in ScadaLink.ConfigurationDatabase.Tests
/// (rowversion is nullable, DateTimeOffset stored as ISO 8601 text) but is
/// duplicated here to avoid taking a project reference to that test project.
/// </summary>
internal sealed class SqliteCompatibleScadaLinkDbContext : ScadaLinkDbContext
{
    /// <summary>
    /// Forwards the base-typed options and the data-protection provider to
    /// <see cref="ScadaLinkDbContext"/> unchanged.
    /// </summary>
    public SqliteCompatibleScadaLinkDbContext(
        DbContextOptions<ScadaLinkDbContext> options,
        IDataProtectionProvider dataProtectionProvider)
        : base(options, dataProtectionProvider)
    {
    }

    /// <summary>
    /// Applies the base model, then relaxes <c>DeploymentRecord.RowVersion</c>
    /// and maps every <see cref="DateTimeOffset"/> property to a TEXT column
    /// holding the round-trip ("o") representation of its UTC value.
    /// </summary>
    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        base.OnModelCreating(modelBuilder);

        // RowVersion: optional, not a concurrency token, never store-generated.
        modelBuilder.Entity<DeploymentRecord>(builder =>
        {
            builder.Property(d => d.RowVersion)
                .IsRequired(false)
                .IsConcurrencyToken(false)
                .ValueGeneratedNever();
        });

        // Pin the invariant culture on both directions of the conversion so
        // the stored text and its parse do not depend on the test host's
        // current culture (DateTimeOffset.Parse is culture-sensitive).
        var converter = new ValueConverter<DateTimeOffset, string>(
            v => v.UtcDateTime.ToString("o", System.Globalization.CultureInfo.InvariantCulture),
            v => DateTimeOffset.Parse(v, System.Globalization.CultureInfo.InvariantCulture));
        var nullableConverter = new ValueConverter<DateTimeOffset?, string?>(
            v => v.HasValue
                ? v.Value.UtcDateTime.ToString("o", System.Globalization.CultureInfo.InvariantCulture)
                : null,
            v => v != null
                ? DateTimeOffset.Parse(v, System.Globalization.CultureInfo.InvariantCulture)
                : null);

        foreach (var entityType in modelBuilder.Model.GetEntityTypes())
        {
            foreach (var property in entityType.GetProperties())
            {
                if (property.ClrType == typeof(DateTimeOffset))
                {
                    property.SetValueConverter(converter);
                    property.SetColumnType("TEXT");
                }
                else if (property.ClrType == typeof(DateTimeOffset?))
                {
                    property.SetValueConverter(nullableConverter);
                    property.SetColumnType("TEXT");
                }
            }
        }
    }
}