fix(historian-gateway): cancellation-safe alarm writer + dispose-safe outbox + provisioner polish + outbox tests

I-1: GatewayAlarmHistorianWriter no longer dead-letters events cancelled
mid-drain at shutdown. WriteBatchAsync short-circuits remaining events to
RetryPlease once cancellation is requested, and SendOneAsync catches
OperationCanceledException (when the token is cancelled) -> RetryPlease,
so in-flight events stay queued instead of being permanently dropped.

I-2: FasterLogHistorizationOutbox.Dispose now guards the awaited periodic
loop with a broad catch (Exception) after the OperationCanceledException
catch, so a non-Faster teardown fault (e.g. ObjectDisposedException) can
never escape Dispose.

M-1: GatewayTagProvisioner skips the empty EnsureTags round-trip when every
request is non-historizable (early return).

M-2: GatewayTagProvisioner handles plain shutdown cancellation quietly
(Debug, not Warning), counting the unsent batch as Failed, never throwing.

M-3/M-4: Added durability tests for remove-last-entry (TailAddress
truncation branch) and FIFO implicit-ack (RemoveAsync acks up to and
including the target); both reopen the outbox and verify the state survives.

M-5: Clarifying comment in RecoverState on the transient over-capacity
rebuild after a crash between append-commit and drop-truncation-commit.

Claude-Session: https://claude.ai/code/session_012SDSQ3AcaXqPcBtDESBRii
This commit is contained in:
Joseph Doherty
2026-06-26 17:47:20 -04:00
parent 0be79219fc
commit 22711444cc
6 changed files with 127 additions and 1 deletions
@@ -174,6 +174,23 @@ public sealed class GatewayAlarmHistorianWriterTests
Assert.Equal(HistorianWriteOutcome.PermanentFail, outcomes[0]);
}
[Fact]
public async Task Cancellation_mid_drain_is_RetryPlease_not_PermanentFail()
{
    // Scenario: the token is already cancelled when the batch is handed to the writer.
    // Expected: one RetryPlease outcome per event (nothing is dead-lettered), no exception
    // from WriteBatchAsync, and the gateway send is never attempted.

    // Arrange — the fake is primed to throw OperationCanceledException should a send ever
    // be attempted; the call-count assertion at the end confirms it is not.
    using var shutdown = new CancellationTokenSource();
    await shutdown.CancelAsync();
    var gateway = new FakeHistorianGatewayClient { SendEventThrows = new OperationCanceledException() };
    var writer = Writer(gateway);
    var batch = new[] { Evt("A"), Evt("B") };

    // Act
    var results = await writer.WriteBatchAsync(batch, shutdown.Token);

    // Assert
    Assert.Equal(2, results.Count);
    Assert.All(results, outcome => Assert.Equal(HistorianWriteOutcome.RetryPlease, outcome));
    Assert.Equal(0, gateway.SendEventCallCount);
}
[Fact]
public async Task Empty_batch_returns_empty()
{
@@ -64,11 +64,31 @@ public sealed class GatewayTagProvisionerTests
new[] { new HistorianTagProvisionRequest("Pump1.Name", DriverDataType.String, null, null) },
TestContext.Current.CancellationToken);
Assert.Empty(fake.LastEnsureDefinitions!); // String is deferred → never built into a definition
// String is deferred → never built into a definition, so the empty batch skips the gateway
// round-trip entirely (the call is never made).
Assert.Equal(0, fake.EnsureTagsCallCount);
Assert.Equal(1, result.Requested);
Assert.Equal(1, result.Skipped);
}
[Fact]
public async Task Cancellation_is_quiet_and_not_misreported()
{
    // Scenario: provisioning is invoked with an already-cancelled token and the gateway
    // fake throws OperationCanceledException from EnsureTags.
    // Expected: the call completes without throwing and the single request is reported as
    // Failed (it never landed), with nothing reported as Ensured.
    // NOTE(review): the log level used for this path is not asserted by this test.

    // Arrange
    using var shutdown = new CancellationTokenSource();
    await shutdown.CancelAsync();
    var gateway = new FakeHistorianGatewayClient { EnsureTagsThrows = new OperationCanceledException() };
    var provisioner = Provisioner(gateway);
    var requests = new[] { new HistorianTagProvisionRequest("Pump1.Temp", DriverDataType.Float32, null, null) };

    // Act
    var outcome = await provisioner.EnsureTagsAsync(requests, shutdown.Token);

    // Assert — counted, not thrown
    Assert.Equal(1, outcome.Failed);
    Assert.Equal(0, outcome.Ensured);
}
[Fact]
public async Task Gateway_failure_is_swallowed_and_counted_not_thrown()
{
@@ -59,6 +59,54 @@ public sealed class FasterLogHistorizationOutboxTests : IDisposable
Assert.Equal(keep, batch[0].Id);
}
[Fact]
public async Task Remove_last_entry_empties_and_survives_restart()
{
    // Scenario: append a single entry, ack it, then reopen the outbox from the same directory.
    // Expected: the outbox is empty both before and after the reopen — an acked entry must
    // not reappear and be drained a second time.
    // NOTE(review): intended to cover the "no next node => truncate to TailAddress" branch of
    // RemoveAsync; that implementation is not visible from this test.
    var ct = TestContext.Current.CancellationToken;
    var directory = NewTempDir();
    var only = E("A", 1);

    // First lifetime: write one entry, remove it, and confirm the live instance is empty.
    using (var first = new FasterLogHistorizationOutbox(directory, HistorizationCommitMode.PerEntry))
    {
        await first.AppendAsync(only, ct);
        await first.RemoveAsync(only.Id, ct);
        Assert.Equal(0, await first.CountAsync(ct));
    }

    // Second lifetime: the empty state must have been persisted.
    using var second = new FasterLogHistorizationOutbox(directory, HistorizationCommitMode.PerEntry);
    Assert.Equal(0, await second.CountAsync(ct));
    Assert.Empty(await second.PeekBatchAsync(10, ct));
}
[Fact]
public async Task Remove_acks_fifo_up_to_and_including_target_and_survives_restart()
{
    // Scenario: append A, B, C in order, then remove B.
    // Expected: removing B also drops the older A, so only C remains — and that same
    // single-entry state is what a freshly reopened outbox reports.
    var ct = TestContext.Current.CancellationToken;
    var directory = NewTempDir();
    var oldest = E("A", 1);
    var target = E("B", 2);
    var newest = E("C", 3);
    var expectedIds = new[] { newest.Id };

    // First lifetime: fill, ack the middle entry, verify only the newest is left.
    using (var first = new FasterLogHistorizationOutbox(directory, HistorizationCommitMode.PerEntry))
    {
        await first.AppendAsync(oldest, ct);
        await first.AppendAsync(target, ct);
        await first.AppendAsync(newest, ct);

        await first.RemoveAsync(target.Id, ct);

        Assert.Equal(1, await first.CountAsync(ct));
        var live = await first.PeekBatchAsync(10, ct);
        Assert.Equal(expectedIds, live.Select(entry => entry.Id));
    }

    // Second lifetime: the truncation must have been persisted.
    using var second = new FasterLogHistorizationOutbox(directory, HistorizationCommitMode.PerEntry);
    Assert.Equal(1, await second.CountAsync(ct));
    var recovered = await second.PeekBatchAsync(10, ct);
    Assert.Equal(expectedIds, recovered.Select(entry => entry.Id));
}
[Fact]
public async Task Capacity_full_drops_oldest_and_counts()
{