fix(historian-gateway): guard recorder outbox-append failures + retry-success test + Sender capture + mux deregister

I-1: Wrap the OnValueChangedAsync AppendAsync in try/catch so a durable-boundary
failure (e.g. a PerEntry fsync hitting disk-full/I-O error) can no longer propagate
out of the handler and trip Akka supervision into a restart loop. A canceled append
during shutdown returns quietly; any other exception increments a new
_outboxAppendFailures counter, logs a Warning (exception type name only), and drops
the value without recording it or nudging the drain. The counter is surfaced on
RecorderStatus (new OutboxAppendFailures field).

I-2: Strengthen Writer_failure_keeps_entry_for_retry to prove the drain actually ran
— assert the writer was invoked (the fake records even on Succeed=false) AND the
outbox stayed at 1 (RemoveAsync not called), via AwaitAssertAsync.

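M-3: Capture Sender into a local before the await in the GetStatus handler, then Tell the
reply to that local, so the reply target no longer depends on the ambient actor context
still being available after the await.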

M-4: Add Retry_after_writer_failure_eventually_acks proving the retry -> success ->
ack path; FakeValueWriter gains a FailFirstN option + CallCount (Succeed behaviour
unchanged). Short minBackoff keeps it fast and deterministic (AwaitAssert, no sleep).

M-5: Deregister mux interest on PostStop via DependencyMuxActor.UnregisterInterest,
mirroring VirtualTagActor.PostStop. This closes the dead-letter window between the
recorder stopping and the mux handling its Terminated notification.

Claude-Session: https://claude.ai/code/session_012SDSQ3AcaXqPcBtDESBRii
This commit is contained in:
Joseph Doherty
2026-06-26 18:34:19 -04:00
parent 82124ee4f8
commit 97528c500f
2 changed files with 94 additions and 5 deletions
@@ -68,12 +68,15 @@ public sealed class ContinuousHistorizationRecorder : ReceiveActor, IWithTimers
/// <param name="TotalRecorded">Lifetime count of values appended to the outbox; incremented only
/// after the outbox append completes without throwing.</param>
/// <param name="DroppedNonNumeric">Lifetime count of values dropped for not being numeric-coercible.</param>
/// <param name="OutboxDropped">Lifetime count of entries the outbox dropped on capacity overflow
/// (reported from the outbox's own <c>DroppedCount</c>).</param>
/// <param name="OutboxAppendFailures">Lifetime count of durable-boundary append failures: the append
/// threw something other than a shutdown cancellation, so the value was dropped without being
/// recorded, and the actor stayed alive rather than restart-looping.</param>
/// <param name="LastDrainSucceeded">Whether the most recent drain pass acked cleanly. The backing
/// field starts as <c>true</c>.</param>
public sealed record RecorderStatus(
int QueuedDepth,
long TotalRecorded,
long DroppedNonNumeric,
long OutboxDropped,
long OutboxAppendFailures,
bool LastDrainSucceeded);
private readonly IActorRef _dependencyMux;
@@ -93,6 +96,7 @@ public sealed class ContinuousHistorizationRecorder : ReceiveActor, IWithTimers
private DateTime _nextAllowedDrainUtc = DateTime.MinValue;
private long _totalRecorded;
private long _droppedNonNumeric;
private long _outboxAppendFailures;
private bool _lastDrainSucceeded = true;
/// <summary>Gets or sets the timer scheduler (set by Akka via <see cref="IWithTimers"/>).</summary>
@@ -153,7 +157,14 @@ public sealed class ContinuousHistorizationRecorder : ReceiveActor, IWithTimers
ReceiveAsync<VirtualTagActor.DependencyValueChanged>(OnValueChangedAsync);
Receive<DrainTick>(_ => OnDrainTick());
Receive<DrainResult>(OnDrainResult);
ReceiveAsync<GetStatus>(async _ => Sender.Tell(await BuildStatusAsync().ConfigureAwait(false)));
ReceiveAsync<GetStatus>(async _ =>
{
// The reply target must be read BEFORE the await. The await below uses ConfigureAwait(false), so
// the continuation is presumably not guaranteed to resume with the actor context in place —
// NOTE(review): confirm against Akka.NET's async/await guidance. Reading Sender after the await
// is therefore unsafe; the captured local is not.
IActorRef replyTo = Sender;
RecorderStatus status = await BuildStatusAsync().ConfigureAwait(false);
replyTo.Tell(status);
});
}
/// <inheritdoc />
@@ -169,6 +180,9 @@ public sealed class ContinuousHistorizationRecorder : ReceiveActor, IWithTimers
/// <inheritdoc />
protected override void PostStop()
{
// Drop our mux interest eagerly (mirrors VirtualTagActor.PostStop) so the mux stops fanning to
// us immediately, closing the dead-letter window between this stop and the mux's Terminated.
_dependencyMux.Tell(new DependencyMuxActor.UnregisterInterest(Self));
_lifetime.Cancel();
_lifetime.Dispose();
base.PostStop();
@@ -200,7 +214,28 @@ public sealed class ContinuousHistorizationRecorder : ReceiveActor, IWithTimers
// Durable boundary: append (awaited so appends stay serialized) BEFORE the value is considered
// captured. The outbox drops the oldest entry on capacity overflow and tracks DroppedCount.
await _outbox.AppendAsync(entry, _lifetime.Token).ConfigureAwait(false);
try
{
await _outbox.AppendAsync(entry, _lifetime.Token).ConfigureAwait(false);
}
catch (OperationCanceledException) when (_lifetime.IsCancellationRequested)
{
// Normal shutdown raced the append — not a fault. Drop quietly.
return;
}
catch (Exception ex)
{
// A durable-boundary failure (e.g. a PerEntry fsync hitting disk-full / I/O error) must NEVER
// propagate out of the handler — that would trip Akka supervision into a restart, and under a
// persistent disk fault the actor would restart-loop (re-register → next value → append fails
// → restart → …). Mirror the drain path's catch-all: meter the failure (category only, no
// value content), drop this value, and stay alive. Do NOT record it or nudge the drain.
_outboxAppendFailures++;
_log.Warning("ContinuousHistorization: outbox append failed ({Exception}); value dropped.",
ex.GetType().Name);
return;
}
_totalRecorded++;
// Nudge a prompt drain attempt; the DrainTick handler de-dups (already draining) and honours
@@ -326,6 +361,7 @@ public sealed class ContinuousHistorizationRecorder : ReceiveActor, IWithTimers
_totalRecorded,
_droppedNonNumeric,
_outbox.DroppedCount,
_outboxAppendFailures,
_lastDrainSucceeded);
}