feat: add SequenceSet for sparse deletion tracking with secure erase (Gap 1.7)

Replace HashSet<ulong> _deleted in MsgBlock with SequenceSet — a sorted-range
list that compresses contiguous deletions into (Start, End) intervals. Adds
O(log n) Contains/Add via binary search on range count, matching Go's avl.SequenceSet
semantics with a simpler implementation.

- Add SequenceSet.cs: sorted-range compressed set with Add/Remove/Contains/Count/Clear
  and IEnumerable<ulong> in ascending order. Binary search for all O(log n) ops.
- Replace HashSet<ulong> _deleted and _skipSequences in MsgBlock with SequenceSet.
- Add secureErase parameter (default false) to MsgBlock.Delete(): when true, payload
  bytes are overwritten with RandomNumberGenerator.Fill() before the delete record is
  written, making original content unrecoverable on disk.
- Update FileStore.DeleteInBlock() to propagate secureErase flag.
- Update FileStore.EraseMsg() to use secureErase: true via block layer instead of
  delegating to RemoveMsg().
- Add SequenceSetTests.cs: 25 tests covering Add, Remove, Contains, Count, range
  compression, gap filling, bridge merges, enumeration, boundary values, round-trip.
- Add FileStoreTombstoneTrackingTests.cs: 12 tests covering SequenceSet tracking in
  MsgBlock, tombstone persistence through RebuildIndex recovery, secure erase
  payload overwrite verification, and FileStore.EraseMsg integration.

Go reference: filestore.go:5267 (removeMsg), filestore.go:5890 (eraseMsg),
              avl/seqset.go (SequenceSet).
This commit is contained in:
Joseph Doherty
2026-02-25 08:02:44 -05:00
parent 646a5eb2ae
commit cbe41d0efb
5 changed files with 1045 additions and 14 deletions

View File

@@ -732,14 +732,17 @@ public sealed class FileStore : IStreamStore, IAsyncDisposable, IDisposable
/// <summary>
/// Soft-deletes a message in the block that contains it.
/// When <paramref name="secureErase"/> is <c>true</c>, payload bytes are
/// overwritten with random data before the delete record is written.
/// Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
/// </summary>
private void DeleteInBlock(ulong sequence)
private void DeleteInBlock(ulong sequence, bool secureErase = false)
{
foreach (var block in _blocks)
{
if (sequence >= block.FirstSequence && sequence <= block.LastSequence)
{
block.Delete(sequence);
block.Delete(sequence, secureErase);
return;
}
}
@@ -1386,15 +1389,25 @@ public sealed class FileStore : IStreamStore, IAsyncDisposable, IDisposable
}
/// <summary>
/// Overwrites a message with zeros and then soft-deletes it.
/// Secure-erases a message: overwrites its payload bytes with random data on disk,
/// then soft-deletes it (same in-memory semantics as <see cref="RemoveMsg"/>).
/// Returns <c>true</c> if the sequence existed and was erased.
/// Reference: golang/nats-server/server/filestore.go — EraseMsg.
/// Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
/// </summary>
public bool EraseMsg(ulong seq)
{
// In .NET we don't do physical overwrite — just remove from the in-memory
// cache and soft-delete in the block layer (same semantics as RemoveMsg).
return RemoveMsg(seq);
if (!_messages.Remove(seq, out _))
return false;
if (_messages.Count == 0)
_first = _last + 1;
else
_first = _messages.Keys.Min();
// Secure erase: overwrite payload bytes with random data before marking deleted.
// Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
DeleteInBlock(seq, secureErase: true);
return true;
}
/// <summary>

View File

@@ -10,6 +10,7 @@
// sequentially as binary records (using MessageRecord). Blocks are sealed
// (read-only) when they reach a configurable size limit.
using System.Security.Cryptography;
using Microsoft.Win32.SafeHandles;
namespace NATS.Server.JetStream.Storage;
@@ -25,10 +26,14 @@ public sealed class MsgBlock : IDisposable
private readonly FileStream _file;
private readonly SafeFileHandle _handle;
private readonly Dictionary<ulong, (long Offset, int Length)> _index = new();
private readonly HashSet<ulong> _deleted = new();
// Go: msgBlock.dmap — avl.SequenceSet for sparse deletion tracking.
// Reference: golang/nats-server/server/avl/seqset.go (SequenceSet).
// .NET uses a sorted-range list (see SequenceSet.cs) for O(log n) ops with
// range compression for contiguous deletion runs (TTL, bulk remove).
private readonly SequenceSet _deleted = new();
// Go: SkipMsg writes tombstone records with empty subject — tracked separately so
// recovery can distinguish intentional sequence gaps from soft-deleted messages.
private readonly HashSet<ulong> _skipSequences = new();
private readonly SequenceSet _skipSequences = new();
private readonly long _maxBytes;
private readonly ReaderWriterLockSlim _lock = new();
private long _writeOffset; // Tracks the append position independently of FileStream.Position
@@ -95,7 +100,7 @@ public sealed class MsgBlock : IDisposable
}
}
/// <summary>Count of soft-deleted messages.</summary>
/// <summary>Count of soft-deleted messages. Mirrors Go's msgBlock.dmap.Size().</summary>
public ulong DeletedCount
{
get
@@ -357,10 +362,18 @@ public sealed class MsgBlock : IDisposable
/// Soft-deletes a message by sequence number. Re-encodes the record on disk
/// with the deleted flag set (and updated checksum) so the deletion survives recovery.
/// Also evicts the sequence from the write cache.
/// When <paramref name="secureErase"/> is <c>true</c>, the payload bytes inside
/// the encoded record are overwritten with cryptographically random data before
/// the record is re-written — ensuring the original payload is unrecoverable.
/// Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
/// </summary>
/// <param name="sequence">The sequence number to delete.</param>
/// <param name="secureErase">
/// When <c>true</c>, payload bytes are filled with random data before the
/// record is written back. Defaults to <c>false</c>.
/// </param>
/// <returns>True if the message was deleted; false if already deleted or not found.</returns>
public bool Delete(ulong sequence)
public bool Delete(ulong sequence, bool secureErase = false)
{
_lock.EnterWriteLock();
try
@@ -377,12 +390,23 @@ public sealed class MsgBlock : IDisposable
RandomAccess.Read(_handle, buffer, entry.Offset);
var record = MessageRecord.Decode(buffer);
ReadOnlyMemory<byte> payload = record.Payload;
if (secureErase && payload.Length > 0)
{
// Go: eraseMsg — overwrite payload region with random bytes so the
// original content is unrecoverable from disk.
// Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
var randomPayload = new byte[payload.Length];
RandomNumberGenerator.Fill(randomPayload);
payload = randomPayload;
}
var deletedRecord = new MessageRecord
{
Sequence = record.Sequence,
Subject = record.Subject,
Headers = record.Headers,
Payload = record.Payload,
Payload = payload,
Timestamp = record.Timestamp,
Deleted = true,
};
@@ -498,13 +522,24 @@ public sealed class MsgBlock : IDisposable
get
{
_lock.EnterReadLock();
try { return _skipSequences.Count > 0 ? _skipSequences.Max() : 0UL; }
try
{
if (_skipSequences.IsEmpty)
return 0UL;
// SequenceSet enumerates in ascending order; last element is max.
ulong max = 0;
foreach (var seq in _skipSequences)
max = seq; // last wins since iteration is ascending
return max;
}
finally { _lock.ExitReadLock(); }
}
}
/// <summary>
/// Exposes the set of soft-deleted sequence numbers for read-only inspection.
/// Returns a snapshot as a <see cref="HashSet{T}"/> so callers can use
/// standard <see cref="IReadOnlySet{T}"/> operations.
/// Reference: golang/nats-server/server/filestore.go — dmap access for state queries.
/// </summary>
public IReadOnlySet<ulong> DeletedSequences
@@ -512,7 +547,7 @@ public sealed class MsgBlock : IDisposable
get
{
_lock.EnterReadLock();
try { return new HashSet<ulong>(_deleted); }
try { return _deleted.ToHashSet(); }
finally { _lock.ExitReadLock(); }
}
}

View File

@@ -0,0 +1,230 @@
// Reference: golang/nats-server/server/avl/seqset.go
// Go uses an AVL tree with bitmask nodes (2048 sequences per node).
// .NET port uses a sorted list of (Start, End) ranges — simpler, still O(log n)
// via binary search for the common case of mostly-sequential deletions in JetStream.
//
// Range compression: adding sequences 1, 2, 3 stores as a single range [1, 3].
// This is significantly more memory efficient than HashSet<ulong> for contiguous
// deletion runs (e.g. TTL expiry, bulk removes), which are the dominant pattern.
//
// Not thread-safe — callers must hold their own lock (MsgBlock uses ReaderWriterLockSlim).
namespace NATS.Server.JetStream.Storage;
/// <summary>
/// A memory-efficient sparse set for storing unsigned sequence numbers, using
/// range compression to merge contiguous sequences into inclusive (Start, End) intervals.
///
/// Analogous to Go's <c>avl.SequenceSet</c> but implemented with a sorted list
/// of ranges for simplicity. Lookups are a binary search over the stored ranges,
/// i.e. O(log R) where R is the number of distinct ranges (not the number of
/// sequences). <see cref="Add"/> and <see cref="Remove"/> use the same search but
/// may insert into or remove from the backing list, which shifts elements (O(R)).
///
/// Not thread-safe: the class performs no synchronization of its own.
///
/// Reference: golang/nats-server/server/avl/seqset.go (SequenceSet struct).
/// </summary>
internal sealed class SequenceSet : IEnumerable<ulong>
{
    // Sorted list of non-overlapping, non-adjacent ranges in ascending order.
    // Invariant: for all i, ranges[i].End + 1 < ranges[i+1].Start (strict gap between consecutive ranges).
    private readonly List<(ulong Start, ulong End)> _ranges = new();

    /// <summary>
    /// Total number of sequences across all ranges, computed by summing range lengths.
    /// The value saturates at <see cref="int.MaxValue"/> instead of wrapping when the
    /// set holds more sequences than an <see cref="int"/> can represent.
    /// </summary>
    public int Count
    {
        get
        {
            const ulong limit = int.MaxValue;
            ulong total = 0;
            foreach (var (start, end) in _ranges)
            {
                // start <= end by construction, so the subtraction cannot underflow.
                // span + 1 is the range length; compare span first because span + 1
                // itself overflows for the range [0, ulong.MaxValue].
                var span = end - start;
                if (span >= limit)
                    return int.MaxValue;

                // total < limit and span + 1 <= limit here, so this cannot overflow ulong.
                total += span + 1;
                if (total >= limit)
                    return int.MaxValue;
            }

            return (int)total;
        }
    }

    /// <summary>True when the set contains no sequences.</summary>
    public bool IsEmpty => _ranges.Count == 0;

    /// <summary>
    /// Returns the number of distinct compressed ranges stored internally.
    /// </summary>
    internal int RangeCount => _ranges.Count;

    /// <summary>
    /// Adds <paramref name="seq"/> to the set.
    /// Merges adjacent or overlapping ranges automatically.
    /// Reference: golang/nats-server/server/avl/seqset.go:44 (Insert).
    /// </summary>
    /// <param name="seq">The sequence number to add.</param>
    /// <returns><c>true</c> if the sequence was not already present.</returns>
    public bool Add(ulong seq)
    {
        // Upper-bound binary search: lo ends up as the first index whose Start > seq,
        // so lo - 1 is the last range with Start <= seq (the only one that can contain seq).
        var lo = 0;
        var hi = _ranges.Count; // exclusive upper bound
        while (lo < hi)
        {
            var mid = lo + (hi - lo) / 2;
            if (_ranges[mid].Start <= seq)
                lo = mid + 1;
            else
                hi = mid;
        }

        var leftIdx = lo - 1; // -1 when no range has Start <= seq
        var rightIdx = lo;    // == _ranges.Count when no range has Start > seq

        if (leftIdx >= 0 && _ranges[leftIdx].End >= seq)
            return false; // already present

        // Left neighbor ends exactly one below seq. End + 1 cannot overflow here:
        // End == ulong.MaxValue would have satisfied End >= seq above.
        var leftAdjacent = leftIdx >= 0 && _ranges[leftIdx].End + 1 == seq;

        // Right neighbor starts exactly one above seq. Start > seq >= 0 already implies
        // Start > 0; the explicit guard just keeps the subtraction obviously safe.
        var rightAdjacent = rightIdx < _ranges.Count
                            && _ranges[rightIdx].Start > 0
                            && _ranges[rightIdx].Start - 1 == seq;

        if (leftAdjacent && rightAdjacent)
        {
            // seq bridges the gap between the two neighbors: collapse them into one range.
            var merged = (_ranges[leftIdx].Start, _ranges[rightIdx].End);
            _ranges.RemoveAt(rightIdx); // higher index first so leftIdx stays valid
            _ranges[leftIdx] = merged;
        }
        else if (leftAdjacent)
        {
            // Extend the left neighbor's right edge to include seq.
            _ranges[leftIdx] = (_ranges[leftIdx].Start, seq);
        }
        else if (rightAdjacent)
        {
            // Extend the right neighbor's left edge to include seq.
            _ranges[rightIdx] = (seq, _ranges[rightIdx].End);
        }
        else
        {
            // No adjacency: insert a new single-element range at its sorted position.
            _ranges.Insert(rightIdx, (seq, seq));
        }

        return true;
    }

    /// <summary>
    /// Removes <paramref name="seq"/> from the set.
    /// Trims or splits the containing range if necessary.
    /// Reference: golang/nats-server/server/avl/seqset.go:80 (Delete).
    /// </summary>
    /// <param name="seq">The sequence number to remove.</param>
    /// <returns><c>true</c> if the sequence was present.</returns>
    public bool Remove(ulong seq)
    {
        var idx = FindRangeIndex(seq);
        if (idx < 0)
            return false;

        var (start, end) = _ranges[idx];
        if (start == end)
        {
            // Single-element range: remove it entirely.
            _ranges.RemoveAt(idx);
        }
        else if (seq == start)
        {
            // Trim left edge.
            _ranges[idx] = (start + 1, end);
        }
        else if (seq == end)
        {
            // Trim right edge.
            _ranges[idx] = (start, end - 1);
        }
        else
        {
            // Strictly interior: split into [start, seq-1] and [seq+1, end].
            _ranges[idx] = (seq + 1, end);
            _ranges.Insert(idx, (start, seq - 1));
        }

        return true;
    }

    /// <summary>
    /// Returns <c>true</c> if <paramref name="seq"/> is a member of the set.
    /// Binary search: O(log R) where R is the number of distinct ranges.
    /// Reference: golang/nats-server/server/avl/seqset.go:52 (Exists).
    /// </summary>
    /// <param name="seq">The sequence number to look up.</param>
    public bool Contains(ulong seq) => FindRangeIndex(seq) >= 0;

    /// <summary>
    /// Removes all sequences from the set.
    /// Reference: golang/nats-server/server/avl/seqset.go:107 (Empty).
    /// </summary>
    public void Clear() => _ranges.Clear();

    /// <summary>
    /// Copies all sequences in this set into a new <see cref="HashSet{T}"/>.
    /// Used when callers require an <see cref="IReadOnlySet{T}"/> snapshot.
    /// Expands every range, so cost is proportional to the number of sequences.
    /// </summary>
    public HashSet<ulong> ToHashSet()
    {
        var set = new HashSet<ulong>(Count);
        foreach (var seq in this)
            set.Add(seq);
        return set;
    }

    /// <summary>
    /// Enumerates all sequences in ascending order.
    /// Reference: golang/nats-server/server/avl/seqset.go:122 (Range).
    /// </summary>
    public IEnumerator<ulong> GetEnumerator()
    {
        foreach (var (start, end) in _ranges)
        {
            for (var seq = start; seq <= end; seq++)
            {
                yield return seq;

                // seq++ would wrap to 0 and keep the loop condition true forever.
                if (seq == ulong.MaxValue)
                    yield break;
            }
        }
    }

    System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
        => GetEnumerator();

    /// <summary>
    /// Binary-searches for the range that contains <paramref name="seq"/>.
    /// Returns its index in the range list, or -1 when no range contains it.
    /// </summary>
    private int FindRangeIndex(ulong seq)
    {
        var lo = 0;
        var hi = _ranges.Count - 1;
        while (lo <= hi)
        {
            var mid = lo + (hi - lo) / 2;
            var (start, end) = _ranges[mid];
            if (seq < start)
                hi = mid - 1;
            else if (seq > end)
                lo = mid + 1;
            else
                return mid;
        }

        return -1;
    }
}