Implement Go-parity background flush loop (coalesce 16KB/8ms) in MsgBlock/FileStore, replace O(n) GetStateAsync with incremental counters, skip PruneExpired/LoadAsync/PrunePerSubject when not needed, and bypass RAFT for single-replica streams. Fix counter tracking bugs in RemoveMsg/EraseMsg/TTL expiry and ObjectDisposedException races in flush loop disposal. FileStore optimizations verified with 3112/3112 JetStream tests passing; async publish benchmark remains at ~174 msg/s due to E2E protocol path bottleneck.
816 lines
29 KiB
C#
816 lines
29 KiB
C#
// Reference: golang/nats-server/server/filestore.go:217-267 (msgBlock struct)
|
|
// Go block write: filestore.go:6700-6760 (writeMsgRecord / writeMsgRecordLocked)
|
|
// Go block load: filestore.go:8140-8260 (loadMsgs / msgFromBufEx)
|
|
// Go deletion: filestore.go dmap (avl.SequenceSet) for soft-deletes
|
|
// Go sealing: filestore.go rbytes check — block rolls when rbytes >= maxBytes
|
|
// Go write cache: filestore.go msgBlock.cache — recently-written records kept in
|
|
// memory to avoid disk reads on the hot path (cache field, clearCache method).
|
|
//
|
|
// MsgBlock is the unit of storage in the file store. Messages are appended
|
|
// sequentially as binary records (using MessageRecord). Blocks are sealed
|
|
// (read-only) when they reach a configurable size limit.
|
|
|
|
using System.Security.Cryptography;
|
|
using Microsoft.Win32.SafeHandles;
|
|
|
|
namespace NATS.Server.JetStream.Storage;
|
|
|
|
/// <summary>
|
|
/// A block of messages stored in a single append-only file on disk.
|
|
/// This is the unit of storage in the file store. Messages are appended
|
|
/// sequentially as binary records. Blocks become sealed (read-only) when
|
|
/// they reach a configurable byte-size limit.
|
|
/// </summary>
|
|
public sealed class MsgBlock : IDisposable
|
|
{
|
|
// Backing block file; flushed and disposed together with the block.
private readonly FileStream _file;

// OS handle of the block file, used for positional RandomAccess reads/writes.
private readonly SafeFileHandle _handle;

// Sequence -> (file offset, encoded length) of the record holding that sequence.
private readonly Dictionary<ulong, (long Offset, int Length)> _index =
    new Dictionary<ulong, (long Offset, int Length)>();

// Soft-deleted sequences.
// Go: msgBlock.dmap — avl.SequenceSet for sparse deletion tracking.
// Reference: golang/nats-server/server/avl/seqset.go (SequenceSet).
// The .NET port uses a sorted-range list (see SequenceSet.cs) giving O(log n)
// operations and range compression for contiguous runs (TTL, bulk remove).
private readonly SequenceSet _deleted = new SequenceSet();

// Sequences written by WriteSkip.
// Go: SkipMsg writes tombstone records with an empty subject; they are tracked
// separately so recovery can tell intentional gaps from soft-deleted messages.
private readonly SequenceSet _skipSequences = new SequenceSet();

// Byte size at which the block counts as sealed.
private readonly long _maxBytes;

// Guards all mutable state below as well as the index and deletion sets.
private readonly ReaderWriterLockSlim _lock = new ReaderWriterLockSlim();

// Append position, tracked independently of FileStream.Position.
private long _writeOffset;

// Sequence the next auto-assigned Write will receive.
private ulong _nextSequence;

// First and most recently recorded sequence numbers of the block.
private ulong _firstSequence;
private ulong _lastSequence;

// Total records written, including ones deleted afterwards.
private ulong _totalWritten;

// Set once Dispose has started.
private bool _disposed;

// In-memory write cache of recently written records (null when cleared).
// Go: msgBlock.cache — only the active (last) block maintains a cache;
// sealed blocks use disk reads.
// Reference: golang/nats-server/server/filestore.go:236 (cache field)
private Dictionary<ulong, MessageRecord>? _cache;

// Encoded records waiting to be written, each with its target file offset.
// The background flush loop in FileStore coalesces these into fewer I/O calls.
// Reference: golang/nats-server/server/filestore.go:6700 (cache.buf write path).
private readonly List<(byte[] Data, long Offset)> _pendingWrites =
    new List<(byte[] Data, long Offset)>();

// Sum of the lengths of all buffers currently in _pendingWrites.
private int _pendingBytes;

// Checksum of the last written record (XxHash64, 8 bytes); null until the first write.
// Go: msgBlock.lchk — tracked so callers can chain checksum verification across blocks.
// Reference: golang/nats-server/server/filestore.go:2204 (lchk field)
private byte[]? _lastChecksum;
|
|
|
|
/// <summary>
/// Wraps an already-opened block file. Appends continue from the current end
/// of the file, and sequence numbering starts at <paramref name="firstSequence"/>.
/// </summary>
/// <param name="file">Open stream over the block file.</param>
/// <param name="blockId">Block identifier.</param>
/// <param name="maxBytes">Byte size at which the block counts as sealed.</param>
/// <param name="firstSequence">Initial first/next sequence number.</param>
private MsgBlock(FileStream file, int blockId, long maxBytes, ulong firstSequence)
{
    _file = file;
    _handle = file.SafeFileHandle;
    _writeOffset = file.Length;

    BlockId = blockId;
    _maxBytes = maxBytes;

    (_firstSequence, _nextSequence) = (firstSequence, firstSequence);
}

/// <summary>Block identifier.</summary>
public int BlockId { get; }
|
|
|
|
/// <summary>First sequence number in this block (read under the read lock).</summary>
public ulong FirstSequence
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var first = _firstSequence;
            return first;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>Last sequence number written (read under the read lock).</summary>
public ulong LastSequence
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var last = _lastSequence;
            return last;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Number of records written to this block minus the number of deleted sequences.
/// </summary>
public ulong MessageCount
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var deleted = (ulong)_deleted.Count;
            return _totalWritten - deleted;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Number of sequences in the deletion set. Mirrors Go's msgBlock.dmap.Size().
/// </summary>
public ulong DeletedCount
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var deleted = (ulong)_deleted.Count;
            return deleted;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Current append offset of the block, i.e. the bytes occupied by records
/// including any that are still buffered and not yet flushed.
/// </summary>
public long BytesUsed
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var used = _writeOffset;
            return used;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>True once the append offset has reached the configured byte limit.</summary>
public bool IsSealed
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var hasRoom = _writeOffset < _maxBytes;
            return !hasRoom;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Indicates whether a write cache instance currently exists.
/// Lets tests verify cache presence without exposing the cached records.
/// </summary>
public bool HasCache
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            return _cache != null;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Number of bytes buffered in this block that have not been written to disk yet.
/// The background flush loop uses it to decide when to coalesce.
/// Returns 0 when the block has been disposed.
/// </summary>
public int PendingWriteSize
{
    get
    {
        if (_disposed)
        {
            return 0;
        }

        try
        {
            _lock.EnterReadLock();
        }
        catch (ObjectDisposedException)
        {
            // The lock was disposed between the flag check and the acquire.
            return 0;
        }

        try
        {
            var pending = _pendingBytes;
            return pending;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Copy of the checksum (8 bytes) captured from the last record written via
/// <see cref="Write"/>, <see cref="WriteAt"/> or <see cref="WriteSkip"/>, or
/// null when nothing has been written yet. A fresh array is returned on every
/// call so callers cannot mutate the tracked value.
/// Reference: golang/nats-server/server/filestore.go:2204 (msgBlock.lchk)
/// </summary>
public byte[]? LastChecksum
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var current = _lastChecksum;
            if (current is null)
            {
                return null;
            }

            var copy = new byte[current.Length];
            Array.Copy(current, copy, current.Length);
            return copy;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Creates a new, empty block file inside <paramref name="directoryPath"/>,
/// creating the directory first when necessary. The file is opened with
/// <see cref="FileMode.CreateNew"/>, so an existing block file with the same
/// id causes the open to fail.
/// </summary>
/// <param name="blockId">Block identifier.</param>
/// <param name="directoryPath">Directory to store the block file.</param>
/// <param name="maxBytes">Size limit before sealing.</param>
/// <param name="firstSequence">First sequence number (default 1).</param>
/// <returns>A new <see cref="MsgBlock"/> ready for writes.</returns>
public static MsgBlock Create(int blockId, string directoryPath, long maxBytes, ulong firstSequence = 1)
{
    Directory.CreateDirectory(directoryPath);

    var stream = new FileStream(
        BlockFilePath(directoryPath, blockId),
        FileMode.CreateNew,
        FileAccess.ReadWrite,
        FileShare.Read);

    return new MsgBlock(stream, blockId, maxBytes, firstSequence);
}
|
|
|
|
/// <summary>
/// Recovers a block from an existing file, rebuilding the in-memory index by
/// scanning its records. If the scan fails, the file is closed before the
/// exception propagates so the block file is not left locked.
/// </summary>
/// <param name="blockId">Block identifier.</param>
/// <param name="directoryPath">Directory containing the block file.</param>
/// <returns>A recovered <see cref="MsgBlock"/>.</returns>
public static MsgBlock Recover(int blockId, string directoryPath)
{
    var filePath = BlockFilePath(directoryPath, blockId);
    var file = new FileStream(filePath, FileMode.Open, FileAccess.ReadWrite, FileShare.Read);

    try
    {
        // We don't know maxBytes from the file alone — use long.MaxValue so
        // the recovered block is effectively unsealed. The caller can re-create
        // with proper limits if needed.
        var block = new MsgBlock(file, blockId, long.MaxValue, firstSequence: 0);
        block.RebuildIndex();
        return block;
    }
    catch
    {
        // Nobody else holds a reference to the stream yet; without this the
        // handle would stay open until finalization.
        file.Dispose();
        throw;
    }
}
|
|
|
|
/// <summary>
/// Appends a message using the next auto-assigned sequence number. The encoded
/// record is queued in the pending-write buffer (not written to disk here),
/// indexed, and placed in the write cache so later reads can skip the disk.
/// Reference: golang/nats-server/server/filestore.go:6700 (writeMsgRecord).
/// </summary>
/// <param name="subject">NATS subject.</param>
/// <param name="headers">Optional message headers.</param>
/// <param name="payload">Message body payload.</param>
/// <returns>The assigned sequence number.</returns>
/// <exception cref="InvalidOperationException">Block is sealed.</exception>
public ulong Write(string subject, ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload)
{
    _lock.EnterWriteLock();
    try
    {
        if (_writeOffset >= _maxBytes)
        {
            throw new InvalidOperationException("Block is sealed; cannot write new messages.");
        }

        var assigned = _nextSequence;
        // Millisecond clock reading scaled to nanoseconds.
        var timestampNanos = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() * 1_000_000L;

        var message = new MessageRecord
        {
            Sequence = assigned,
            Subject = subject,
            Headers = headers,
            Payload = payload,
            Timestamp = timestampNanos,
            Deleted = false,
        };

        var bytes = MessageRecord.Encode(message);
        var position = _writeOffset;

        // Queue for batched disk I/O; the background flush loop in FileStore
        // coalesces the pending writes.
        _pendingWrites.Add((bytes, position));
        _pendingBytes += bytes.Length;
        _writeOffset = position + bytes.Length;
        _index[assigned] = (position, bytes.Length);

        // Go: cache recently-written record to avoid disk reads on hot path.
        // Reference: golang/nats-server/server/filestore.go:6730 (cache population).
        (_cache ??= new Dictionary<ulong, MessageRecord>())[assigned] = message;

        // Go: msgBlock.lchk — the checksum is the trailing 8 bytes of the record.
        // Reference: golang/nats-server/server/filestore.go:2204 (lchk update on write)
        _lastChecksum ??= new byte[8];
        bytes.AsSpan(bytes.Length - 8).CopyTo(_lastChecksum);

        if (_totalWritten == 0)
        {
            _firstSequence = assigned;
        }

        _totalWritten++;
        _lastSequence = assigned;
        _nextSequence = assigned + 1;

        return assigned;
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
/// <summary>
/// Appends a message with a caller-supplied sequence number and timestamp.
/// Used by FileStore when rewriting blocks from the in-memory cache, where
/// sequences may have gaps (from prior removals). Like <see cref="Write"/>,
/// the record is buffered for a later flush, indexed and added to the write cache.
/// Reference: golang/nats-server/server/filestore.go:6700 (writeMsgRecord).
/// </summary>
/// <param name="sequence">Explicit sequence number to assign.</param>
/// <param name="subject">NATS subject.</param>
/// <param name="headers">Optional message headers.</param>
/// <param name="payload">Message body payload.</param>
/// <param name="timestamp">Timestamp in Unix nanoseconds.</param>
/// <exception cref="InvalidOperationException">Block is sealed.</exception>
public void WriteAt(ulong sequence, string subject, ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload, long timestamp)
{
    _lock.EnterWriteLock();
    try
    {
        if (_writeOffset >= _maxBytes)
        {
            throw new InvalidOperationException("Block is sealed; cannot write new messages.");
        }

        var message = new MessageRecord
        {
            Sequence = sequence,
            Subject = subject,
            Headers = headers,
            Payload = payload,
            Timestamp = timestamp,
            Deleted = false,
        };

        var bytes = MessageRecord.Encode(message);
        var position = _writeOffset;

        // Queue for batched disk I/O; the background flush loop in FileStore
        // coalesces the pending writes.
        _pendingWrites.Add((bytes, position));
        _pendingBytes += bytes.Length;
        _writeOffset = position + bytes.Length;
        _index[sequence] = (position, bytes.Length);

        // Go: cache recently-written record to avoid disk reads on hot path.
        // Reference: golang/nats-server/server/filestore.go:6730 (cache population).
        (_cache ??= new Dictionary<ulong, MessageRecord>())[sequence] = message;

        // Go: msgBlock.lchk — the checksum is the trailing 8 bytes of the record.
        // Reference: golang/nats-server/server/filestore.go:2204 (lchk update on write)
        _lastChecksum ??= new byte[8];
        bytes.AsSpan(bytes.Length - 8).CopyTo(_lastChecksum);

        if (_totalWritten == 0)
        {
            _firstSequence = sequence;
        }

        _totalWritten++;
        _lastSequence = sequence;

        // Never move the auto-assign cursor backwards.
        var following = sequence + 1;
        if (following > _nextSequence)
        {
            _nextSequence = following;
        }
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
/// <summary>
/// Looks up a message by sequence number. Deleted sequences yield null; a
/// write-cache hit is returned without touching the disk; otherwise pending
/// writes are flushed and the record is read from its indexed file position.
/// Reference: golang/nats-server/server/filestore.go:8140 (loadMsgs / msgFromBufEx).
/// </summary>
/// <param name="sequence">The sequence number to read.</param>
/// <returns>The decoded record, or null if not found or deleted.</returns>
public MessageRecord? Read(ulong sequence)
{
    // A write lock is taken because the disk path may drain the pending buffer.
    _lock.EnterWriteLock();
    try
    {
        if (_deleted.Contains(sequence))
        {
            return null;
        }

        // Go: check cache first (msgBlock.cache lookup).
        // Reference: golang/nats-server/server/filestore.go:8155 (cache hit path).
        if (_cache is { } cache && cache.TryGetValue(sequence, out var hit))
        {
            return hit;
        }

        if (!_index.TryGetValue(sequence, out var location))
        {
            return null;
        }

        // Push buffered records to the file so the positional read sees them.
        if (_pendingWrites.Count > 0)
        {
            for (var i = 0; i < _pendingWrites.Count; i++)
            {
                var pending = _pendingWrites[i];
                RandomAccess.Write(_handle, pending.Data, pending.Offset);
            }

            _pendingWrites.Clear();
            _pendingBytes = 0;
        }

        var raw = new byte[location.Length];
        RandomAccess.Read(_handle, raw, location.Offset);
        return MessageRecord.Decode(raw);
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
/// <summary>
/// Soft-deletes a message by sequence number. Re-encodes the record on disk
/// with the deleted flag set (and updated checksum) so the deletion survives
/// recovery, then evicts the sequence from the write cache.
/// When <paramref name="secureErase"/> is <c>true</c>, the payload is replaced
/// with cryptographically random bytes before the record is written back, so
/// the original payload is no longer present in the rewritten record.
/// The sequence is only marked deleted in memory after the on-disk rewrite has
/// succeeded; if the rewrite throws, the block's in-memory state is unchanged.
/// Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
/// </summary>
/// <param name="sequence">The sequence number to delete.</param>
/// <param name="secureErase">
/// When <c>true</c>, payload bytes are filled with random data before the
/// record is written back. Defaults to <c>false</c>.
/// </param>
/// <returns>True if the message was deleted; false if already deleted or not found.</returns>
public bool Delete(ulong sequence, bool secureErase = false)
{
    _lock.EnterWriteLock();
    try
    {
        if (!_index.TryGetValue(sequence, out var entry))
            return false;

        // Check now, mark later: adding to _deleted up front would leave the
        // in-memory state claiming a deletion that never reached disk if any
        // of the I/O below throws.
        if (_deleted.Contains(sequence))
            return false;

        // Flush any pending writes so the record is on disk before we read it
        // back, and so a later flush cannot overwrite the rewritten record.
        if (_pendingWrites.Count > 0)
        {
            foreach (var (data, off) in _pendingWrites)
                RandomAccess.Write(_handle, data, off);
            _pendingWrites.Clear();
            _pendingBytes = 0;
        }

        // Read the existing record, re-encode with Deleted flag, write back in-place.
        // The encoded size doesn't change (only flags byte + checksum differ).
        var buffer = new byte[entry.Length];
        RandomAccess.Read(_handle, buffer, entry.Offset);
        var record = MessageRecord.Decode(buffer);

        ReadOnlyMemory<byte> payload = record.Payload;
        if (secureErase && payload.Length > 0)
        {
            // Go: eraseMsg — overwrite payload region with random bytes so the
            // original content is unrecoverable from disk.
            // Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
            var randomPayload = new byte[payload.Length];
            RandomNumberGenerator.Fill(randomPayload);
            payload = randomPayload;
        }

        var deletedRecord = new MessageRecord
        {
            Sequence = record.Sequence,
            Subject = record.Subject,
            Headers = record.Headers,
            Payload = payload,
            Timestamp = record.Timestamp,
            Deleted = true,
        };

        var encoded = MessageRecord.Encode(deletedRecord);
        RandomAccess.Write(_handle, encoded, entry.Offset);

        // The rewrite succeeded: record the deletion and evict the cached copy.
        _deleted.Add(sequence);
        _cache?.Remove(sequence);

        return true;
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
|
|
/// <summary>
/// Reserves <paramref name="sequence"/> without storing message data by
/// appending a record with an empty subject, no headers or payload, and the
/// Deleted flag set. The sequence is added to both the deletion set and the
/// skip set, and is deliberately kept out of the write cache.
/// This mirrors Go's SkipMsg tombstone behaviour.
/// Reference: golang/nats-server/server/filestore.go — SkipMsg.
/// </summary>
/// <param name="sequence">The sequence number to reserve.</param>
/// <exception cref="InvalidOperationException">Block is sealed.</exception>
public void WriteSkip(ulong sequence)
{
    _lock.EnterWriteLock();
    try
    {
        if (_writeOffset >= _maxBytes)
        {
            throw new InvalidOperationException("Block is sealed; cannot write skip record.");
        }

        var tombstone = new MessageRecord
        {
            Sequence = sequence,
            Subject = string.Empty,
            Headers = ReadOnlyMemory<byte>.Empty,
            Payload = ReadOnlyMemory<byte>.Empty,
            // Millisecond clock reading scaled to nanoseconds.
            Timestamp = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() * 1_000_000L,
            // A skip is deleted from the start.
            Deleted = true,
        };

        var bytes = MessageRecord.Encode(tombstone);
        var position = _writeOffset;

        // Queue for batched disk I/O.
        _pendingWrites.Add((bytes, position));
        _pendingBytes += bytes.Length;
        _writeOffset = position + bytes.Length;

        _index[sequence] = (position, bytes.Length);
        _deleted.Add(sequence);
        // Tracked separately so recovery can tell skips from soft deletes.
        _skipSequences.Add(sequence);

        // Go: msgBlock.lchk — the checksum is the trailing 8 bytes of the record.
        // Reference: golang/nats-server/server/filestore.go:2204 (lchk update on write)
        _lastChecksum ??= new byte[8];
        bytes.AsSpan(bytes.Length - 8).CopyTo(_lastChecksum);

        if (_totalWritten == 0)
        {
            _firstSequence = sequence;
        }

        _totalWritten++;

        // Neither the high-water mark nor the auto-assign cursor moves backwards.
        if (sequence > _lastSequence)
        {
            _lastSequence = sequence;
        }

        var following = sequence + 1;
        if (following > _nextSequence)
        {
            _nextSequence = following;
        }
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
/// <summary>
/// Drops the write cache so its memory can be reclaimed. Reads that follow
/// are served from disk until a new write repopulates the cache. Intended for
/// when the block is sealed (no longer the active block) or under memory pressure.
/// Reference: golang/nats-server/server/filestore.go — clearCache method on msgBlock.
/// </summary>
public void ClearCache()
{
    _lock.EnterWriteLock();
    try
    {
        _cache = default;
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
/// <summary>
/// Writes every buffered (pending) record to the block file at its recorded
/// offset and empties the buffer. Called by the background flush loop in
/// FileStore. Returns 0 without doing anything when the block is disposed.
/// Reference: golang/nats-server/server/filestore.go:7592 (flushPendingMsgsLocked).
/// </summary>
/// <returns>The number of bytes flushed.</returns>
public int FlushPending()
{
    if (_disposed)
    {
        return 0;
    }

    try
    {
        _lock.EnterWriteLock();
    }
    catch (ObjectDisposedException)
    {
        // Block was disposed concurrently (e.g. during PurgeAsync).
        return 0;
    }

    try
    {
        var flushedBytes = 0;

        if (_pendingWrites.Count != 0)
        {
            for (var i = 0; i < _pendingWrites.Count; i++)
            {
                var pending = _pendingWrites[i];
                RandomAccess.Write(_handle, pending.Data, pending.Offset);
            }

            flushedBytes = _pendingBytes;
            _pendingWrites.Clear();
            _pendingBytes = 0;
        }

        return flushedBytes;
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
/// <summary>
/// Reports whether <paramref name="sequence"/> is present in this block's deletion set.
/// Reference: golang/nats-server/server/filestore.go — dmap (deleted map) lookup.
/// </summary>
/// <param name="sequence">The sequence number to test.</param>
/// <returns>True when the sequence has been soft-deleted in this block.</returns>
public bool IsDeleted(ulong sequence)
{
    _lock.EnterReadLock();
    try
    {
        var deleted = _deleted.Contains(sequence);
        return deleted;
    }
    finally
    {
        _lock.ExitReadLock();
    }
}
|
|
|
|
/// <summary>
/// Highest skip sequence written into this block, or 0 when there is none.
/// Skip sequences are intentional tombstones from SkipMsg/SkipMsgs — they
/// advance the last sequence without storing a live message, so recovery has
/// to take them into account when computing the high-water mark.
/// </summary>
public ulong MaxSkipSequence
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            var highest = 0UL;

            if (!_skipSequences.IsEmpty)
            {
                // SequenceSet enumerates in ascending order, so whichever
                // element is seen last is the maximum.
                foreach (var skipped in _skipSequences)
                {
                    highest = skipped;
                }
            }

            return highest;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Snapshot of the soft-deleted sequence numbers for read-only inspection.
/// The set is copied under the read lock on each access, so later deletions
/// are not reflected in a previously returned instance.
/// Reference: golang/nats-server/server/filestore.go — dmap access for state queries.
/// </summary>
public IReadOnlySet<ulong> DeletedSequences
{
    get
    {
        _lock.EnterReadLock();
        try
        {
            IReadOnlySet<ulong> snapshot = _deleted.ToHashSet();
            return snapshot;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }
}
|
|
|
|
/// <summary>
/// Lazily yields every non-deleted sequence in this block together with its
/// subject, in ascending sequence order. Used by FileStore for
/// subject-filtered operations (PurgeEx, SubjectsState, etc.).
/// The set of candidate records is snapshotted under the write lock when
/// enumeration starts; each record is then resolved from the write cache or,
/// failing that, read from disk, and skipped if it turns out to be flagged deleted.
/// Reference: golang/nats-server/server/filestore.go — loadBlock, iterating non-deleted records.
/// </summary>
/// <returns>Sequence/subject pairs for the live records of the block.</returns>
public IEnumerable<(ulong Sequence, string Subject)> EnumerateNonDeleted()
{
    List<(long Offset, int Length, ulong Seq)> live;

    // Write lock: the snapshot may first have to drain the pending buffer.
    _lock.EnterWriteLock();
    try
    {
        // Push buffered records to the file so the disk reads below see them.
        if (_pendingWrites.Count > 0)
        {
            for (var i = 0; i < _pendingWrites.Count; i++)
            {
                var pending = _pendingWrites[i];
                RandomAccess.Write(_handle, pending.Data, pending.Offset);
            }

            _pendingWrites.Clear();
            _pendingBytes = 0;
        }

        live = new List<(long Offset, int Length, ulong Seq)>(_index.Count);
        foreach (var indexed in _index)
        {
            if (_deleted.Contains(indexed.Key))
            {
                continue;
            }

            live.Add((indexed.Value.Offset, indexed.Value.Length, indexed.Key));
        }
    }
    finally
    {
        _lock.ExitWriteLock();
    }

    // Dictionary order is unspecified; order by sequence for deterministic output.
    live.Sort((left, right) => left.Seq.CompareTo(right.Seq));

    foreach (var slot in live)
    {
        // Prefer the write cache over a disk read.
        MessageRecord? resolved = null;
        _lock.EnterReadLock();
        try
        {
            if (_cache is not null)
            {
                _cache.TryGetValue(slot.Seq, out resolved);
            }
        }
        finally
        {
            _lock.ExitReadLock();
        }

        if (resolved is null)
        {
            var raw = new byte[slot.Length];
            RandomAccess.Read(_handle, raw, slot.Offset);
            resolved = MessageRecord.Decode(raw);
        }

        if (resolved is not null && !resolved.Deleted)
        {
            yield return (resolved.Sequence, resolved.Subject);
        }
    }
}
|
|
|
|
/// <summary>
/// Writes all pending buffered records to the block file and then flushes
/// the file stream with <c>flushToDisk: true</c>.
/// </summary>
public void Flush()
{
    _lock.EnterWriteLock();
    try
    {
        // Drain the pending buffer before syncing.
        if (_pendingWrites.Count > 0)
        {
            for (var i = 0; i < _pendingWrites.Count; i++)
            {
                var pending = _pendingWrites[i];
                RandomAccess.Write(_handle, pending.Data, pending.Offset);
            }

            _pendingWrites.Clear();
            _pendingBytes = 0;
        }

        _file.Flush(flushToDisk: true);
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
|
|
|
|
/// <summary>
/// Writes any pending buffered records, then closes the file and releases
/// the lock. Subsequent calls are no-ops. If writing the pending records
/// fails, the exception still propagates, but the file stream and the lock
/// are disposed regardless so the file handle is not leaked.
/// </summary>
public void Dispose()
{
    if (_disposed)
        return;
    _disposed = true;

    try
    {
        _lock.EnterWriteLock();
        try
        {
            try
            {
                // Flush pending buffered writes before closing.
                if (_pendingWrites.Count > 0)
                {
                    foreach (var (data, offset) in _pendingWrites)
                        RandomAccess.Write(_handle, data, offset);
                    _pendingWrites.Clear();
                    _pendingBytes = 0;
                }

                _file.Flush();
            }
            finally
            {
                // _disposed is already set, so a failed flush could never be
                // retried through Dispose; close the file unconditionally.
                _file.Dispose();
            }
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }
    finally
    {
        _lock.Dispose();
    }
}
|
|
|
|
/// <summary>
/// Rebuilds the in-memory index by scanning all records in the block file.
/// The file is read into memory once and walked record by record;
/// <see cref="MessageRecord.MeasureRecord"/> determines each record's size
/// before decoding, so trailing data from subsequent records doesn't corrupt
/// the checksum validation. Records flagged as deleted are added to the
/// deletion set, and those with an empty subject to the skip set as well.
/// </summary>
/// <exception cref="NotSupportedException">
/// The block file is larger than a single array can hold.
/// </exception>
/// <exception cref="InvalidDataException">
/// A record reported a non-positive length, which would stall the scan.
/// </exception>
private void RebuildIndex()
{
    var fileLength = _file.Length;
    if (fileLength > Array.MaxLength)
        throw new NotSupportedException(
            $"Block file of {fileLength} bytes is too large to recover in a single pass.");

    // Read the whole file once. Re-reading the remaining bytes for every
    // record, as a per-record read would, makes recovery quadratic in I/O.
    var data = new byte[(int)fileLength];
    var filled = 0;
    while (filled < data.Length)
    {
        // A positional read may return fewer bytes than requested.
        var read = RandomAccess.Read(_handle, data.AsSpan(filled), filled);
        if (read == 0)
            break; // End of file reached earlier than the reported length.
        filled += read;
    }

    var cursor = 0;
    ulong count = 0;

    while (cursor < filled)
    {
        // NOTE(review): assumes MeasureRecord accepts a span, as Decode does — confirm.
        var remaining = data.AsSpan(cursor, filled - cursor);

        // Measure the first record's length, then decode only that slice.
        var recordLength = MessageRecord.MeasureRecord(remaining);
        if (recordLength <= 0)
            throw new InvalidDataException(
                $"Record at offset {cursor} in block {BlockId} reported a non-positive length.");

        var record = MessageRecord.Decode(remaining.Slice(0, recordLength));

        _index[record.Sequence] = ((long)cursor, recordLength);

        if (record.Deleted)
        {
            _deleted.Add(record.Sequence);
            // Empty subject = skip/tombstone record (from SkipMsg/SkipMsgs).
            if (string.IsNullOrEmpty(record.Subject))
                _skipSequences.Add(record.Sequence);
        }

        if (count == 0)
            _firstSequence = record.Sequence;

        _lastSequence = record.Sequence;
        // Same rule as WriteAt/WriteSkip: the auto-assign cursor never moves
        // backwards, so an out-of-order record cannot cause a later Write to
        // reuse a sequence that already exists in the block.
        _nextSequence = Math.Max(_nextSequence, record.Sequence + 1);
        count++;

        cursor += recordLength;
    }

    _totalWritten = count;
    _writeOffset = cursor;
    // Note: recovered blocks do not populate the write cache — reads go to disk.
    // The cache is only populated during active writes on the hot path.
}
|
|
|
|
/// <summary>
/// Builds the path of a block file: the block id zero-padded to six digits
/// with a <c>.blk</c> extension, inside <paramref name="directoryPath"/>.
/// </summary>
private static string BlockFilePath(string directoryPath, int blockId)
{
    var fileName = blockId.ToString("D6") + ".blk";
    return Path.Combine(directoryPath, fileName);
}
|
|
}
|