// Reference: golang/nats-server/server/filestore.go:217-267 (msgBlock struct)
// Go block write: filestore.go:6700-6760 (writeMsgRecord / writeMsgRecordLocked)
// Go block load: filestore.go:8140-8260 (loadMsgs / msgFromBufEx)
// Go deletion: filestore.go dmap (avl.SequenceSet) for soft-deletes
// Go sealing: filestore.go rbytes check — block rolls when rbytes >= maxBytes
// Go write cache: filestore.go msgBlock.cache — recently-written records kept in
// memory to avoid disk reads on the hot path (cache field, clearCache method).
//
// MsgBlock is the unit of storage in the file store. Messages are appended
// sequentially as binary records (using MessageRecord). Blocks are sealed
// (read-only) when they reach a configurable size limit.

using System.Security.Cryptography;
using Microsoft.Win32.SafeHandles;

namespace NATS.Server.JetStream.Storage;

/// <summary>
/// A block of messages stored in a single append-only file on disk.
/// This is the unit of storage in the file store. Messages are appended
/// sequentially as binary records. Blocks become sealed (read-only) when
/// they reach a configurable byte-size limit.
/// </summary>
/// <remarks>
/// All mutable state is guarded by a <see cref="ReaderWriterLockSlim"/>. Private
/// helpers whose name ends in <c>Locked</c> expect the caller to already hold the
/// write lock.
/// </remarks>
public sealed class MsgBlock : IDisposable
{
    // The checksum occupies the trailing bytes of every encoded record.
    private const int ChecksumLength = 8;

    private readonly FileStream _file;
    private readonly SafeFileHandle _handle;

    // Sequence -> (file offset, encoded length) of the record.
    private readonly Dictionary<ulong, (long Offset, int Length)> _index = new();

    // Go: msgBlock.dmap — avl.SequenceSet for sparse deletion tracking.
    // Reference: golang/nats-server/server/avl/seqset.go (SequenceSet).
    // .NET uses a sorted-range list (see SequenceSet.cs) for O(log n) ops with
    // range compression for contiguous deletion runs (TTL, bulk remove).
    private readonly SequenceSet _deleted = new();

    // Go: SkipMsg writes tombstone records with empty subject — tracked separately so
    // recovery can distinguish intentional sequence gaps from soft-deleted messages.
    private readonly SequenceSet _skipSequences = new();

    private readonly long _maxBytes;
    private readonly ReaderWriterLockSlim _lock = new();

    private long _writeOffset; // Tracks the append position independently of FileStream.Position
    private ulong _nextSequence;
    private ulong _firstSequence;
    private ulong _lastSequence;
    private ulong _totalWritten; // Total records written (including later-deleted)
    private bool _disposed;

    // Go: msgBlock.cache — in-memory write cache for recently-written records.
    // Only the active (last) block maintains a cache; sealed blocks use disk reads.
    // Reference: golang/nats-server/server/filestore.go:236 (cache field)
    private Dictionary<ulong, MessageRecord>? _cache;

    // Pending write buffer — accumulates encoded records for batched disk writes.
    // The background flush loop in FileStore coalesces these into fewer I/O calls.
    // Reference: golang/nats-server/server/filestore.go:6700 (cache.buf write path).
    private readonly List<(byte[] Data, long Offset)> _pendingWrites = new();
    private int _pendingBytes;

    // Go: msgBlock.lchk — last written record checksum (XxHash64, 8 bytes).
    // Tracked so callers can chain checksum verification across blocks.
    // Reference: golang/nats-server/server/filestore.go:2204 (lchk field)
    private byte[]? _lastChecksum;

    private MsgBlock(FileStream file, int blockId, long maxBytes, ulong firstSequence)
    {
        _file = file;
        _handle = file.SafeFileHandle;
        BlockId = blockId;
        _maxBytes = maxBytes;
        _firstSequence = firstSequence;
        _nextSequence = firstSequence;
        _writeOffset = file.Length;
    }

    /// <summary>Block identifier.</summary>
    public int BlockId { get; }

    /// <summary>First sequence number in this block.</summary>
    public ulong FirstSequence
    {
        get
        {
            _lock.EnterReadLock();
            try { return _firstSequence; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>Last sequence number written.</summary>
    public ulong LastSequence
    {
        get
        {
            _lock.EnterReadLock();
            try { return _lastSequence; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>Total messages excluding deleted.</summary>
    public ulong MessageCount
    {
        get
        {
            _lock.EnterReadLock();
            try { return _totalWritten - (ulong)_deleted.Count; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>Count of soft-deleted messages. Mirrors Go's msgBlock.dmap.Size().</summary>
    public ulong DeletedCount
    {
        get
        {
            _lock.EnterReadLock();
            try { return (ulong)_deleted.Count; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>Total bytes written to block file (including writes still pending flush).</summary>
    public long BytesUsed
    {
        get
        {
            _lock.EnterReadLock();
            try { return _writeOffset; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>True when BytesUsed >= maxBytes (block is full).</summary>
    public bool IsSealed
    {
        get
        {
            _lock.EnterReadLock();
            try { return _writeOffset >= _maxBytes; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>
    /// True when the write cache is currently populated.
    /// Used by tests to verify cache presence without exposing the cache contents directly.
    /// </summary>
    public bool HasCache
    {
        get
        {
            _lock.EnterReadLock();
            try { return _cache is not null; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>
    /// Total bytes of pending (not yet flushed to disk) writes in this block.
    /// Used by the background flush loop to decide when to coalesce.
    /// Returns 0 once the block has been disposed.
    /// </summary>
    public int PendingWriteSize
    {
        get
        {
            if (_disposed)
                return 0;

            try
            {
                _lock.EnterReadLock();
            }
            catch (ObjectDisposedException)
            {
                // Block was disposed between the flag check and taking the lock.
                return 0;
            }

            try { return _pendingBytes; }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>
    /// The XxHash64 checksum of the last record written to this block (8 bytes), or null
    /// if no records have been written yet. Updated after every <see cref="Write"/>,
    /// <see cref="WriteAt"/>, or <see cref="WriteSkip"/> call. A defensive copy is returned.
    /// Reference: golang/nats-server/server/filestore.go:2204 (msgBlock.lchk)
    /// </summary>
    public byte[]? LastChecksum
    {
        get
        {
            _lock.EnterReadLock();
            try { return _lastChecksum is null ? null : (byte[])_lastChecksum.Clone(); }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>
    /// Returns the maximum skip-sequence written into this block (0 if none).
    /// Skip sequences are intentional tombstones from SkipMsg/SkipMsgs —
    /// they bump _last without storing a live message, so recovery must account
    /// for them when computing the high-water mark.
    /// </summary>
    public ulong MaxSkipSequence
    {
        get
        {
            _lock.EnterReadLock();
            try
            {
                if (_skipSequences.IsEmpty)
                    return 0UL;

                // SequenceSet enumerates in ascending order; last element is max.
                ulong max = 0;
                foreach (var seq in _skipSequences)
                    max = seq;
                return max;
            }
            finally
            {
                _lock.ExitReadLock();
            }
        }
    }

    /// <summary>
    /// Exposes the set of soft-deleted sequence numbers for read-only inspection.
    /// Returns a snapshot as a <see cref="HashSet{T}"/> so callers can use
    /// standard <see cref="IReadOnlySet{T}"/> operations.
    /// Reference: golang/nats-server/server/filestore.go — dmap access for state queries.
    /// </summary>
    public IReadOnlySet<ulong> DeletedSequences
    {
        get
        {
            _lock.EnterReadLock();
            try { return _deleted.ToHashSet(); }
            finally { _lock.ExitReadLock(); }
        }
    }

    /// <summary>
    /// Creates a new empty block file.
    /// </summary>
    /// <param name="blockId">Block identifier.</param>
    /// <param name="directoryPath">Directory to store the block file.</param>
    /// <param name="maxBytes">Size limit before sealing.</param>
    /// <param name="firstSequence">First sequence number (default 1).</param>
    /// <returns>A new <see cref="MsgBlock"/> ready for writes.</returns>
    public static MsgBlock Create(int blockId, string directoryPath, long maxBytes, ulong firstSequence = 1)
    {
        Directory.CreateDirectory(directoryPath);
        var filePath = BlockFilePath(directoryPath, blockId);
        var file = new FileStream(filePath, FileMode.CreateNew, FileAccess.ReadWrite, FileShare.Read);
        return new MsgBlock(file, blockId, maxBytes, firstSequence);
    }

    /// <summary>
    /// Recovers a block from an existing file, rebuilding the in-memory index.
    /// </summary>
    /// <param name="blockId">Block identifier.</param>
    /// <param name="directoryPath">Directory containing the block file.</param>
    /// <returns>A recovered <see cref="MsgBlock"/>.</returns>
    public static MsgBlock Recover(int blockId, string directoryPath)
    {
        var filePath = BlockFilePath(directoryPath, blockId);
        var file = new FileStream(filePath, FileMode.Open, FileAccess.ReadWrite, FileShare.Read);

        try
        {
            // We don't know maxBytes from the file alone — use long.MaxValue so
            // the recovered block is effectively unsealed. The caller can re-create
            // with proper limits if needed.
            var block = new MsgBlock(file, blockId, long.MaxValue, firstSequence: 0);
            block.RebuildIndex();
            return block;
        }
        catch
        {
            // Do not leak the open file handle when the scan fails (e.g. corrupt record).
            file.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Appends a message to the block with an auto-assigned sequence number.
    /// Populates the write cache so subsequent reads can bypass disk.
    /// Reference: golang/nats-server/server/filestore.go:6700 (writeMsgRecord).
    /// </summary>
    /// <param name="subject">NATS subject.</param>
    /// <param name="headers">Optional message headers.</param>
    /// <param name="payload">Message body payload.</param>
    /// <returns>The assigned sequence number.</returns>
    /// <exception cref="InvalidOperationException">Block is sealed.</exception>
    public ulong Write(string subject, ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload)
    {
        _lock.EnterWriteLock();
        try
        {
            if (_writeOffset >= _maxBytes)
                throw new InvalidOperationException("Block is sealed; cannot write new messages.");

            var sequence = _nextSequence;
            AppendLocked(
                new MessageRecord
                {
                    Sequence = sequence,
                    Subject = subject,
                    Headers = headers,
                    Payload = payload,
                    Timestamp = UnixNanosNow(),
                    Deleted = false,
                },
                cacheRecord: true);

            _lastSequence = sequence;
            _nextSequence = sequence + 1;
            return sequence;
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Appends a message to the block with an explicit sequence number and timestamp.
    /// Used by FileStore when rewriting blocks from the in-memory cache where
    /// sequences may have gaps (from prior removals).
    /// Populates the write cache so subsequent reads can bypass disk.
    /// Reference: golang/nats-server/server/filestore.go:6700 (writeMsgRecord).
    /// </summary>
    /// <param name="sequence">Explicit sequence number to assign.</param>
    /// <param name="subject">NATS subject.</param>
    /// <param name="headers">Optional message headers.</param>
    /// <param name="payload">Message body payload.</param>
    /// <param name="timestamp">Timestamp in Unix nanoseconds.</param>
    /// <exception cref="InvalidOperationException">Block is sealed.</exception>
    public void WriteAt(ulong sequence, string subject, ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload, long timestamp)
    {
        _lock.EnterWriteLock();
        try
        {
            if (_writeOffset >= _maxBytes)
                throw new InvalidOperationException("Block is sealed; cannot write new messages.");

            AppendLocked(
                new MessageRecord
                {
                    Sequence = sequence,
                    Subject = subject,
                    Headers = headers,
                    Payload = payload,
                    Timestamp = timestamp,
                    Deleted = false,
                },
                cacheRecord: true);

            _lastSequence = sequence;
            _nextSequence = Math.Max(_nextSequence, sequence + 1);
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Writes a skip record for the given sequence number — reserves the sequence
    /// without storing actual message data. The record is written with the Deleted
    /// flag set so recovery skips it when rebuilding the in-memory message cache.
    /// This mirrors Go's SkipMsg tombstone behaviour.
    /// Reference: golang/nats-server/server/filestore.go — SkipMsg.
    /// </summary>
    /// <param name="sequence">The sequence number to reserve.</param>
    /// <exception cref="InvalidOperationException">Block is sealed.</exception>
    public void WriteSkip(ulong sequence)
    {
        _lock.EnterWriteLock();
        try
        {
            if (_writeOffset >= _maxBytes)
                throw new InvalidOperationException("Block is sealed; cannot write skip record.");

            // Intentionally NOT added to the write cache: a skip is deleted from the start.
            AppendLocked(
                new MessageRecord
                {
                    Sequence = sequence,
                    Subject = string.Empty,
                    Headers = ReadOnlyMemory<byte>.Empty,
                    Payload = ReadOnlyMemory<byte>.Empty,
                    Timestamp = UnixNanosNow(),
                    Deleted = true,
                },
                cacheRecord: false);

            _deleted.Add(sequence);
            _skipSequences.Add(sequence); // Track skip sequences separately for recovery

            _lastSequence = Math.Max(_lastSequence, sequence);
            _nextSequence = Math.Max(_nextSequence, sequence + 1);
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Reads a message by sequence number.
    /// Checks the write cache first to avoid disk I/O for recently-written messages.
    /// Falls back to positional disk read if the record is not cached.
    /// Reference: golang/nats-server/server/filestore.go:8140 (loadMsgs / msgFromBufEx).
    /// </summary>
    /// <param name="sequence">The sequence number to read.</param>
    /// <returns>The decoded record, or null if not found or deleted.</returns>
    /// <exception cref="EndOfStreamException">The file ends before the indexed record does.</exception>
    public MessageRecord? Read(ulong sequence)
    {
        // Use a write lock because we may need to flush pending writes.
        _lock.EnterWriteLock();
        try
        {
            if (_deleted.Contains(sequence))
                return null;

            // Go: check cache first (msgBlock.cache lookup).
            // Reference: golang/nats-server/server/filestore.go:8155 (cache hit path).
            if (_cache is not null && _cache.TryGetValue(sequence, out var cached))
                return cached;

            if (!_index.TryGetValue(sequence, out var entry))
                return null;

            // Flush pending writes so disk reads see the latest data.
            FlushPendingLocked();

            var buffer = new byte[entry.Length];
            ReadExactly(buffer, entry.Offset);
            return MessageRecord.Decode(buffer);
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Soft-deletes a message by sequence number. Re-encodes the record on disk
    /// with the deleted flag set (and updated checksum) so the deletion survives recovery.
    /// Also evicts the sequence from the write cache.
    /// When <paramref name="secureErase"/> is true, the payload bytes inside
    /// the encoded record are overwritten with cryptographically random data before
    /// the record is re-written — ensuring the original payload is unrecoverable.
    /// Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
    /// </summary>
    /// <param name="sequence">The sequence number to delete.</param>
    /// <param name="secureErase">
    /// When true, payload bytes are filled with random data before the
    /// record is written back. Defaults to false.
    /// </param>
    /// <returns>True if the message was deleted; false if already deleted or not found.</returns>
    /// <exception cref="EndOfStreamException">The file ends before the indexed record does.</exception>
    public bool Delete(ulong sequence, bool secureErase = false)
    {
        _lock.EnterWriteLock();
        try
        {
            if (!_index.TryGetValue(sequence, out var entry))
                return false;

            if (_deleted.Contains(sequence))
                return false;

            // Flush any pending writes so the record is on disk before we read it back.
            FlushPendingLocked();

            // Read the existing record, re-encode with Deleted flag, write back in-place.
            // The encoded size doesn't change (only flags byte + checksum differ).
            var buffer = new byte[entry.Length];
            ReadExactly(buffer, entry.Offset);
            var record = MessageRecord.Decode(buffer);

            ReadOnlyMemory<byte> payload = record.Payload;
            if (secureErase && payload.Length > 0)
            {
                // Go: eraseMsg — overwrite payload region with random bytes so the
                // original content is unrecoverable from disk.
                // Reference: golang/nats-server/server/filestore.go:5890 (eraseMsg).
                var randomPayload = new byte[payload.Length];
                RandomNumberGenerator.Fill(randomPayload);
                payload = randomPayload;
            }

            var deletedRecord = new MessageRecord
            {
                Sequence = record.Sequence,
                Subject = record.Subject,
                Headers = record.Headers,
                Payload = payload,
                Timestamp = record.Timestamp,
                Deleted = true,
            };

            RandomAccess.Write(_handle, MessageRecord.Encode(deletedRecord), entry.Offset);

            // Only mark the sequence deleted once the tombstone is on disk, so an I/O
            // failure above cannot leave memory and disk disagreeing about the record.
            _deleted.Add(sequence);

            // Evict from write cache — the record is now deleted.
            _cache?.Remove(sequence);

            return true;
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Clears the write cache, releasing memory. After this call, all reads will
    /// go to disk. Called when the block is sealed (no longer the active block)
    /// or under memory pressure.
    /// Reference: golang/nats-server/server/filestore.go — clearCache method on msgBlock.
    /// </summary>
    public void ClearCache()
    {
        _lock.EnterWriteLock();
        try { _cache = null; }
        finally { _lock.ExitWriteLock(); }
    }

    /// <summary>
    /// Flushes all buffered (pending) writes to disk in a single batch.
    /// Called by the background flush loop in FileStore, or synchronously on
    /// block seal / dispose to ensure all data reaches disk.
    /// Reference: golang/nats-server/server/filestore.go:7592 (flushPendingMsgsLocked).
    /// </summary>
    /// <returns>The number of bytes flushed (0 if nothing was pending or the block is disposed).</returns>
    public int FlushPending()
    {
        if (_disposed)
            return 0;

        try
        {
            _lock.EnterWriteLock();
        }
        catch (ObjectDisposedException)
        {
            // Block was disposed concurrently (e.g. during PurgeAsync).
            return 0;
        }

        try { return FlushPendingLocked(); }
        finally { _lock.ExitWriteLock(); }
    }

    /// <summary>
    /// Returns true if the given sequence number has been soft-deleted in this block.
    /// Reference: golang/nats-server/server/filestore.go — dmap (deleted map) lookup.
    /// </summary>
    /// <param name="sequence">The sequence number to test.</param>
    public bool IsDeleted(ulong sequence)
    {
        _lock.EnterReadLock();
        try { return _deleted.Contains(sequence); }
        finally { _lock.ExitReadLock(); }
    }

    /// <summary>
    /// Enumerates all non-deleted sequences in this block along with their subjects.
    /// Used by FileStore for subject-filtered operations (PurgeEx, SubjectsState, etc.).
    /// Reference: golang/nats-server/server/filestore.go — loadBlock, iterating non-deleted records.
    /// </summary>
    /// <remarks>
    /// The set of candidate records is snapshotted when enumeration starts; records
    /// are then yielded in ascending sequence order. No lock is held while a value is
    /// handed to the caller.
    /// </remarks>
    public IEnumerable<(ulong Sequence, string Subject)> EnumerateNonDeleted()
    {
        foreach (var (offset, length, seq) in SnapshotLiveEntries())
        {
            // Check the write cache first to avoid disk I/O.
            MessageRecord? cached = null;
            _lock.EnterReadLock();
            try { _cache?.TryGetValue(seq, out cached); }
            finally { _lock.ExitReadLock(); }

            if (cached is not null)
            {
                if (!cached.Deleted)
                    yield return (cached.Sequence, cached.Subject);
                continue;
            }

            var buffer = new byte[length];
            ReadExactly(buffer, offset);
            var record = MessageRecord.Decode(buffer);
            // The on-disk flag is re-checked because the record may have been
            // soft-deleted after the snapshot was taken.
            if (record is not null && !record.Deleted)
                yield return (record.Sequence, record.Subject);
        }
    }

    /// <summary>
    /// Flushes any pending buffered writes and then syncs the file to disk.
    /// </summary>
    public void Flush()
    {
        _lock.EnterWriteLock();
        try
        {
            FlushPendingLocked();
            _file.Flush(flushToDisk: true);
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Flushes pending writes, closes the file handle and releases resources.
    /// Safe to call more than once. The file and the lock are released even when
    /// the final flush fails; the flush exception still propagates to the caller.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
            return;
        _disposed = true;

        _lock.EnterWriteLock();
        try
        {
            try
            {
                // Flush pending buffered writes before closing.
                FlushPendingLocked();
                _file.Flush();
            }
            finally
            {
                _file.Dispose();
            }
        }
        finally
        {
            _lock.ExitWriteLock();
            _lock.Dispose();
        }
    }

    /// <summary>
    /// Encodes <paramref name="record"/>, queues it for a batched disk write at the
    /// current append position, and updates the index, checksum and counters.
    /// The caller must hold the write lock and is responsible for updating
    /// <c>_lastSequence</c> / <c>_nextSequence</c>.
    /// </summary>
    /// <param name="record">The record to append.</param>
    /// <param name="cacheRecord">When true the record is also stored in the write cache.</param>
    private void AppendLocked(MessageRecord record, bool cacheRecord)
    {
        var encoded = MessageRecord.Encode(record);
        var offset = _writeOffset;

        // Buffer the write for batched disk I/O — the background flush loop
        // in FileStore will coalesce pending writes.
        _pendingWrites.Add((encoded, offset));
        _pendingBytes += encoded.Length;
        _writeOffset = offset + encoded.Length;

        _index[record.Sequence] = (offset, encoded.Length);

        if (cacheRecord)
        {
            // Go: cache recently-written record to avoid disk reads on hot path.
            // Reference: golang/nats-server/server/filestore.go:6730 (cache population).
            _cache ??= new Dictionary<ulong, MessageRecord>();
            _cache[record.Sequence] = record;
        }

        // Go: msgBlock.lchk — capture checksum (last 8 bytes of encoded record).
        // Reference: golang/nats-server/server/filestore.go:2204 (lchk update on write)
        _lastChecksum ??= new byte[ChecksumLength];
        encoded.AsSpan(^ChecksumLength..).CopyTo(_lastChecksum);

        if (_totalWritten == 0)
            _firstSequence = record.Sequence;
        _totalWritten++;
    }

    /// <summary>
    /// Writes every queued record to its recorded file offset and empties the queue.
    /// The caller must hold the write lock.
    /// </summary>
    /// <returns>The number of bytes written.</returns>
    private int FlushPendingLocked()
    {
        if (_pendingWrites.Count == 0)
            return 0;

        foreach (var (data, offset) in _pendingWrites)
            RandomAccess.Write(_handle, data, offset);

        var flushed = _pendingBytes;
        _pendingWrites.Clear();
        _pendingBytes = 0;
        return flushed;
    }

    /// <summary>
    /// Flushes pending writes and captures the (offset, length, sequence) of every
    /// indexed record that is not in the deleted set, sorted by sequence.
    /// </summary>
    private List<(long Offset, int Length, ulong Seq)> SnapshotLiveEntries()
    {
        List<(long Offset, int Length, ulong Seq)> entries;

        // Write lock (not read lock) because pending writes may need flushing.
        _lock.EnterWriteLock();
        try
        {
            // Flush pending writes so disk reads see latest data.
            FlushPendingLocked();

            entries = new List<(long Offset, int Length, ulong Seq)>(_index.Count);
            foreach (var (seq, (offset, length)) in _index)
            {
                if (!_deleted.Contains(seq))
                    entries.Add((offset, length, seq));
            }
        }
        finally
        {
            _lock.ExitWriteLock();
        }

        // Sort by sequence for deterministic output.
        entries.Sort((a, b) => a.Seq.CompareTo(b.Seq));
        return entries;
    }

    /// <summary>
    /// Fills <paramref name="destination"/> from the block file starting at
    /// <paramref name="fileOffset"/>. A single positional read may return fewer
    /// bytes than requested, so this loops until the buffer is full.
    /// </summary>
    /// <exception cref="EndOfStreamException">End of file reached before the buffer was filled.</exception>
    private void ReadExactly(Span<byte> destination, long fileOffset)
    {
        var total = 0;
        while (total < destination.Length)
        {
            var read = RandomAccess.Read(_handle, destination[total..], fileOffset + total);
            if (read == 0)
            {
                throw new EndOfStreamException(
                    $"Block {BlockId}: unexpected end of file at offset {fileOffset + total} " +
                    $"({total} of {destination.Length} bytes read).");
            }

            total += read;
        }
    }

    /// <summary>
    /// Rebuilds the in-memory index by scanning all records in the block file.
    /// Uses <see cref="MessageRecord.MeasureRecord"/> to determine each record's
    /// size before decoding, so trailing data from subsequent records doesn't
    /// corrupt the checksum validation.
    /// </summary>
    /// <remarks>
    /// The file is read once and then sliced per record, instead of re-reading the
    /// remainder of the file for every record.
    /// </remarks>
    /// <exception cref="InvalidDataException">
    /// The file is too large to load, or a record reports a length that is not
    /// positive or extends past the end of the file.
    /// </exception>
    private void RebuildIndex()
    {
        var fileLength = _file.Length;
        if (fileLength > Array.MaxLength)
        {
            throw new InvalidDataException(
                $"Block {BlockId}: file is too large to recover in one pass ({fileLength} bytes).");
        }

        var data = new byte[(int)fileLength];
        ReadExactly(data, 0);

        var position = 0;
        ulong count = 0;

        while (position < data.Length)
        {
            var remaining = data.AsSpan(position);

            // Measure the first record's length, then decode only that slice.
            var recordLength = MessageRecord.MeasureRecord(remaining);
            if (recordLength <= 0 || recordLength > remaining.Length)
            {
                // A non-positive length would never advance; an oversized one is truncated data.
                throw new InvalidDataException(
                    $"Block {BlockId}: invalid record length {recordLength} at offset {position}.");
            }

            var record = MessageRecord.Decode(remaining[..recordLength]);

            _index[record.Sequence] = (position, recordLength);

            if (record.Deleted)
            {
                _deleted.Add(record.Sequence);
                // Empty subject = skip/tombstone record (from SkipMsg/SkipMsgs).
                if (string.IsNullOrEmpty(record.Subject))
                    _skipSequences.Add(record.Sequence);
            }

            if (count == 0)
                _firstSequence = record.Sequence;

            _lastSequence = record.Sequence;
            _nextSequence = record.Sequence + 1;
            count++;

            position += recordLength;
        }

        _totalWritten = count;
        _writeOffset = position;
        // Note: recovered blocks do not populate the write cache — reads go to disk.
        // The cache is only populated during active writes on the hot path.
    }

    // Current UTC time as Unix nanoseconds (millisecond resolution).
    private static long UnixNanosNow() =>
        DateTimeOffset.UtcNow.ToUnixTimeMilliseconds() * 1_000_000L;

    private static string BlockFilePath(string directoryPath, int blockId) =>
        Path.Combine(directoryPath, $"{blockId:D6}.blk");
}