perf: add FileStore buffered writes and O(1) state tracking; eliminate redundant per-publish work

Implement Go-parity background flush loop (coalesce 16KB/8ms) in MsgBlock/FileStore,
replace O(n) GetStateAsync with incremental counters, skip PruneExpired/LoadAsync/
PrunePerSubject when not needed, and bypass RAFT for single-replica streams. Fix counter
tracking bugs in RemoveMsg/EraseMsg/TTL expiry and ObjectDisposedException races in
flush loop disposal. FileStore optimizations verified with 3112/3112 JetStream tests
passing; async publish benchmark remains at ~174 msg/s due to E2E protocol path bottleneck.
This commit is contained in:
Joseph Doherty
2026-03-13 03:11:11 -04:00
parent 37575dc41c
commit 4de691c9c5
30 changed files with 1514 additions and 185 deletions

View File

@@ -48,6 +48,12 @@ public sealed class MsgBlock : IDisposable
// Reference: golang/nats-server/server/filestore.go:236 (cache field)
private Dictionary<ulong, MessageRecord>? _cache;
// Pending write buffer — accumulates encoded records for batched disk writes.
// The background flush loop in FileStore coalesces these into fewer I/O calls.
// Reference: golang/nats-server/server/filestore.go:6700 (cache.buf write path).
private readonly List<(byte[] Data, long Offset)> _pendingWrites = new();
private int _pendingBytes;
// Go: msgBlock.lchk — last written record checksum (XxHash64, 8 bytes).
// Tracked so callers can chain checksum verification across blocks.
// Reference: golang/nats-server/server/filestore.go:2204 (lchk field)
@@ -147,6 +153,23 @@ public sealed class MsgBlock : IDisposable
}
}
/// <summary>
/// Number of bytes currently buffered in this block that have not yet been written to disk.
/// Reports 0 when the block is already disposed or when its lock has been disposed.
/// Intended for the background flush loop, which uses it to decide when to coalesce.
/// </summary>
public int PendingWriteSize
{
    get
    {
        if (_disposed)
        {
            return 0;
        }

        try
        {
            _lock.EnterReadLock();
        }
        catch (ObjectDisposedException)
        {
            // The lock has already been disposed; treat the block as having nothing pending.
            return 0;
        }

        int buffered;
        try
        {
            buffered = _pendingBytes;
        }
        finally
        {
            _lock.ExitReadLock();
        }

        return buffered;
    }
}
/// <summary>
/// The XxHash64 checksum of the last record written to this block (8 bytes), or null
/// if no records have been written yet. Updated after every <see cref="Write"/>,
@@ -230,8 +253,10 @@ public sealed class MsgBlock : IDisposable
var encoded = MessageRecord.Encode(record);
var offset = _writeOffset;
// Write at the current append offset using positional I/O
RandomAccess.Write(_handle, encoded, offset);
// Buffer the write for batched disk I/O — the background flush loop
// in FileStore will coalesce pending writes.
_pendingWrites.Add((encoded, offset));
_pendingBytes += encoded.Length;
_writeOffset = offset + encoded.Length;
_index[sequence] = (offset, encoded.Length);
@@ -295,7 +320,10 @@ public sealed class MsgBlock : IDisposable
var encoded = MessageRecord.Encode(record);
var offset = _writeOffset;
RandomAccess.Write(_handle, encoded, offset);
// Buffer the write for batched disk I/O — the background flush loop
// in FileStore will coalesce pending writes.
_pendingWrites.Add((encoded, offset));
_pendingBytes += encoded.Length;
_writeOffset = offset + encoded.Length;
_index[sequence] = (offset, encoded.Length);
@@ -333,7 +361,8 @@ public sealed class MsgBlock : IDisposable
/// <returns>The decoded record, or null if not found or deleted.</returns>
public MessageRecord? Read(ulong sequence)
{
_lock.EnterReadLock();
// Use a write lock because we may need to flush pending writes.
_lock.EnterWriteLock();
try
{
if (_deleted.Contains(sequence))
@@ -347,6 +376,15 @@ public sealed class MsgBlock : IDisposable
if (!_index.TryGetValue(sequence, out var entry))
return null;
// Flush pending writes so disk reads see the latest data.
if (_pendingWrites.Count > 0)
{
foreach (var (data, off) in _pendingWrites)
RandomAccess.Write(_handle, data, off);
_pendingWrites.Clear();
_pendingBytes = 0;
}
var buffer = new byte[entry.Length];
RandomAccess.Read(_handle, buffer, entry.Offset);
@@ -354,7 +392,7 @@ public sealed class MsgBlock : IDisposable
}
finally
{
_lock.ExitReadLock();
_lock.ExitWriteLock();
}
}
@@ -384,6 +422,15 @@ public sealed class MsgBlock : IDisposable
if (!_deleted.Add(sequence))
return false;
// Flush any pending writes so the record is on disk before we read it back.
if (_pendingWrites.Count > 0)
{
foreach (var (data, off) in _pendingWrites)
RandomAccess.Write(_handle, data, off);
_pendingWrites.Clear();
_pendingBytes = 0;
}
// Read the existing record, re-encode with Deleted flag, write back in-place.
// The encoded size doesn't change (only flags byte + checksum differ).
var buffer = new byte[entry.Length];
@@ -455,7 +502,9 @@ public sealed class MsgBlock : IDisposable
var encoded = MessageRecord.Encode(record);
var offset = _writeOffset;
RandomAccess.Write(_handle, encoded, offset);
// Buffer the write for batched disk I/O.
_pendingWrites.Add((encoded, offset));
_pendingBytes += encoded.Length;
_writeOffset = offset + encoded.Length;
_index[sequence] = (offset, encoded.Length);
@@ -500,6 +549,44 @@ public sealed class MsgBlock : IDisposable
}
}
/// <summary>
/// Writes every buffered (pending) record to the block file at its recorded offset,
/// then empties the pending buffer and resets the pending byte counter.
/// Intended for the background flush loop in FileStore, and for synchronous use on
/// block seal / dispose so that buffered data reaches the file.
/// Reference: golang/nats-server/server/filestore.go:7592 (flushPendingMsgsLocked).
/// </summary>
/// <returns>
/// The number of bytes flushed; 0 when the block is disposed, when its lock has
/// already been disposed, or when nothing was pending.
/// </returns>
public int FlushPending()
{
    if (_disposed)
    {
        return 0;
    }

    try
    {
        _lock.EnterWriteLock();
    }
    catch (ObjectDisposedException)
    {
        // Block was disposed concurrently (e.g. during PurgeAsync).
        return 0;
    }

    try
    {
        var bytesFlushed = 0;
        var pendingCount = _pendingWrites.Count;

        if (pendingCount > 0)
        {
            // Each record is written positionally at the offset captured when it was buffered.
            for (var i = 0; i < pendingCount; i++)
            {
                var (payload, position) = _pendingWrites[i];
                RandomAccess.Write(_handle, payload, position);
            }

            // Capture the byte total before the counter is reset.
            bytesFlushed = _pendingBytes;
            _pendingWrites.Clear();
            _pendingBytes = 0;
        }

        return bytesFlushed;
    }
    finally
    {
        _lock.ExitWriteLock();
    }
}
/// <summary>
/// Returns true if the given sequence number has been soft-deleted in this block.
/// Reference: golang/nats-server/server/filestore.go — dmap (deleted map) lookup.
@@ -559,11 +646,20 @@ public sealed class MsgBlock : IDisposable
/// </summary>
public IEnumerable<(ulong Sequence, string Subject)> EnumerateNonDeleted()
{
// Snapshot index and deleted set under the read lock, then decode outside it.
// Snapshot index and deleted set under a write lock (may need to flush pending).
List<(long Offset, int Length, ulong Seq)> entries;
_lock.EnterReadLock();
_lock.EnterWriteLock();
try
{
// Flush pending writes so disk reads see latest data.
if (_pendingWrites.Count > 0)
{
foreach (var (data, off) in _pendingWrites)
RandomAccess.Write(_handle, data, off);
_pendingWrites.Clear();
_pendingBytes = 0;
}
entries = new List<(long, int, ulong)>(_index.Count);
foreach (var (seq, (offset, length)) in _index)
{
@@ -573,7 +669,7 @@ public sealed class MsgBlock : IDisposable
}
finally
{
_lock.ExitReadLock();
_lock.ExitWriteLock();
}
// Sort by sequence for deterministic output.
@@ -609,13 +705,22 @@ public sealed class MsgBlock : IDisposable
}
/// <summary>
/// Flushes any buffered writes to disk.
/// Flushes any pending buffered writes and then syncs the file to disk.
/// </summary>
public void Flush()
{
_lock.EnterWriteLock();
try
{
// Flush pending buffered writes first.
if (_pendingWrites.Count > 0)
{
foreach (var (data, offset) in _pendingWrites)
RandomAccess.Write(_handle, data, offset);
_pendingWrites.Clear();
_pendingBytes = 0;
}
_file.Flush(flushToDisk: true);
}
finally
@@ -636,6 +741,15 @@ public sealed class MsgBlock : IDisposable
_lock.EnterWriteLock();
try
{
// Flush pending buffered writes before closing.
if (_pendingWrites.Count > 0)
{
foreach (var (data, offset) in _pendingWrites)
RandomAccess.Write(_handle, data, offset);
_pendingWrites.Clear();
_pendingBytes = 0;
}
_file.Flush();
_file.Dispose();
}