perf: optimize fan-out hot path and switch benchmarks to Release build
Round 9 optimizations targeting per-delivery overhead:
- Switch benchmark harness from Debug to Release build (biggest impact: durable fetch 0.42x→0.92x, request-reply to parity)
- Batch server-wide stats after fan-out loop (2 Interlocked per delivery → 2 per publish)
- Guard auto-unsub tracking with MaxMessages > 0 (skip Interlocked in common case)
- Cache SID as ASCII bytes on Subscription (avoid per-delivery encoding)
- Pre-encode subject bytes once before fan-out loop (avoid N encodings)
- Add 1-element subject string cache in ProcessPub (avoid repeated alloc)
- Remove Interlocked from SubList.Match stats counters (approximate is fine)
- Extract WriteMessageToBuffer helper for both string and span overloads
This commit is contained in:
@@ -702,6 +702,10 @@ public sealed class NatsClient : INatsClient, IDisposable
|
||||
server.OnLocalUnsubscription(Account?.Name ?? Account.GlobalAccountName, sub.Subject, sub.Queue);
|
||||
}
|
||||
|
||||
// 1-element subject string cache: avoids allocating identical strings on repeated publishes.
|
||||
private string? _lastSubjectStr;
|
||||
private byte[]? _lastSubjectBytes;
|
||||
|
||||
private void ProcessPub(ParsedCommandView cmd, ref long localInMsgs, ref long localInBytes)
|
||||
{
|
||||
var payloadMemory = cmd.GetPayloadMemory();
|
||||
@@ -717,7 +721,19 @@ public sealed class NatsClient : INatsClient, IDisposable
|
||||
return;
|
||||
}
|
||||
|
||||
var subject = Encoding.ASCII.GetString(cmd.Subject.Span);
|
||||
// 1-element cache: reuse string when publishing to the same subject repeatedly.
|
||||
var subjectSpan = cmd.Subject.Span;
|
||||
string subject;
|
||||
if (_lastSubjectBytes != null && subjectSpan.SequenceEqual(_lastSubjectBytes))
|
||||
{
|
||||
subject = _lastSubjectStr!;
|
||||
}
|
||||
else
|
||||
{
|
||||
subject = Encoding.ASCII.GetString(subjectSpan);
|
||||
_lastSubjectStr = subject;
|
||||
_lastSubjectBytes = subjectSpan.ToArray();
|
||||
}
|
||||
|
||||
// Pedantic mode: validate publish subject
|
||||
if (ClientOpts?.Pedantic == true && !SubjectMatch.IsValidPublishSubject(subject))
|
||||
@@ -786,12 +802,9 @@ public sealed class NatsClient : INatsClient, IDisposable
|
||||
public void SendMessageNoFlush(string subject, string sid, string? replyTo,
|
||||
ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload)
|
||||
{
|
||||
// Batch per-client stats (single thread writes these during delivery).
|
||||
// Server-wide stats use Interlocked since multiple threads update them.
|
||||
// Per-client stats only — server-wide stats are batched at the ProcessMessage level.
|
||||
OutMsgs++;
|
||||
OutBytes += payload.Length + headers.Length;
|
||||
Interlocked.Increment(ref _serverStats.OutMsgs);
|
||||
Interlocked.Add(ref _serverStats.OutBytes, payload.Length + headers.Length);
|
||||
|
||||
// Format MSG header on the stack (no heap allocation).
|
||||
// Go reference: client.go msgHeader — formats into per-client 1KB scratch buffer (msgb).
|
||||
@@ -845,14 +858,22 @@ public sealed class NatsClient : INatsClient, IDisposable
|
||||
headerBuf[pos++] = (byte)'\r';
|
||||
headerBuf[pos++] = (byte)'\n';
|
||||
|
||||
// Write header + body + CRLF directly into the per-client buffer under lock.
|
||||
// Go reference: client.go queueOutbound — appends slice refs under client.mu.
|
||||
var totalLen = pos + headers.Length + payload.Length + 2;
|
||||
WriteMessageToBuffer(headerBuf[..pos], headers, payload);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes the formatted MSG/HMSG header line, headers, payload, and trailing CRLF
|
||||
/// into the per-client direct buffer under lock.
|
||||
/// Go reference: client.go queueOutbound — appends slice refs under client.mu.
|
||||
/// </summary>
|
||||
private void WriteMessageToBuffer(ReadOnlySpan<byte> msgHeader,
|
||||
ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload)
|
||||
{
|
||||
var totalLen = msgHeader.Length + headers.Length + payload.Length + 2;
|
||||
var lockTaken = false;
|
||||
_directBufLock.Enter(ref lockTaken);
|
||||
try
|
||||
{
|
||||
// Grow buffer if needed
|
||||
var needed = _directBufUsed + totalLen;
|
||||
if (needed > _directBuf.Length)
|
||||
{
|
||||
@@ -864,12 +885,10 @@ public sealed class NatsClient : INatsClient, IDisposable
|
||||
|
||||
var dst = _directBuf.AsSpan(_directBufUsed);
|
||||
|
||||
// Header
|
||||
headerBuf[..pos].CopyTo(dst);
|
||||
_directBufUsed += pos;
|
||||
msgHeader.CopyTo(dst);
|
||||
_directBufUsed += msgHeader.Length;
|
||||
dst = _directBuf.AsSpan(_directBufUsed);
|
||||
|
||||
// Headers (HMSG)
|
||||
if (headers.Length > 0)
|
||||
{
|
||||
headers.Span.CopyTo(dst);
|
||||
@@ -877,14 +896,12 @@ public sealed class NatsClient : INatsClient, IDisposable
|
||||
dst = _directBuf.AsSpan(_directBufUsed);
|
||||
}
|
||||
|
||||
// Payload
|
||||
if (payload.Length > 0)
|
||||
{
|
||||
payload.Span.CopyTo(dst);
|
||||
_directBufUsed += payload.Length;
|
||||
}
|
||||
|
||||
// Trailing CRLF
|
||||
_directBuf[_directBufUsed++] = (byte)'\r';
|
||||
_directBuf[_directBufUsed++] = (byte)'\n';
|
||||
}
|
||||
@@ -907,6 +924,65 @@ public sealed class NatsClient : INatsClient, IDisposable
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Fast-path overload accepting pre-encoded subject and SID bytes to avoid
/// per-delivery ASCII encoding in fan-out scenarios. Formats the MSG/HMSG
/// control line into a scratch buffer and passes it, together with the headers
/// and payload, to <c>WriteMessageToBuffer</c>.
/// </summary>
/// <param name="subjectBytes">Subject already encoded as bytes; copied verbatim into the control line.</param>
/// <param name="sidBytes">Subscription ID already encoded as bytes; copied verbatim into the control line.</param>
/// <param name="replyTo">Optional reply subject; ASCII-encoded into the control line when not null.</param>
/// <param name="headers">Header block; a non-empty value selects the HMSG form.</param>
/// <param name="payload">Message body.</param>
public void SendMessageNoFlush(ReadOnlySpan<byte> subjectBytes, ReadOnlySpan<byte> sidBytes, string? replyTo,
    ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload)
{
    // Only the per-client counters are updated in this method.
    OutMsgs++;
    OutBytes += payload.Length + headers.Length;

    const int stackScratchSize = 512;
    const int maxInt32Digits = 10;

    // Worst-case control-line length: verb ("HMSG " = 5), subject + space, sid + space,
    // optional reply + space (ASCII encoding emits at most one byte per UTF-16 code unit),
    // up to two decimal sizes separated by a space, and the trailing CRLF.
    // Without this check a long subject/reply overruns the fixed scratch buffer and
    // Span.CopyTo / Encoding.GetBytes throw ArgumentException.
    int maxHeaderLen = 5
        + subjectBytes.Length + 1
        + sidBytes.Length + 1
        + (replyTo != null ? replyTo.Length + 1 : 0)
        + maxInt32Digits + 1 + maxInt32Digits
        + 2;

    // Common case stays on the stack; oversized control lines fall back to the heap.
    Span<byte> headerBuf = maxHeaderLen <= stackScratchSize
        ? stackalloc byte[stackScratchSize]
        : new byte[maxHeaderLen];
    int pos = 0;

    if (headers.Length > 0)
    {
        "HMSG "u8.CopyTo(headerBuf);
        pos = 5;
    }
    else
    {
        "MSG "u8.CopyTo(headerBuf);
        pos = 4;
    }

    subjectBytes.CopyTo(headerBuf[pos..]);
    pos += subjectBytes.Length;
    headerBuf[pos++] = (byte)' ';

    sidBytes.CopyTo(headerBuf[pos..]);
    pos += sidBytes.Length;
    headerBuf[pos++] = (byte)' ';

    if (replyTo != null)
    {
        pos += Encoding.ASCII.GetBytes(replyTo, headerBuf[pos..]);
        headerBuf[pos++] = (byte)' ';
    }

    if (headers.Length > 0)
    {
        // HMSG carries the header length followed by the total (headers + payload) length.
        int totalSize = headers.Length + payload.Length;
        headers.Length.TryFormat(headerBuf[pos..], out int written);
        pos += written;
        headerBuf[pos++] = (byte)' ';
        totalSize.TryFormat(headerBuf[pos..], out written);
        pos += written;
    }
    else
    {
        payload.Length.TryFormat(headerBuf[pos..], out int written);
        pos += written;
    }

    headerBuf[pos++] = (byte)'\r';
    headerBuf[pos++] = (byte)'\n';

    WriteMessageToBuffer(headerBuf[..pos], headers, payload);
}
|
||||
|
||||
/// <summary>
|
||||
/// Signals the write loop to flush buffered data. Call once after batching
|
||||
/// multiple <see cref="SendMessageNoFlush"/> calls to the same client.
|
||||
|
||||
@@ -1437,6 +1437,10 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
|
||||
var subList = sender.Account?.SubList ?? _globalAccount.SubList;
|
||||
var result = subList.Match(subject);
|
||||
var delivered = false;
|
||||
int deliveredCount = 0;
|
||||
|
||||
// Pre-encode subject bytes once for all fan-out deliveries (one alloc per publish, not per delivery).
|
||||
var subjectBytes = Encoding.ASCII.GetBytes(subject);
|
||||
|
||||
// Per-client deferred flush: collect unique clients during fan-out, signal each once.
|
||||
// Go reference: client.go:3905 addToPCD / client.go:1324 flushClients.
|
||||
@@ -1444,13 +1448,15 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
|
||||
pcd.Clear();
|
||||
|
||||
// Deliver to plain subscribers
|
||||
var messageSize = payload.Length + headers.Length;
|
||||
foreach (var sub in result.PlainSubs)
|
||||
{
|
||||
if (sub.Client == null || sub.Client == sender && !(sender.ClientOpts?.Echo ?? true))
|
||||
continue;
|
||||
|
||||
DeliverMessage(sub, subject, replyTo, headers, payload, pcd);
|
||||
DeliverMessage(sub, subjectBytes, sub.SidBytes, subject, replyTo, headers, payload, pcd);
|
||||
delivered = true;
|
||||
deliveredCount++;
|
||||
}
|
||||
|
||||
// Deliver to one member of each queue group (round-robin)
|
||||
@@ -1470,8 +1476,9 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
|
||||
var sub = queueGroup[(idx + attempt) % queueGroup.Length];
|
||||
if (sub.Client != null && (sub.Client != sender || (sender.ClientOpts?.Echo ?? true)))
|
||||
{
|
||||
DeliverMessage(sub, subject, replyTo, headers, payload, pcd);
|
||||
DeliverMessage(sub, subjectBytes, sub.SidBytes, subject, replyTo, headers, payload, pcd);
|
||||
delivered = true;
|
||||
deliveredCount++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1483,14 +1490,22 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
|
||||
{
|
||||
if (sub.Client != null && sub.Client != sender)
|
||||
{
|
||||
DeliverMessage(sub, subject, replyTo, headers, payload, pcd);
|
||||
DeliverMessage(sub, subjectBytes, sub.SidBytes, subject, replyTo, headers, payload, pcd);
|
||||
delivered = true;
|
||||
deliveredCount++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Batch server-wide stats once per publish (instead of per-delivery Interlocked ops).
|
||||
if (deliveredCount > 0)
|
||||
{
|
||||
Interlocked.Add(ref _stats.OutMsgs, (long)deliveredCount);
|
||||
Interlocked.Add(ref _stats.OutBytes, (long)messageSize * deliveredCount);
|
||||
}
|
||||
|
||||
// Flush all unique clients once after fan-out.
|
||||
// Go reference: client.go:1324 flushClients — iterates pcd map, one signal per client.
|
||||
foreach (var client in pcd)
|
||||
@@ -1794,6 +1809,54 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
|
||||
pcd.Clear();
|
||||
}
|
||||
|
||||
/// <summary>
/// Delivers one message to a subscription's client, using the pre-encoded subject and
/// SID bytes when the client is a <see cref="NatsClient"/> and a deferred-flush set is supplied.
/// Used by ProcessMessage fan-out loop.
/// </summary>
/// <param name="sub">Target subscription; nothing happens when it has no client.</param>
/// <param name="subjectBytes">Pre-encoded subject bytes for the fast path.</param>
/// <param name="sidBytes">Pre-encoded SID bytes for the fast path.</param>
/// <param name="subject">Subject string, used for the permission check and the non-fast-path sends.</param>
/// <param name="replyTo">Optional reply subject.</param>
/// <param name="headers">Message header block.</param>
/// <param name="payload">Message body.</param>
/// <param name="pcd">
/// When supplied, the message is queued via <c>SendMessageNoFlush</c> and the client is added
/// to this set; otherwise <c>SendMessage</c> is called.
/// </param>
private void DeliverMessage(Subscription sub, ReadOnlySpan<byte> subjectBytes, ReadOnlySpan<byte> sidBytes,
    string subject, string? replyTo,
    ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload,
    HashSet<INatsClient>? pcd = null)
{
    var target = sub.Client;
    if (target is null)
    {
        return;
    }

    // Auto-unsub: the counter is only touched when a limit is configured.
    if (sub.MaxMessages > 0 && Interlocked.Increment(ref sub.MessageCount) > sub.MaxMessages)
    {
        // Limit exceeded: drop the subscription from the sublist and the client, deliver nothing.
        var owningSubList = target.Account?.SubList ?? _globalAccount.SubList;
        owningSubList.Remove(sub);
        target.RemoveSubscription(sub.Sid);
        return;
    }

    // Delivery permission filter.
    if (target.Permissions?.IsDeliveryAllowed(subject) == false)
    {
        return;
    }

    if (pcd == null)
    {
        target.SendMessage(subject, sub.Sid, replyTo, headers, payload);
    }
    else
    {
        if (target is NatsClient natsClient)
        {
            natsClient.SendMessageNoFlush(subjectBytes, sidBytes, replyTo, headers, payload);
        }
        else
        {
            target.SendMessageNoFlush(subject, sub.Sid, replyTo, headers, payload);
        }

        pcd.Add(target);
    }

    // Register the reply subject with the response tracker when the client
    // is not otherwise allowed to publish to it.
    if (replyTo != null
        && target.Permissions?.ResponseTracker != null
        && target.Permissions.IsPublishAllowed(replyTo) == false)
    {
        target.Permissions.ResponseTracker.RegisterReply(replyTo);
    }
}
|
||||
|
||||
private void DeliverMessage(Subscription sub, string subject, string? replyTo,
|
||||
ReadOnlyMemory<byte> headers, ReadOnlyMemory<byte> payload,
|
||||
HashSet<INatsClient>? pcd = null)
|
||||
@@ -1801,24 +1864,22 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
|
||||
var client = sub.Client;
|
||||
if (client == null) return;
|
||||
|
||||
// Check auto-unsub
|
||||
var count = Interlocked.Increment(ref sub.MessageCount);
|
||||
if (sub.MaxMessages > 0 && count > sub.MaxMessages)
|
||||
// Auto-unsub: only track when a limit is set (common case is MaxMessages == 0).
|
||||
if (sub.MaxMessages > 0)
|
||||
{
|
||||
// Clean up exhausted subscription from trie and client tracking
|
||||
var subList = client.Account?.SubList ?? _globalAccount.SubList;
|
||||
subList.Remove(sub);
|
||||
client.RemoveSubscription(sub.Sid);
|
||||
return;
|
||||
var count = Interlocked.Increment(ref sub.MessageCount);
|
||||
if (count > sub.MaxMessages)
|
||||
{
|
||||
var subList = client.Account?.SubList ?? _globalAccount.SubList;
|
||||
subList.Remove(sub);
|
||||
client.RemoveSubscription(sub.Sid);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Deny-list delivery filter
|
||||
if (client.Permissions?.IsDeliveryAllowed(subject) == false)
|
||||
return;
|
||||
|
||||
// When pcd (per-client deferred flush) set is provided, queue data without
|
||||
// signaling the write loop. The caller flushes all unique clients once after
|
||||
// the fan-out loop. Go reference: client.go addToPCD / flushClients.
|
||||
if (pcd != null)
|
||||
{
|
||||
client.SendMessageNoFlush(subject, sub.Sid, replyTo, headers, payload);
|
||||
@@ -1829,7 +1890,6 @@ public sealed class NatsServer : IMessageRouter, ISubListAccess, IDisposable
|
||||
client.SendMessage(subject, sub.Sid, replyTo, headers, payload);
|
||||
}
|
||||
|
||||
// Track reply subject for response permissions
|
||||
if (replyTo != null && client.Permissions?.ResponseTracker != null)
|
||||
{
|
||||
if (client.Permissions.IsPublishAllowed(replyTo) == false)
|
||||
|
||||
@@ -554,7 +554,7 @@ public sealed class SubList : IDisposable
|
||||
|
||||
public SubListResult Match(string subject)
|
||||
{
|
||||
Interlocked.Increment(ref _matches);
|
||||
_matches++;
|
||||
var currentGen = Interlocked.Read(ref _generation);
|
||||
|
||||
_lock.EnterReadLock();
|
||||
@@ -562,7 +562,7 @@ public sealed class SubList : IDisposable
|
||||
{
|
||||
if (_cache != null && _cache.TryGetValue(subject, out var cached) && cached.Generation == currentGen)
|
||||
{
|
||||
Interlocked.Increment(ref _cacheHits);
|
||||
_cacheHits++;
|
||||
return cached.Result;
|
||||
}
|
||||
}
|
||||
@@ -581,7 +581,7 @@ public sealed class SubList : IDisposable
|
||||
currentGen = Interlocked.Read(ref _generation);
|
||||
if (_cache != null && _cache.TryGetValue(subject, out var cached) && cached.Generation == currentGen)
|
||||
{
|
||||
Interlocked.Increment(ref _cacheHits);
|
||||
_cacheHits++;
|
||||
return cached.Result;
|
||||
}
|
||||
|
||||
@@ -940,8 +940,8 @@ public sealed class SubList : IDisposable
|
||||
_lock.ExitReadLock();
|
||||
}
|
||||
|
||||
var matches = Interlocked.Read(ref _matches);
|
||||
var cacheHits = Interlocked.Read(ref _cacheHits);
|
||||
var matches = Volatile.Read(ref _matches);
|
||||
var cacheHits = Volatile.Read(ref _cacheHits);
|
||||
var hitRate = matches > 0 ? (double)cacheHits / matches : 0.0;
|
||||
|
||||
uint maxFanout = 0;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using System.Text;
|
||||
using NATS.Server;
|
||||
using NATS.Server.Imports;
|
||||
|
||||
@@ -5,9 +6,17 @@ namespace NATS.Server.Subscriptions;
|
||||
|
||||
public sealed class Subscription
|
||||
{
|
||||
private byte[]? _sidBytes;
|
||||
|
||||
public required string Subject { get; init; }
|
||||
public string? Queue { get; init; }
|
||||
public required string Sid { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Pre-encoded ASCII bytes of the SID, cached to avoid per-delivery encoding.
|
||||
/// </summary>
|
||||
public byte[] SidBytes => _sidBytes ??= Encoding.ASCII.GetBytes(Sid);
|
||||
|
||||
public long MessageCount; // Interlocked
|
||||
public long MaxMessages; // 0 = unlimited
|
||||
public INatsClient? Client { get; set; }
|
||||
|
||||
Reference in New Issue
Block a user