// Source path: natsdotnet/src/NATS.Server/JetStream/Cluster/PlacementEngine.cs
// Listing metadata: 257 lines, 11 KiB, C#

namespace NATS.Server.JetStream.Cluster;
/// <summary>
/// Topology-aware peer selection for stream/consumer replica placement.
/// Go reference: jetstream_cluster.go:7212 selectPeerGroup.
/// </summary>
public static class PlacementEngine
{
    /// <summary>
    /// Default cost in bytes subtracted from AvailableStorage per assigned asset
    /// when computing weighted placement scores.
    /// Go reference: jetstream_cluster.go:7469 sort by avail then ns (number of streams).
    /// </summary>
    public const long DefaultAssetCostWeight = 1_073_741_824L; // 1 GiB

    /// <summary>
    /// Selects peers for a new replica group based on available nodes, tags, cluster
    /// affinity, HA asset limits, and weighted scoring.
    ///
    /// Selection pipeline:
    /// 1. Filter out unavailable peers.
    /// 2. Apply cluster affinity filter (case-insensitive cluster name match).
    /// 3. Apply required tag filter (peer must carry ALL required tags).
    /// 4. Apply exclude-tag filter (peer must carry NONE of the excluded tags).
    /// 5. Separate overloaded peers (CurrentAssets >= MaxAssetsPerPeer when MaxAssetsPerPeer > 0)
    ///    from preferred candidates.
    /// 6. Within each candidate group, sort by weighted score descending:
    ///    score = AvailableStorage - (CurrentAssets * assetCostWeight)
    /// 7. Apply UniqueTag constraint: greedily select from scored list, skipping any
    ///    peer whose tag for the UniqueTag prefix is already represented.
    ///    Overloaded peers are tried only after preferred candidates are exhausted.
    /// 8. Throw InvalidOperationException if fewer than replicas peers can be selected.
    /// </summary>
    /// <param name="groupName">RAFT group name being placed.</param>
    /// <param name="replicas">Required number of replicas/peers. Zero yields an empty group.</param>
    /// <param name="availablePeers">Available cluster peers considered for placement.</param>
    /// <param name="policy">Optional placement policy with cluster/tag constraints.</param>
    /// <param name="assetCostWeight">Per-asset storage penalty used in scoring.</param>
    /// <returns>
    /// A <see cref="RaftGroup"/> named <paramref name="groupName"/> whose peers are the
    /// selected peer IDs, best-scoring first.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="availablePeers"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="replicas"/> is negative.</exception>
    /// <exception cref="InvalidOperationException">
    /// Fewer than <paramref name="replicas"/> peers satisfy the policy.
    /// </exception>
    public static RaftGroup SelectPeerGroup(
        string groupName,
        int replicas,
        IReadOnlyList<PeerInfo> availablePeers,
        PlacementPolicy? policy = null,
        long assetCostWeight = DefaultAssetCostWeight)
    {
        // Validate up front so both selection paths fail the same way on bad input.
        ArgumentNullException.ThrowIfNull(availablePeers);
        if (replicas < 0)
            throw new ArgumentOutOfRangeException(nameof(replicas), replicas, "Replica count must not be negative.");

        // 1. Filter out unavailable peers.
        IEnumerable<PeerInfo> candidates = availablePeers.Where(p => p.Available);

        // 2. If policy has Cluster, filter to matching cluster.
        if (policy?.Cluster is { Length: > 0 } cluster)
            candidates = candidates.Where(p => string.Equals(p.Cluster, cluster, StringComparison.OrdinalIgnoreCase));

        // 3. If policy has Tags, filter to peers that have ALL required tags.
        if (policy?.Tags is { Count: > 0 } requiredTags)
            candidates = candidates.Where(p => requiredTags.All(tag => p.Tags.Contains(tag)));

        // 4. If policy has ExcludeTags, filter out peers with any of those tags.
        if (policy?.ExcludeTags is { Count: > 0 } excludeTags)
            candidates = candidates.Where(p => !excludeTags.Any(tag => p.Tags.Contains(tag)));

        // Materialize once; the filter chain above is deferred.
        var filtered = candidates.ToList();

        // 5. Separate preferred candidates from overloaded (HA-limited) ones.
        //    Overloaded peers are deprioritized but NOT excluded entirely — they serve
        //    as a fallback when no preferred options remain.
        //    Go reference: jetstream_cluster.go:7428 maxHaAssets check (deprioritize).
        var (preferred, overloaded) = SplitByHaLimit(filtered);

        // 6. Sort each group by weighted score descending.
        //    Go reference: jetstream_cluster.go:7469 sort by avail then ns.
        var sortedPreferred = SortByScore(preferred, assetCostWeight);
        var sortedOverloaded = SortByScore(overloaded, assetCostWeight);

        // 7. Apply UniqueTag constraint (if set) via greedy selection over the
        //    combined ordered list: preferred first, overloaded as fallback.
        //    Go reference: jetstream_cluster.go:7251 uniqueTagPrefix / checkUniqueTag.
        string? uniqueTagPrefix = policy?.UniqueTag is { Length: > 0 } ut ? ut : null;

        List<string> selected;
        if (uniqueTagPrefix is not null)
        {
            selected = SelectWithUniqueTag(
                sortedPreferred, sortedOverloaded, replicas, uniqueTagPrefix);
        }
        else
        {
            // No unique-tag constraint — just take the top N from the combined list.
            int eligible = sortedPreferred.Count + sortedOverloaded.Count;
            if (eligible < replicas)
                ThrowInsufficientPeers(replicas, eligible);

            selected = sortedPreferred
                .Concat(sortedOverloaded)
                .Take(replicas)
                .Select(p => p.PeerId)
                .ToList();
        }

        // 8. Return RaftGroup with selected peer IDs.
        return new RaftGroup
        {
            Name = groupName,
            Peers = selected,
        };
    }

    // ---------------------------------------------------------------
    // Private helpers
    // ---------------------------------------------------------------

    /// <summary>
    /// Splits candidates into (preferred, overloaded) where overloaded peers have
    /// CurrentAssets >= MaxAssetsPerPeer (when MaxAssetsPerPeer > 0).
    /// Relative input order is kept within each returned list.
    /// </summary>
    private static (List<PeerInfo> preferred, List<PeerInfo> overloaded) SplitByHaLimit(
        List<PeerInfo> peers)
    {
        var preferred = new List<PeerInfo>(peers.Count);
        var overloaded = new List<PeerInfo>();

        foreach (var p in peers)
        {
            bool isOverloaded = p.MaxAssetsPerPeer > 0 && p.CurrentAssets >= p.MaxAssetsPerPeer;
            if (isOverloaded)
                overloaded.Add(p);
            else
                preferred.Add(p);
        }

        return (preferred, overloaded);
    }

    /// <summary>
    /// Sorts peers by weighted score descending.
    /// score = AvailableStorage - (CurrentAssets * assetCostWeight)
    /// The score is computed in 128-bit arithmetic: a 64-bit computation can wrap
    /// around for large or negative weights and silently invert the ranking.
    /// </summary>
    private static List<PeerInfo> SortByScore(List<PeerInfo> peers, long assetCostWeight) =>
        [.. peers.OrderByDescending(p => (Int128)p.AvailableStorage - ((Int128)p.CurrentAssets * assetCostWeight))];

    /// <summary>
    /// Greedy unique-tag selection.
    /// Iterates over preferred peers first, then overloaded peers as fallback.
    /// For each candidate, the peer is accepted only if no previously selected peer
    /// carries the same tag starting with <paramref name="uniqueTagPrefix"/>
    /// (tags are compared case-insensitively).
    /// A peer without any matching tag is rejected (same as Go behaviour: the prefix
    /// must be present).
    /// Go reference: jetstream_cluster.go:7263 checkUniqueTag.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Fewer than <paramref name="replicas"/> peers could be selected under the constraint.
    /// </exception>
    private static List<string> SelectWithUniqueTag(
        List<PeerInfo> preferred,
        List<PeerInfo> overloaded,
        int replicas,
        string uniqueTagPrefix)
    {
        var seenTagValues = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Bound the initial capacity by the candidate count so an oversized replica
        // request cannot force a huge allocation before the shortage is reported.
        var selected = new List<string>(Math.Min(replicas, preferred.Count + overloaded.Count));

        TrySelect(preferred);
        if (selected.Count < replicas)
            TrySelect(overloaded);

        // Here the reported count is the number of peers that could be selected
        // under the unique-tag constraint.
        if (selected.Count < replicas)
            ThrowInsufficientPeers(replicas, selected.Count);

        return selected;

        void TrySelect(List<PeerInfo> source)
        {
            foreach (var peer in source)
            {
                if (selected.Count >= replicas)
                    break;

                // Find the first tag on this peer that starts with the unique prefix.
                // Go reference: jetstream_cluster.go:7264 strings.HasPrefix(t, uniqueTagPrefix)
                string? tagValue = peer.Tags.FirstOrDefault(
                    tag => tag.StartsWith(uniqueTagPrefix, StringComparison.OrdinalIgnoreCase));

                // Peer has no matching tag → reject (same as Go: "unique prefix not present").
                if (tagValue is null)
                    continue;

                // Tag already claimed by a previously selected peer → reject.
                if (!seenTagValues.Add(tagValue))
                    continue;

                selected.Add(peer.PeerId);
            }
        }
    }

    /// <summary>
    /// Throws the <see cref="InvalidOperationException"/> reported when the replica
    /// count cannot be satisfied.
    /// </summary>
    private static void ThrowInsufficientPeers(int required, int available) =>
        throw new InvalidOperationException(
            $"Not enough peers available to satisfy replica count {required}. " +
            $"Available after policy filtering: {available}.");
}
/// <summary>
/// A cluster node as presented to the placement engine when replica hosts are chosen.
/// Go reference: jetstream_cluster.go peerInfo — peer.id, peer.offline, peer.storage,
/// peer HAAssets, peer streams count.
/// </summary>
public sealed class PeerInfo
{
    /// <summary>Identifier of this peer; must be supplied at construction.</summary>
    public required string PeerId { get; init; }

    /// <summary>Name of the cluster this peer belongs to. Empty by default.</summary>
    public string Cluster { get; set; } = "";

    /// <summary>
    /// Tags advertised by this peer. The default set compares tags with
    /// ordinal, case-insensitive semantics.
    /// </summary>
    public HashSet<string> Tags { get; init; } = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    /// <summary>True when the peer may receive new assignments. Defaults to true.</summary>
    public bool Available { get; set; } = true;

    /// <summary>
    /// Storage still available on this peer for new assets.
    /// Defaults to <see cref="long.MaxValue"/>.
    /// </summary>
    public long AvailableStorage { get; set; } = long.MaxValue;

    /// <summary>
    /// Count of assets (streams/consumers) currently assigned to this peer.
    /// Go reference: jetstream_cluster.go:7311 peerStreams / peerHA maps.
    /// </summary>
    public int CurrentAssets { get; set; }

    /// <summary>
    /// HA asset ceiling after which this peer is deprioritized for placement.
    /// 0 means unlimited (no deprioritization).
    /// Go reference: jetstream_cluster.go:7328 maxHaAssets.
    /// </summary>
    public int MaxAssetsPerPeer { get; set; }
}
/// <summary>
/// Constraints a caller may impose on replica placement: cluster affinity,
/// required and excluded tags, and unique-tag enforcement.
/// Go reference: jetstream_cluster.go Placement struct — cluster, tags;
/// server opts JetStreamUniqueTag.
/// </summary>
public sealed class PlacementPolicy
{
    /// <summary>Cluster the replicas should be placed in, if any.</summary>
    public string? Cluster { get; set; }

    /// <summary>Tags a candidate peer must carry — every one of them.</summary>
    public HashSet<string>? Tags { get; set; }

    /// <summary>Tags that rule a candidate peer out when it carries any of them.</summary>
    public HashSet<string>? ExcludeTags { get; set; }

    /// <summary>
    /// Tag prefix used to enforce AZ/rack uniqueness across replicas.
    /// When set, no two replicas may share the same value for a tag that starts
    /// with this prefix (e.g., "az" matches "az:us-east-1a").
    /// Null or empty string disables the constraint.
    /// Go reference: jetstream_cluster.go:7251 JetStreamUniqueTag / uniqueTagPrefix.
    /// </summary>
    public string? UniqueTag { get; set; }
}