257 lines
11 KiB
C#
257 lines
11 KiB
C#
namespace NATS.Server.JetStream.Cluster;
|
|
|
|
/// <summary>
/// Topology-aware peer selection for stream/consumer replica placement.
/// Go reference: jetstream_cluster.go:7212 selectPeerGroup.
/// </summary>
public static class PlacementEngine
{
    /// <summary>
    /// Default cost in bytes subtracted from AvailableStorage per assigned asset
    /// when computing weighted placement scores.
    /// Go reference: jetstream_cluster.go:7469 sort by avail then ns (number of streams).
    /// </summary>
    public const long DefaultAssetCostWeight = 1_073_741_824L; // 1 GiB

    /// <summary>
    /// Selects peers for a new replica group based on available nodes, tags, cluster
    /// affinity, HA asset limits, and weighted scoring.
    /// </summary>
    /// <remarks>
    /// Selection pipeline:
    /// <list type="number">
    /// <item><description>Filter out unavailable peers.</description></item>
    /// <item><description>Apply the cluster affinity filter (case-insensitive match).</description></item>
    /// <item><description>Apply the required-tag filter (a peer must carry every tag).</description></item>
    /// <item><description>Apply the exclude-tag filter (a peer must carry none of the tags).</description></item>
    /// <item><description>
    /// Separate overloaded peers (MaxAssetsPerPeer is positive and CurrentAssets has
    /// reached it) from preferred candidates.
    /// </description></item>
    /// <item><description>
    /// Within each group, sort by weighted score descending:
    /// score = AvailableStorage - (CurrentAssets * assetCostWeight).
    /// </description></item>
    /// <item><description>
    /// Apply the UniqueTag constraint, if set: greedily walk the scored list and skip any
    /// peer whose unique-prefixed tag is already represented. Overloaded peers are
    /// tried only after preferred candidates are exhausted.
    /// </description></item>
    /// <item><description>Fail if fewer than <paramref name="replicas"/> peers can be selected.</description></item>
    /// </list>
    /// </remarks>
    /// <param name="groupName">RAFT group name being placed.</param>
    /// <param name="replicas">Required number of replicas/peers. Zero yields an empty group.</param>
    /// <param name="availablePeers">Available cluster peers considered for placement.</param>
    /// <param name="policy">Optional placement policy with cluster/tag constraints.</param>
    /// <param name="assetCostWeight">Per-asset storage penalty used in scoring.</param>
    /// <returns>A group named <paramref name="groupName"/> holding the selected peer IDs in rank order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="availablePeers"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="replicas"/> is negative.</exception>
    /// <exception cref="InvalidOperationException">
    /// Fewer than <paramref name="replicas"/> peers satisfy the policy.
    /// </exception>
    public static RaftGroup SelectPeerGroup(
        string groupName,
        int replicas,
        IReadOnlyList<PeerInfo> availablePeers,
        PlacementPolicy? policy = null,
        long assetCostWeight = DefaultAssetCostWeight)
    {
        // Validate up front so both selection paths reject bad input the same way.
        ArgumentNullException.ThrowIfNull(availablePeers);
        ArgumentOutOfRangeException.ThrowIfNegative(replicas);

        // 1. Filter out unavailable peers.
        IEnumerable<PeerInfo> candidates = availablePeers.Where(p => p.Available);

        // 2. If policy has Cluster, filter to matching cluster.
        if (policy?.Cluster is { Length: > 0 } cluster)
            candidates = candidates.Where(p => string.Equals(p.Cluster, cluster, StringComparison.OrdinalIgnoreCase));

        // 3. If policy has Tags, filter to peers that have ALL required tags.
        if (policy?.Tags is { Count: > 0 } requiredTags)
            candidates = candidates.Where(p => requiredTags.All(tag => p.Tags.Contains(tag)));

        // 4. If policy has ExcludeTags, filter out peers with any of those tags.
        if (policy?.ExcludeTags is { Count: > 0 } excludeTags)
            candidates = candidates.Where(p => !excludeTags.Any(tag => p.Tags.Contains(tag)));

        // 5. Separate preferred candidates from overloaded (HA-limited) ones.
        //    Overloaded peers are deprioritized but NOT excluded entirely — they serve
        //    as a fallback when no preferred options remain.
        //    Go reference: jetstream_cluster.go:7428 maxHaAssets check (deprioritize).
        var (preferred, overloaded) = SplitByHaLimit(candidates.ToList());

        // 6. Sort each group by weighted score descending.
        //    Go reference: jetstream_cluster.go:7469 sort by avail then ns.
        var rankedPreferred = SortByScore(preferred, assetCostWeight);
        var rankedOverloaded = SortByScore(overloaded, assetCostWeight);

        // 7. Pick the peers: with a UniqueTag prefix use greedy unique-tag selection,
        //    otherwise take the top N of preferred-then-overloaded.
        //    Go reference: jetstream_cluster.go:7251 uniqueTagPrefix / checkUniqueTag.
        List<string> selected = policy?.UniqueTag is { Length: > 0 } uniqueTagPrefix
            ? SelectWithUniqueTag(rankedPreferred, rankedOverloaded, replicas, uniqueTagPrefix)
            : SelectTopRanked(rankedPreferred, rankedOverloaded, replicas);

        // 8. Return RaftGroup with selected peer IDs.
        return new RaftGroup
        {
            Name = groupName,
            Peers = selected,
        };
    }

    // ---------------------------------------------------------------
    // Private helpers
    // ---------------------------------------------------------------

    /// <summary>
    /// Splits candidates into (preferred, overloaded). A peer is overloaded when its
    /// MaxAssetsPerPeer is positive and CurrentAssets has reached or passed it.
    /// Input order is preserved within each list.
    /// </summary>
    private static (List<PeerInfo> preferred, List<PeerInfo> overloaded) SplitByHaLimit(
        List<PeerInfo> peers)
    {
        var preferred = new List<PeerInfo>(peers.Count);
        var overloaded = new List<PeerInfo>();

        foreach (var peer in peers)
        {
            bool atHaLimit = peer.MaxAssetsPerPeer > 0 && peer.CurrentAssets >= peer.MaxAssetsPerPeer;
            (atHaLimit ? overloaded : preferred).Add(peer);
        }

        return (preferred, overloaded);
    }

    /// <summary>
    /// Sorts peers by weighted score descending, where
    /// score = AvailableStorage - (CurrentAssets * assetCostWeight).
    /// </summary>
    /// <remarks>
    /// The score is computed in 128-bit arithmetic. AvailableStorage defaults to
    /// long.MaxValue, so a 64-bit subtraction could wrap around silently and rank a
    /// peer at the wrong end of the list. The sort is stable, so peers with equal
    /// scores keep their input order.
    /// </remarks>
    private static List<PeerInfo> SortByScore(List<PeerInfo> peers, long assetCostWeight) =>
        [.. peers.OrderByDescending(p => (Int128)p.AvailableStorage - ((Int128)p.CurrentAssets * assetCostWeight))];

    /// <summary>
    /// Selection without a unique-tag constraint: takes the first
    /// <paramref name="replicas"/> peers from preferred followed by overloaded.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The two lists together hold fewer than <paramref name="replicas"/> peers.
    /// </exception>
    private static List<string> SelectTopRanked(
        List<PeerInfo> preferred,
        List<PeerInfo> overloaded,
        int replicas)
    {
        int candidateCount = preferred.Count + overloaded.Count;
        if (candidateCount < replicas)
            ThrowInsufficientPeers(replicas, candidateCount);

        return preferred.Concat(overloaded)
            .Take(replicas)
            .Select(p => p.PeerId)
            .ToList();
    }

    /// <summary>
    /// Greedy unique-tag selection.
    /// Walks preferred peers first, then overloaded peers as fallback. A peer is
    /// accepted only if its first tag starting with <paramref name="uniqueTagPrefix"/>
    /// has not been claimed by an earlier selected peer. The whole tag string is
    /// what gets recorded and compared (case-insensitively).
    /// A peer without any matching tag is rejected (same as Go behaviour: the prefix
    /// must be present).
    /// Go reference: jetstream_cluster.go:7263 checkUniqueTag.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Fewer than <paramref name="replicas"/> peers could be selected.
    /// </exception>
    private static List<string> SelectWithUniqueTag(
        IEnumerable<PeerInfo> preferred,
        IEnumerable<PeerInfo> overloaded,
        int replicas,
        string uniqueTagPrefix)
    {
        var claimedTags = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var selected = new List<string>(replicas);

        foreach (var peer in preferred.Concat(overloaded))
        {
            if (selected.Count >= replicas)
                break;

            // Find the first tag on this peer that starts with the unique prefix.
            // Go reference: jetstream_cluster.go:7264 strings.HasPrefix(t, uniqueTagPrefix)
            string? uniqueTag = peer.Tags.FirstOrDefault(
                tag => tag.StartsWith(uniqueTagPrefix, StringComparison.OrdinalIgnoreCase));

            // Reject when the prefix is absent, or when the tag is already claimed.
            // Add() is only reached for a non-null tag and returns false on a duplicate.
            if (uniqueTag is null || !claimedTags.Add(uniqueTag))
                continue;

            selected.Add(peer.PeerId);
        }

        if (selected.Count < replicas)
            ThrowInsufficientPeers(replicas, selected.Count);

        return selected;
    }

    /// <summary>
    /// Always throws the standard "not enough peers" <see cref="InvalidOperationException"/>.
    /// </summary>
    private static void ThrowInsufficientPeers(int required, int available) =>
        throw new InvalidOperationException(
            $"Not enough peers available to satisfy replica count {required}. " +
            $"Available after policy filtering: {available}.");
}
|
|
|
|
/// <summary>
/// A single cluster node as seen by the placement logic.
/// Go reference: jetstream_cluster.go peerInfo — peer.id, peer.offline, peer.storage,
/// peer HAAssets, peer streams count.
/// </summary>
public sealed class PeerInfo
{
    /// <summary>
    /// Identifier of the peer; this is the value placed into RAFT group membership.
    /// Must be supplied at construction.
    /// </summary>
    public required string PeerId { get; init; }

    /// <summary>
    /// Name of the cluster/partition this peer belongs to. Defaults to the empty string.
    /// </summary>
    public string Cluster { get; set; } = string.Empty;

    /// <summary>
    /// Capability and topology tags advertised by the peer.
    /// The default set compares tags case-insensitively (ordinal).
    /// </summary>
    public HashSet<string> Tags { get; init; } = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// True when the peer may receive new assignments. Defaults to <c>true</c>.
    /// </summary>
    public bool Available { get; set; } = true;

    /// <summary>
    /// Approximate storage still free for new assets.
    /// Defaults to <see cref="long.MaxValue"/>.
    /// </summary>
    public long AvailableStorage { get; set; } = long.MaxValue;

    /// <summary>
    /// Count of assets (streams/consumers) currently assigned to this peer.
    /// Go reference: jetstream_cluster.go:7311 peerStreams / peerHA maps.
    /// </summary>
    public int CurrentAssets { get; set; }

    /// <summary>
    /// Upper bound of HA assets the peer should host before it is deprioritized.
    /// 0 means unlimited (no deprioritization).
    /// Go reference: jetstream_cluster.go:7328 maxHaAssets.
    /// </summary>
    public int MaxAssetsPerPeer { get; set; }
}
|
|
|
|
/// <summary>
/// Constraints applied when choosing peers: cluster affinity, required and excluded
/// tags, and unique-tag enforcement.
/// Go reference: jetstream_cluster.go Placement struct — cluster, tags;
/// server opts JetStreamUniqueTag.
/// </summary>
public sealed class PlacementPolicy
{
    /// <summary>
    /// Cluster affinity constraint; optional.
    /// </summary>
    public string? Cluster { get; set; }

    /// <summary>
    /// Tags a candidate peer must carry — every one of them.
    /// </summary>
    public HashSet<string>? Tags { get; set; }

    /// <summary>
    /// Tags that rule a candidate peer out when any one is present.
    /// </summary>
    public HashSet<string>? ExcludeTags { get; set; }

    /// <summary>
    /// Tag prefix used to enforce AZ/rack uniqueness across replicas.
    /// When set, no two replicas may share the same value for a tag that starts
    /// with this prefix (e.g., "az" matches "az:us-east-1a").
    /// Null or empty string disables the constraint.
    /// Go reference: jetstream_cluster.go:7251 JetStreamUniqueTag / uniqueTagPrefix.
    /// </summary>
    public string? UniqueTag { get; set; }
}
|