namespace NATS.Server.JetStream.Cluster;

/// <summary>
/// Topology-aware peer selection for stream/consumer replica placement.
/// Go reference: jetstream_cluster.go:7212 selectPeerGroup.
/// </summary>
public static class PlacementEngine
{
    /// <summary>
    /// Default cost in bytes subtracted from AvailableStorage per assigned asset
    /// when computing weighted placement scores.
    /// Go reference: jetstream_cluster.go:7469 sort by avail then ns (number of streams).
    /// </summary>
    public const long DefaultAssetCostWeight = 1_073_741_824L; // 1 GiB

    /// <summary>
    /// Selects peers for a new replica group based on available nodes, tags, cluster
    /// affinity, HA asset limits, and weighted scoring.
    /// </summary>
    /// <remarks>
    /// Selection pipeline:
    /// <list type="number">
    /// <item><description>Filter out unavailable peers.</description></item>
    /// <item><description>Apply cluster affinity filter (case-insensitive match).</description></item>
    /// <item><description>Apply required tag filter (a peer must carry ALL required tags).</description></item>
    /// <item><description>Apply exclude-tag filter (a peer carrying ANY excluded tag is dropped).</description></item>
    /// <item><description>
    /// Separate overloaded peers (MaxAssetsPerPeer is positive and CurrentAssets has
    /// reached it) from preferred candidates.
    /// </description></item>
    /// <item><description>
    /// Within each candidate group, sort by weighted score descending:
    /// score = AvailableStorage - (CurrentAssets * assetCostWeight).
    /// </description></item>
    /// <item><description>
    /// Apply the UniqueTag constraint: greedily select from the scored list, skipping any
    /// peer whose tag for the UniqueTag prefix is already represented. Overloaded peers
    /// are tried only after preferred candidates are exhausted.
    /// </description></item>
    /// <item><description>Throw if fewer than <paramref name="replicas"/> peers can be selected.</description></item>
    /// </list>
    /// A non-positive <paramref name="replicas"/> yields a group with an empty peer list.
    /// </remarks>
    /// <param name="groupName">RAFT group name being placed.</param>
    /// <param name="replicas">Required number of replicas/peers.</param>
    /// <param name="availablePeers">Available cluster peers considered for placement.</param>
    /// <param name="policy">Optional placement policy with cluster/tag constraints.</param>
    /// <param name="assetCostWeight">Per-asset storage penalty used in scoring.</param>
    /// <returns>A <see cref="RaftGroup"/> named <paramref name="groupName"/> holding the selected peer IDs.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="availablePeers"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// Fewer than <paramref name="replicas"/> peers satisfy the policy.
    /// </exception>
    public static RaftGroup SelectPeerGroup(
        string groupName,
        int replicas,
        IReadOnlyList<PeerInfo> availablePeers,
        PlacementPolicy? policy = null,
        long assetCostWeight = DefaultAssetCostWeight)
    {
        // Fail with the caller's parameter name instead of LINQ's internal "source".
        ArgumentNullException.ThrowIfNull(availablePeers);

        // 1. Filter out unavailable peers.
        IEnumerable<PeerInfo> candidates = availablePeers.Where(p => p.Available);

        // 2. If policy has Cluster, filter to matching cluster.
        if (policy?.Cluster is { Length: > 0 } cluster)
        {
            candidates = candidates.Where(
                p => string.Equals(p.Cluster, cluster, StringComparison.OrdinalIgnoreCase));
        }

        // 3. If policy has Tags, filter to peers that have ALL required tags.
        if (policy?.Tags is { Count: > 0 } requiredTags)
        {
            candidates = candidates.Where(p => requiredTags.All(tag => p.Tags.Contains(tag)));
        }

        // 4. If policy has ExcludeTags, filter out peers with any of those tags.
        if (policy?.ExcludeTags is { Count: > 0 } excludeTags)
        {
            candidates = candidates.Where(p => !excludeTags.Any(tag => p.Tags.Contains(tag)));
        }

        // Materialize once so the deferred filter chain is not re-evaluated.
        var filtered = candidates.ToList();

        // 5. Separate preferred candidates from overloaded (HA-limited) ones.
        //    Overloaded peers are deprioritized but NOT excluded entirely — they serve
        //    as a fallback when no preferred options remain.
        //    Go reference: jetstream_cluster.go:7428 maxHaAssets check (deprioritize).
        var (preferred, overloaded) = SplitByHaLimit(filtered);

        // 6. Sort each group by weighted score descending.
        //    Go reference: jetstream_cluster.go:7469 sort by avail then ns.
        var sortedPreferred = SortByScore(preferred, assetCostWeight);
        var sortedOverloaded = SortByScore(overloaded, assetCostWeight);

        // 7. Apply UniqueTag constraint (if set) via greedy selection over the
        //    combined ordered list: preferred first, overloaded as fallback.
        //    Go reference: jetstream_cluster.go:7251 uniqueTagPrefix / checkUniqueTag.
        string? uniqueTagPrefix = policy?.UniqueTag is { Length: > 0 } ut ? ut : null;

        List<string> selected;
        if (uniqueTagPrefix is not null)
        {
            selected = SelectWithUniqueTag(
                sortedPreferred, sortedOverloaded, replicas, uniqueTagPrefix);
        }
        else
        {
            // No unique-tag constraint — just take the top N from the combined list.
            var all = sortedPreferred.Concat(sortedOverloaded).ToList();
            if (all.Count < replicas)
            {
                ThrowInsufficientPeers(replicas, all.Count);
            }

            selected = all.Take(replicas).Select(p => p.PeerId).ToList();
        }

        // 8. Return RaftGroup with selected peer IDs.
        return new RaftGroup
        {
            Name = groupName,
            Peers = selected,
        };
    }

    // ---------------------------------------------------------------
    // Private helpers
    // ---------------------------------------------------------------

    /// <summary>
    /// Splits candidates into (preferred, overloaded), where a peer is overloaded when
    /// MaxAssetsPerPeer is positive and CurrentAssets has reached or exceeded it.
    /// Relative input order is preserved inside each list.
    /// </summary>
    private static (List<PeerInfo> preferred, List<PeerInfo> overloaded) SplitByHaLimit(
        List<PeerInfo> peers)
    {
        var preferred = new List<PeerInfo>(peers.Count);
        var overloaded = new List<PeerInfo>();

        foreach (var p in peers)
        {
            bool isOverloaded = p.MaxAssetsPerPeer > 0 && p.CurrentAssets >= p.MaxAssetsPerPeer;
            if (isOverloaded)
            {
                overloaded.Add(p);
            }
            else
            {
                preferred.Add(p);
            }
        }

        return (preferred, overloaded);
    }

    /// <summary>
    /// Sorts peers by weighted score descending. The sort is stable, so peers with
    /// equal scores keep their input order.
    /// </summary>
    private static List<PeerInfo> SortByScore(List<PeerInfo> peers, long assetCostWeight) =>
        [.. peers.OrderByDescending(p => WeightedScore(p, assetCostWeight))];

    /// <summary>
    /// Computes score = AvailableStorage - (CurrentAssets * assetCostWeight).
    /// </summary>
    /// <remarks>
    /// Evaluated in 128-bit arithmetic: the 64-bit form wraps silently in an unchecked
    /// context (large weights, negative storage), which could rank a heavily loaded
    /// peer above an idle one. For inputs that never overflowed the ordering is unchanged.
    /// </remarks>
    private static Int128 WeightedScore(PeerInfo peer, long assetCostWeight) =>
        (Int128)peer.AvailableStorage - ((Int128)peer.CurrentAssets * assetCostWeight);

    /// <summary>
    /// Greedy unique-tag selection.
    /// Iterates over preferred peers first, then overloaded peers as fallback.
    /// For each candidate, the peer is accepted only if no previously selected peer
    /// shares the same tag starting with <paramref name="uniqueTagPrefix"/>
    /// (compared case-insensitively).
    /// A peer without any matching tag is rejected (same as Go behaviour: the prefix
    /// must be present).
    /// Go reference: jetstream_cluster.go:7263 checkUniqueTag.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Fewer than <paramref name="replicas"/> peers could be selected.
    /// </exception>
    private static List<string> SelectWithUniqueTag(
        IEnumerable<PeerInfo> preferred,
        IEnumerable<PeerInfo> overloaded,
        int replicas,
        string uniqueTagPrefix)
    {
        var seenTagValues = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Clamp the capacity hint: a negative replica count must behave like the
        // non-unique path (empty selection) rather than throw from the List constructor.
        var selected = new List<string>(Math.Max(replicas, 0));

        TrySelect(preferred);
        if (selected.Count < replicas)
        {
            TrySelect(overloaded);
        }

        if (selected.Count < replicas)
        {
            ThrowInsufficientPeers(replicas, selected.Count);
        }

        return selected;

        void TrySelect(IEnumerable<PeerInfo> source)
        {
            foreach (var peer in source)
            {
                if (selected.Count >= replicas)
                {
                    break;
                }

                // Find the first tag on this peer that starts with the unique prefix.
                // Go reference: jetstream_cluster.go:7264 strings.HasPrefix(t, uniqueTagPrefix)
                string? tagValue = null;
                foreach (var tag in peer.Tags)
                {
                    if (tag.StartsWith(uniqueTagPrefix, StringComparison.OrdinalIgnoreCase))
                    {
                        tagValue = tag;
                        break;
                    }
                }

                // Peer has no matching tag → reject (same as Go: "unique prefix not present").
                if (tagValue is null)
                {
                    continue;
                }

                // Tag value already claimed by a previously selected peer → reject.
                if (!seenTagValues.Add(tagValue))
                {
                    continue;
                }

                selected.Add(peer.PeerId);
            }
        }
    }

    /// <summary>
    /// Throws the standard "not enough peers" failure.
    /// </summary>
    /// <param name="required">Requested replica count.</param>
    /// <param name="available">Number of peers that could actually be selected.</param>
    private static void ThrowInsufficientPeers(int required, int available) =>
        throw new InvalidOperationException(
            $"Not enough peers available to satisfy replica count {required}. " +
            $"Available after policy filtering: {available}.");
}

/// <summary>
/// Describes a peer node available for placement consideration.
/// Go reference: jetstream_cluster.go peerInfo — peer.id, peer.offline, peer.storage,
/// peer HAAssets, peer streams count.
/// </summary>
public sealed class PeerInfo
{
    /// <summary>Unique peer identifier used in RAFT group membership.</summary>
    public required string PeerId { get; init; }

    /// <summary>Cluster name/partition where this peer resides.</summary>
    public string Cluster { get; set; } = string.Empty;

    /// <summary>
    /// Capability and topology tags advertised by this peer.
    /// The default set compares tags case-insensitively.
    /// </summary>
    public HashSet<string> Tags { get; init; } = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>Whether this peer is currently eligible for new assignments. Defaults to true.</summary>
    public bool Available { get; set; } = true;

    /// <summary>
    /// Approximate remaining storage available for new assets.
    /// Defaults to <see cref="long.MaxValue"/>.
    /// </summary>
    public long AvailableStorage { get; set; } = long.MaxValue;

    /// <summary>
    /// Number of assets (streams/consumers) currently assigned to this peer.
    /// Go reference: jetstream_cluster.go:7311 peerStreams / peerHA maps.
    /// </summary>
    public int CurrentAssets { get; set; }

    /// <summary>
    /// Maximum HA assets this peer should host before being deprioritized.
    /// 0 means unlimited (no deprioritization).
    /// Go reference: jetstream_cluster.go:7328 maxHaAssets.
    /// </summary>
    public int MaxAssetsPerPeer { get; set; }
}

/// <summary>
/// Placement policy specifying cluster affinity, tag constraints, and unique-tag enforcement.
/// Go reference: jetstream_cluster.go Placement struct — cluster, tags;
/// server opts JetStreamUniqueTag.
/// </summary>
public sealed class PlacementPolicy
{
    /// <summary>Optional cluster affinity constraint. Null or empty disables it.</summary>
    public string? Cluster { get; set; }

    /// <summary>Required tags that must all be present on a candidate peer.</summary>
    public HashSet<string>? Tags { get; set; }

    /// <summary>Tags that disqualify a candidate peer when present.</summary>
    public HashSet<string>? ExcludeTags { get; set; }

    /// <summary>
    /// Tag prefix used to enforce AZ/rack uniqueness across replicas.
    /// When set, no two replicas may share the same value for a tag that starts
    /// with this prefix (e.g., "az" matches "az:us-east-1a").
    /// Null or empty string disables the constraint.
    /// Go reference: jetstream_cluster.go:7251 JetStreamUniqueTag / uniqueTagPrefix.
    /// </summary>
    public string? UniqueTag { get; set; }
}