Initialize CBDD solution and add a .NET-focused gitignore for generated artifacts.

This commit is contained in:
Joseph Doherty
2026-02-20 12:54:07 -05:00
commit b8ed5ec500
214 changed files with 101452 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
using System.Runtime.InteropServices;
using ZB.MOM.WW.CBDD.Core.Indexing;
namespace ZB.MOM.WW.CBDD.Core.Storage;
/// <summary>
/// Static helpers for reading and writing a page that stores HNSW vector index nodes.
/// Every node on a page has the same fixed size, derived from the vector dimensions and
/// the HNSW <c>M</c> parameter recorded when the page is initialized.
/// </summary>
public struct VectorPage
{
    // Page layout (byte offsets):
    //   [0..32)   PageHeader
    //   [32..36)  Dimensions (int32, little-endian)
    //   [36..40)  MaxM       (int32, little-endian)
    //   [40..44)  NodeSize   (int32, little-endian)
    //   [44..48)  NodeCount  (int32, little-endian)
    //   [48.. )   Node data, stored contiguously
    private const int DimensionsOffset = 32;
    private const int MaxMOffset = 36;
    private const int NodeSizeOffset = 40;
    private const int NodeCountOffset = 44;
    private const int DataOffset = 48;

    // Node layout (byte offsets relative to the start of the node):
    //   [0..6)  document location
    //   [6]     max level (single byte)
    //   [7.. )  vector (dimensions * 4 bytes), followed by the link slots
    private const int LocationSize = 6;
    private const int MaxLevelOffset = LocationSize;
    private const int VectorOffset = MaxLevelOffset + 1;

    // Each link slot occupies 6 bytes.
    private const int LinkSize = 6;

    // Space is reserved for 16 levels: level 0 holds 2*M links, levels 1..15 hold M links each.
    private const int MaxLevels = 16;
    private const int LinkSlotFactor = 2 + (MaxLevels - 1);

    /// <summary>
    /// Increments the node count stored in the vector page header.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    public static void IncrementNodeCount(Span<byte> page)
    {
        int count = GetNodeCount(page);
        WriteInt32At(page, NodeCountOffset, count + 1);
    }

    /// <summary>
    /// Initializes a vector page with header metadata and sizing information.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <param name="pageId">The page identifier.</param>
    /// <param name="dimensions">The vector dimensionality.</param>
    /// <param name="maxM">The HNSW max connections parameter.</param>
    public static void Initialize(Span<byte> page, uint pageId, int dimensions, int maxM)
    {
        var header = new PageHeader
        {
            PageId = pageId,
            PageType = PageType.Vector,
            FreeBytes = (ushort)(page.Length - DataOffset),
            NextPageId = 0,
            TransactionId = 0
        };
        header.WriteTo(page);

        WriteInt32At(page, DimensionsOffset, dimensions);
        WriteInt32At(page, MaxMOffset, maxM);

        // The node size is fixed per index configuration to avoid fragmentation:
        // location + max level + vector + link slots for all reserved levels.
        int nodeSize = VectorOffset + (dimensions * sizeof(float)) + (maxM * LinkSlotFactor * LinkSize);
        WriteInt32At(page, NodeSizeOffset, nodeSize);
        WriteInt32At(page, NodeCountOffset, 0);
    }

    /// <summary>
    /// Gets the number of nodes currently stored in the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <returns>The node count.</returns>
    public static int GetNodeCount(ReadOnlySpan<byte> page) =>
        ReadInt32At(page, NodeCountOffset);

    /// <summary>
    /// Gets the configured node size for the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <returns>The node size in bytes.</returns>
    public static int GetNodeSize(ReadOnlySpan<byte> page) =>
        ReadInt32At(page, NodeSizeOffset);

    /// <summary>
    /// Gets the maximum number of nodes that can fit in the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <returns>
    /// The maximum node count, or 0 when the recorded node size is not positive
    /// (for example when the page has not been initialized).
    /// </returns>
    public static int GetMaxNodes(ReadOnlySpan<byte> page)
    {
        int nodeSize = GetNodeSize(page);

        // Guard against dividing by a zero (or nonsensical negative) node size.
        return nodeSize > 0 ? (page.Length - DataOffset) / nodeSize : 0;
    }

    /// <summary>
    /// Writes a node's location, max level and vector to the page at the specified index.
    /// The node's link slots are not modified by this method.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <param name="nodeIndex">The zero-based node index.</param>
    /// <param name="loc">The document location for the node.</param>
    /// <param name="maxLevel">The maximum graph level for the node (0 to 15).</param>
    /// <param name="vector">
    /// The vector values to store. If it holds fewer than <paramref name="dimensions"/> values,
    /// only those values are written and the remaining vector bytes are left unchanged.
    /// </param>
    /// <param name="dimensions">The vector dimensionality.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="maxLevel"/> is outside the range of levels reserved in a node.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="vector"/> holds more than <paramref name="dimensions"/> values.
    /// </exception>
    public static void WriteNode(Span<byte> page, int nodeIndex, DocumentLocation loc, int maxLevel, ReadOnlySpan<float> vector, int dimensions)
    {
        // Reject levels that would be truncated by the byte cast or that have no reserved
        // link slots, before anything is written to the page.
        if (maxLevel < 0 || maxLevel >= MaxLevels)
        {
            throw new ArgumentOutOfRangeException(nameof(maxLevel), maxLevel, "Max level must be between 0 and 15.");
        }

        int nodeSize = GetNodeSize(page);
        int offset = DataOffset + (nodeIndex * nodeSize);
        var nodeSpan = page.Slice(offset, nodeSize);

        // 1. Document location
        loc.WriteTo(nodeSpan.Slice(0, LocationSize));

        // 2. Max level
        nodeSpan[MaxLevelOffset] = (byte)maxLevel;

        // 3. Vector. The vector starts at byte 7 of the node, which is not float-aligned,
        //    so copy raw bytes instead of reinterpreting the destination as Span<float>.
        var vectorBytes = nodeSpan.Slice(VectorOffset, dimensions * sizeof(float));
        MemoryMarshal.AsBytes(vector).CopyTo(vectorBytes);

        // 4. Links follow the vector (level 0: 2*M slots, levels 1..15: M slots each).
        //    They are intentionally left untouched here.
    }

    /// <summary>
    /// Reads node metadata and vector data from the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <param name="nodeIndex">The zero-based node index.</param>
    /// <param name="loc">When this method returns, contains the node document location.</param>
    /// <param name="maxLevel">When this method returns, contains the node max level.</param>
    /// <param name="vector">The destination span for vector values; its length determines how many values are read.</param>
    public static void ReadNodeData(ReadOnlySpan<byte> page, int nodeIndex, out DocumentLocation loc, out int maxLevel, Span<float> vector)
    {
        int nodeSize = GetNodeSize(page);
        int offset = DataOffset + (nodeIndex * nodeSize);
        var nodeSpan = page.Slice(offset, nodeSize);

        loc = DocumentLocation.ReadFrom(nodeSpan.Slice(0, LocationSize));
        maxLevel = nodeSpan[MaxLevelOffset];

        // Copy raw bytes into the destination; the stored vector is not float-aligned,
        // so the source is never reinterpreted as ReadOnlySpan<float>.
        var vectorBytes = nodeSpan.Slice(VectorOffset, vector.Length * sizeof(float));
        vectorBytes.CopyTo(MemoryMarshal.AsBytes(vector));
    }

    /// <summary>
    /// Gets the span that stores links for a node at a specific level.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <param name="nodeIndex">The zero-based node index.</param>
    /// <param name="level">The graph level (0 to 15).</param>
    /// <param name="dimensions">The vector dimensionality.</param>
    /// <param name="maxM">The HNSW max connections parameter.</param>
    /// <returns>The span representing the link storage region.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="level"/> is outside the range of levels reserved in a node.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// The requested link region does not fit inside the node size recorded in the page,
    /// meaning <paramref name="dimensions"/> or <paramref name="maxM"/> do not match the page.
    /// </exception>
    public static Span<byte> GetLinksSpan(Span<byte> page, int nodeIndex, int level, int dimensions, int maxM)
    {
        if (level < 0 || level >= MaxLevels)
        {
            throw new ArgumentOutOfRangeException(nameof(level), level, "Level must be between 0 and 15.");
        }

        int nodeSize = GetNodeSize(page);
        int nodeOffset = DataOffset + (nodeIndex * nodeSize);

        // Links start right after location, max level and vector.
        int linkBaseInNode = VectorOffset + (dimensions * sizeof(float));

        // Level 0 holds 2*M links; every higher level holds M links and is stored after it.
        int levelOffset = level == 0
            ? 0
            : (2 * maxM * LinkSize) + ((level - 1) * maxM * LinkSize);
        int count = level == 0 ? 2 * maxM : maxM;
        int length = count * LinkSize;

        // Never hand out a span that reaches into the following node.
        if (linkBaseInNode + levelOffset + length > nodeSize)
        {
            throw new ArgumentException("The requested link region exceeds the node size recorded in the page.");
        }

        return page.Slice(nodeOffset + linkBaseInNode + levelOffset, length);
    }

    // Reads a little-endian int32 header field at the given page offset.
    private static int ReadInt32At(ReadOnlySpan<byte> page, int offset) =>
        System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(page.Slice(offset));

    // Writes a little-endian int32 header field at the given page offset.
    private static void WriteInt32At(Span<byte> page, int offset, int value) =>
        System.Buffers.Binary.BinaryPrimitives.WriteInt32LittleEndian(page.Slice(offset), value);
}