175 lines
7.2 KiB
C#
Executable File
175 lines
7.2 KiB
C#
Executable File
using System.Runtime.InteropServices;
|
|
using ZB.MOM.WW.CBDD.Core.Indexing;
|
|
|
|
namespace ZB.MOM.WW.CBDD.Core.Storage;
|
|
|
|
/// <summary>
/// Page for storing HNSW Vector Index nodes.
/// Each page stores a fixed number of nodes based on vector dimensions and M.
/// </summary>
/// <remarks>
/// All members are static helpers that operate directly on a raw page buffer.
/// Every node occupies a fixed-size slot laid out as: document location (6 bytes),
/// max level (1 byte), vector (<c>dimensions * 4</c> bytes), then link storage for
/// 16 levels (level 0 holds <c>2 * maxM</c> links, levels 1..15 hold <c>maxM</c> links,
/// 6 bytes per link).
/// </remarks>
public struct VectorPage
{
    // Page layout:
    // [PageHeader (32)]
    // [Dimensions (4)]
    // [MaxM (4)]
    // [NodeSize (4)]
    // [NodeCount (4)]
    // [Nodes Data (Contiguous)...]

    private const int DimensionsOffset = 32;
    private const int MaxMOffset = 36;
    private const int NodeSizeOffset = 40;
    private const int NodeCountOffset = 44;
    private const int DataOffset = 48;

    // Node slot layout.
    private const int LocationSize = 6;                 // bytes used by the serialized document location
    private const int VectorOffset = LocationSize + 1;  // location + 1-byte max level precede the vector
    private const int LinkSize = 6;                     // bytes per stored link
    private const int MaxLevels = 16;                   // link storage is reserved for levels 0..15

    /// <summary>
    /// Increments the node count stored in the vector page header.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    public static void IncrementNodeCount(Span<byte> page)
    {
        WriteInt32(page, NodeCountOffset, GetNodeCount(page) + 1);
    }

    /// <summary>
    /// Initializes a vector page with header metadata and sizing information.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <param name="pageId">The page identifier.</param>
    /// <param name="dimensions">The vector dimensionality.</param>
    /// <param name="maxM">The HNSW max connections parameter.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="dimensions"/> or <paramref name="maxM"/> is negative.
    /// </exception>
    /// <exception cref="OverflowException">The computed node size does not fit in an <see cref="int"/>.</exception>
    public static void Initialize(Span<byte> page, uint pageId, int dimensions, int maxM)
    {
        if (dimensions < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(dimensions), dimensions, "Dimensions must be non-negative.");
        }

        if (maxM < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(maxM), maxM, "MaxM must be non-negative.");
        }

        // Node size is fixed per index configuration to avoid fragmentation:
        //   Location (6) + MaxLevel (1) + Vector (dim * 4) + Links.
        // HNSW standard: level 0 has 2*M links, levels > 0 have M links, and space is
        // reserved for 16 levels, i.e. (2 + 15) * M links of 6 bytes each.
        // Computed in a checked context (and before touching the page) so that an
        // overflowing configuration fails loudly instead of persisting a wrapped size.
        int nodeSize = checked(
            VectorOffset
            + (dimensions * sizeof(float))
            + (maxM * (2 + (MaxLevels - 1)) * LinkSize));

        var header = new PageHeader
        {
            PageId = pageId,
            PageType = PageType.Vector,
            // NOTE(review): the ushort cast truncates when the data area exceeds 65535 bytes —
            // confirm the maximum supported page size.
            FreeBytes = (ushort)(page.Length - DataOffset),
            NextPageId = 0,
            TransactionId = 0
        };
        header.WriteTo(page);

        WriteInt32(page, DimensionsOffset, dimensions);
        WriteInt32(page, MaxMOffset, maxM);
        WriteInt32(page, NodeSizeOffset, nodeSize);
        WriteInt32(page, NodeCountOffset, 0);
    }

    /// <summary>
    /// Gets the number of nodes currently stored in the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <returns>The node count.</returns>
    public static int GetNodeCount(ReadOnlySpan<byte> page) =>
        ReadInt32(page, NodeCountOffset);

    /// <summary>
    /// Gets the configured node size for the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <returns>The node size in bytes.</returns>
    public static int GetNodeSize(ReadOnlySpan<byte> page) =>
        ReadInt32(page, NodeSizeOffset);

    /// <summary>
    /// Gets the maximum number of nodes that can fit in the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <returns>
    /// The maximum node count, or 0 when the stored node size is not positive
    /// (for example on a zeroed page that was never initialized).
    /// </returns>
    public static int GetMaxNodes(ReadOnlySpan<byte> page) =>
        ComputeCapacity(page.Length, GetNodeSize(page));

    /// <summary>
    /// Writes a node to the page at the specified index.
    /// </summary>
    /// <remarks>
    /// Only the location, max level and vector are written; the link region of the slot is
    /// left untouched. The vector is stored through a reinterpreting cast, i.e. in the
    /// machine's native byte order, unlike the little-endian header fields.
    /// </remarks>
    /// <param name="page">The page buffer.</param>
    /// <param name="nodeIndex">The zero-based node index.</param>
    /// <param name="loc">The document location for the node.</param>
    /// <param name="maxLevel">The maximum graph level for the node (0..15).</param>
    /// <param name="vector">The vector values to store.</param>
    /// <param name="dimensions">The vector dimensionality.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="nodeIndex"/> is outside the page capacity, or
    /// <paramref name="maxLevel"/> is outside the range that has reserved link storage.
    /// </exception>
    public static void WriteNode(Span<byte> page, int nodeIndex, DocumentLocation loc, int maxLevel, ReadOnlySpan<float> vector, int dimensions)
    {
        // The level is persisted in a single byte and only 16 levels have link storage;
        // anything else would be silently truncated or address another node's bytes later.
        if (maxLevel < 0 || maxLevel >= MaxLevels)
        {
            throw new ArgumentOutOfRangeException(nameof(maxLevel), maxLevel, $"Max level must be in the range [0, {MaxLevels}).");
        }

        int offset = GetNodeOffset(page, nodeIndex, out int nodeSize);
        Span<byte> nodeSpan = page.Slice(offset, nodeSize);

        // 1. Document Location
        loc.WriteTo(nodeSpan.Slice(0, LocationSize));

        // 2. Max Level
        nodeSpan[LocationSize] = (byte)maxLevel;

        // 3. Vector
        Span<float> vectorSpan = MemoryMarshal.Cast<byte, float>(nodeSpan.Slice(VectorOffset, dimensions * sizeof(float)));
        vector.CopyTo(vectorSpan);

        // 4. Links
        // Links follow the vector. Level 0: 2*M links, Level 1..15: M links.
        // They are intentionally not modified here; clearing is left to the indexer.
    }

    /// <summary>
    /// Reads node metadata and vector data from the page.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <param name="nodeIndex">The zero-based node index.</param>
    /// <param name="loc">When this method returns, contains the node document location.</param>
    /// <param name="maxLevel">When this method returns, contains the node max level.</param>
    /// <param name="vector">The destination span for vector values; its length determines how many components are read.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="nodeIndex"/> is outside the page capacity.</exception>
    public static void ReadNodeData(ReadOnlySpan<byte> page, int nodeIndex, out DocumentLocation loc, out int maxLevel, Span<float> vector)
    {
        int offset = GetNodeOffset(page, nodeIndex, out int nodeSize);
        ReadOnlySpan<byte> nodeSpan = page.Slice(offset, nodeSize);

        loc = DocumentLocation.ReadFrom(nodeSpan.Slice(0, LocationSize));
        maxLevel = nodeSpan[LocationSize];

        ReadOnlySpan<float> vectorSource = MemoryMarshal.Cast<byte, float>(nodeSpan.Slice(VectorOffset, vector.Length * sizeof(float)));
        vectorSource.CopyTo(vector);
    }

    /// <summary>
    /// Gets the span that stores links for a node at a specific level.
    /// </summary>
    /// <param name="page">The page buffer.</param>
    /// <param name="nodeIndex">The zero-based node index.</param>
    /// <param name="level">The graph level (0..15).</param>
    /// <param name="dimensions">The vector dimensionality.</param>
    /// <param name="maxM">The HNSW max connections parameter.</param>
    /// <returns>The span representing the link storage region.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="nodeIndex"/> is outside the page capacity, <paramref name="level"/> has no
    /// reserved link storage, or the requested region does not fit inside the node slot.
    /// </exception>
    public static Span<byte> GetLinksSpan(Span<byte> page, int nodeIndex, int level, int dimensions, int maxM)
    {
        if (level < 0 || level >= MaxLevels)
        {
            throw new ArgumentOutOfRangeException(nameof(level), level, $"Level must be in the range [0, {MaxLevels}).");
        }

        int nodeOffset = GetNodeOffset(page, nodeIndex, out int nodeSize);

        // Slice relative to the node's own slot so that inconsistent arguments can never
        // hand out bytes that belong to a neighbouring node.
        Span<byte> nodeSpan = page.Slice(nodeOffset, nodeSize);

        // Links start after Location(6) + MaxLevel(1) + Vector(dim*4).
        int linkBaseOffset = VectorOffset + (dimensions * sizeof(float));

        // Level 0 holds 2*M links; every higher level holds M links and is stored after it.
        int levelOffset = level == 0
            ? 0
            : (2 * maxM * LinkSize) + ((level - 1) * maxM * LinkSize);
        int linkCount = level == 0 ? 2 * maxM : maxM;

        return nodeSpan.Slice(linkBaseOffset + levelOffset, linkCount * LinkSize);
    }

    /// <summary>
    /// Computes how many node slots of <paramref name="nodeSize"/> bytes fit after the page metadata.
    /// </summary>
    private static int ComputeCapacity(int pageLength, int nodeSize) =>
        nodeSize > 0 ? Math.Max(0, (pageLength - DataOffset) / nodeSize) : 0;

    /// <summary>
    /// Validates <paramref name="nodeIndex"/> against the page capacity and returns the byte offset of its slot.
    /// </summary>
    private static int GetNodeOffset(ReadOnlySpan<byte> page, int nodeIndex, out int nodeSize)
    {
        nodeSize = GetNodeSize(page);
        int capacity = ComputeCapacity(page.Length, nodeSize);

        if (nodeIndex < 0 || nodeIndex >= capacity)
        {
            throw new ArgumentOutOfRangeException(nameof(nodeIndex), nodeIndex, $"Node index must be in the range [0, {capacity}).");
        }

        // Cannot overflow: nodeIndex < capacity implies the slot ends within the page.
        return DataOffset + (nodeIndex * nodeSize);
    }

    /// <summary>
    /// Reads a little-endian 32-bit integer at <paramref name="offset"/>.
    /// </summary>
    private static int ReadInt32(ReadOnlySpan<byte> page, int offset) =>
        System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(page.Slice(offset));

    /// <summary>
    /// Writes a little-endian 32-bit integer at <paramref name="offset"/>.
    /// </summary>
    private static void WriteInt32(Span<byte> page, int offset, int value) =>
        System.Buffers.Binary.BinaryPrimitives.WriteInt32LittleEndian(page.Slice(offset), value);
}
|