From 52445078a110e12d0b02d7424b6f4cdf673c6105 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 20 Feb 2026 13:28:29 -0500 Subject: [PATCH] Add enterprise docs structure and include pending core maintenance updates. --- .gitignore | 1 + BENCHMARKS.md | 5 - README.md | 430 +++----- docs/README.md | 25 + docs/access.md | 32 + ...01-storage-engine-and-source-generation.md | 32 + docs/architecture.md | 46 + docs/deployment.md | 60 ++ docs/features/README.md | 10 + docs/features/change-data-capture.md | 61 ++ docs/features/geospatial-search.md | 60 ++ docs/features/query-and-indexing.md | 62 ++ docs/features/source-generated-mapping.md | 61 ++ docs/features/storage-transactions.md | 63 ++ docs/features/vector-search.md | 60 ++ docs/runbook.md | 56 + docs/security.md | 34 + docs/troubleshooting.md | 62 ++ .../Collections/DocumentCollection.cs | 100 +- src/CBDD.Core/Indexing/BTreeIndex.cs | 12 + .../Indexing/CollectionIndexManager.cs | 82 +- src/CBDD.Core/Storage/PageFile.cs | 33 + .../Storage/StorageEngine.Maintenance.cs | 973 +++++++++++++++++- 23 files changed, 1956 insertions(+), 404 deletions(-) create mode 100644 docs/README.md create mode 100644 docs/access.md create mode 100644 docs/adr/0001-storage-engine-and-source-generation.md create mode 100644 docs/architecture.md create mode 100644 docs/deployment.md create mode 100644 docs/features/README.md create mode 100644 docs/features/change-data-capture.md create mode 100644 docs/features/geospatial-search.md create mode 100644 docs/features/query-and-indexing.md create mode 100644 docs/features/source-generated-mapping.md create mode 100644 docs/features/storage-transactions.md create mode 100644 docs/features/vector-search.md create mode 100644 docs/runbook.md create mode 100644 docs/security.md create mode 100644 docs/troubleshooting.md diff --git a/.gitignore b/.gitignore index 16b8b90..7b8a01a 100755 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ yarn-error.log* generated/ generated-data/ data/generated/ +.tmp*/ output/ out/ reports/ diff --git a/BENCHMARKS.md b/BENCHMARKS.md index bdb7551..af4d7a5 100755 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -173,8 +173,3 @@ dotnet run -c Release --project tests/CBDD.Tests.Benchmark | 'DocumentDb Single Insert' | 355.8 Ξs | 19.42 Ξs | 56.65 Ξs | 0.12 | 128.89 KB | ``` ---- - -## License - -CBDD is licensed under the MIT License. See [LICENSE](LICENSE) for details. diff --git a/README.md b/README.md index 3cebc4d..e54f43e 100755 --- a/README.md +++ b/README.md @@ -1,314 +1,132 @@ -# ⚡ CBDD -### High-Performance BSON Database Engine for .NET 10 - -![Build Status](https://img.shields.io/badge/build-passing-brightgreen) -![License](https://img.shields.io/badge/license-MIT-blue) -![Platform](https://img.shields.io/badge/platform-.NET%2010-purple) -![Status](https://img.shields.io/badge/status-active%20development-orange) - -**CBDD** is an embedded, ACID-compliant, document-oriented database built from scratch for **maximum performance** and **zero allocation**. It leverages modern .NET features like `Span`, `Memory`, and Source Generators to eliminate runtime overhead. - -> **Note**: Currently targets **.NET 10** to maximize performance with `Span` and modern hardware intrinsics. Future support for `.netstandard2.1` is being evaluated. - ---- - -## 🚀 Why CBDD? - -Most embedded databases for .NET are either wrappers around C libraries (SQLite, RocksDB) or legacy C# codebases burdened by heavy GC pressure. 
- -**CBDD is different:** -- **Zero Allocation**: I/O and interaction paths use `Span` and `stackalloc`. No heap allocations for reads/writes. -- **Type-Safe**: No reflection. All serialization code is generated at compile-time. -- **Developer Experience**: Full LINQ provider (`IQueryable`) that feels like Entity Framework but runs on bare metal. -- **Reliable**: Full ACID transactions with Write-Ahead Logging (WAL) and Snapshot Isolation. - ---- - -## âœĻ Key Features - -### 🚄 Zero-Allocation Architecture -- **Span-based I/O**: The entire pipeline, from disk to user objects, utilizes `Span` to avoid copying memory. -- **Memory-Mapped Files**: OS-level paging and caching for blazing fast access. - -### 🧠 Powerful Query Engine (LINQ) -Write queries naturally using LINQ. The engine automatically translates them to optimized B-Tree lookups. - -```csharp -// Automatic Index Usage -var users = collection.AsQueryable() - .Where(x => x.Age > 25 && x.Name.StartsWith("A")) - .OrderBy(x => x.Age) - .Take(10) - .AsEnumerable(); // Executed efficiently on the engine -``` - -- **Optimized**: Uses B-Tree indexes for `=`, `>`, `<`, `Between`, and `StartsWith`. -- **Hybrid Execution**: Combines storage-level optimization with in-memory LINQ to Objects. -- **Advanced Features**: Full support for `GroupBy`, `Join`, `Select` (including anonymous types), and Aggregations (`Count`, `Sum`, `Min`, `Max`, `Average`). - -### 🔍 Advanced Indexing -- **B-Tree Indexes**: Logarithmic time complexity for lookups. -- **Composite Indexes**: Support for multi-column keys. -- **Vector Search (HNSW)**: Fast similarity search for AI embeddings using Hierarchical Navigable Small World algorithm. - -### ðŸĪ– AI-Ready Vector Search -CBDD natively supports vector embeddings and fast similarity search. - -```csharp -// 1. Configure vector index on float[] property -modelBuilder.Entity() - .HasVectorIndex(x => x.Embedding, dimensions: 1536, metric: VectorMetric.Cosine); - -// 2. Perform fast similarity search -var results = db.Items.AsQueryable() - .VectorSearch(x => x.Embedding, queryVector, k: 5) - .ToList(); -``` - -### 🌍 High-Performance Geospatial Indexing -CBDD features a built-in R-Tree implementation for lightning-fast proximity and bounding box searches. - -- **Zero-Allocation**: Uses coordinate tuples `(double, double)` and `Span`-based BSON arrays. -- **LINQ Integrated**: Search naturally using `.Near()` and `.Within()`. - -```csharp -// 1. Configure spatial index (uses R-Tree internally) -modelBuilder.Entity() - .HasSpatialIndex(x => x.Location); - -// 2. Proximity Search (Find stores within 5km) -var stores = db.Stores.AsQueryable() - .Where(s => s.Location.Near((45.4642, 9.1899), 5.0)) - .ToList(); - -// 3. Bounding Box Search -var area = db.Stores.AsQueryable() - .Where(s => s.Location.Within((45.0, 9.0), (46.0, 10.0))) - .ToList(); -``` - -### 🆔 Custom ID Converters (ValueObjects) -Native support for custom primary key types using `ValueConverter`. Configure them easily via the Fluent API. - -```csharp -// 1. Define your ValueObject and Converter -public record OrderId(string Value); -public class OrderIdConverter : ValueConverter { ... } - -// 2. Configure in OnModelCreating -modelBuilder.Entity() - .Property(x => x.Id) - .HasConversion(); - -// 3. Use it naturally -var order = collection.FindById(new OrderId("ORD-123")); -``` - -### ðŸ“Ą Change Data Capture (CDC) -Real-time event streaming for database changes with transactional consistency. 
- -- **Zero-Allocation**: Events are only captured when watchers exist; no overhead when disabled. -- **Transactional**: Events fire only after successful commit, never on rollback. -- **Scalable**: Uses Channel-per-subscriber architecture to support thousands of concurrent listeners. - -```csharp -// Watch for changes in a collection -using var subscription = db.People.Watch(capturePayload: true) - .Subscribe(e => - { - Console.WriteLine($"{e.Type}: {e.DocumentId}"); - if (e.Entity != null) - Console.WriteLine($" Name: {e.Entity.Name}"); - }); - -// Perform operations - events fire after commit -db.People.Insert(new Person { Id = 1, Name = "Alice" }); -``` - -### ðŸ›Ąïļ Transactions & ACID -- **Atomic**: Multi-document transactions. -- **Durable**: WAL ensures data safety even in power loss. -- **Isolated**: Snapshot isolation allowing concurrent readers and writers. -- **Thread-Safe**: Protected with `SemaphoreSlim` to prevent race conditions in concurrent scenarios. -- **Async-First**: Full async/await support with proper `CancellationToken` handling. -- **Implicit Transactions**: Use `SaveChanges()` / `SaveChangesAsync()` for automatic transaction management (like EF Core). - -### 🔌 Intelligent Source Generation -- **Zero Reflection**: Mappers are generated at compile-time for zero overhead. -- **Nested Objects & Collections**: Full support for complex graphs, deep nesting, and ref struct handling. -- **Robust Serialization**: Correctly handles nested objects, collections, and complex type hierarchies. -- **Lowercase Policy**: BSON keys are automatically persisted as `lowercase` for consistency. -- **Custom Overrides**: Use `[BsonProperty]` or `[JsonPropertyName]` for manual field naming. - -#### ✅ Supported Scenarios - -The source generator handles a wide range of modern C# patterns: - -| Feature | Support | Description | -| :--- | :---: | :--- | -| **Property Inheritance** | ✅ | Properties from base classes are automatically included in serialization | -| **Private Setters** | ✅ | Properties with `private set` are correctly deserialized using Expression Trees | -| **Init-Only Setters** | ✅ | Properties with `init` are supported via runtime compilation | -| **Private Constructors** | ✅ | Deserialization works even without parameterless public constructor | -| **Advanced Collections** | ✅ | `IEnumerable`, `ICollection`, `IList`, `HashSet`, and more | -| **Nullable Value Types** | ✅ | `ObjectId?`, `int?`, `DateTime?` are correctly serialized/deserialized | -| **Nullable Collections** | ✅ | `List?`, `string?` with proper null handling | -| **Unlimited Nesting** | ✅ | Deeply nested object graphs with circular reference protection | -| **Self-Referencing** | ✅ | Entities can reference themselves (e.g., `Manager` property in `Employee`) | -| **N-N Relationships** | ✅ | Collections of ObjectIds for efficient document referencing | - -#### ❌ Limitations & Design Choices - -| Scenario | Status | Reason | -| :--- | :---: | :--- | -| **Computed Properties** | ⚠ïļ Excluded | Getter-only properties without backing fields are intentionally skipped (e.g., `FullName => $"{First} {Last}"`) | -| **Constructor Logic** | ⚠ïļ Bypassed | Deserialization uses `RuntimeHelpers.GetUninitializedObject()` to avoid constructor execution | -| **Constructor Validation** | ⚠ïļ Not Executed | Validation logic in constructors won't run during deserialization - use Data Annotations instead | - -> **ðŸ’Ą Best Practice**: For relationships between entities, prefer **referencing** (storing ObjectIds) over 
**embedding** (full nested objects) to avoid data duplication and maintain consistency. See tests in `CircularReferenceTests.cs` for implementation patterns. - -### 🏷ïļ Supported Attributes -CBDD supports standard .NET Data Annotations for mapping and validation: - -| Attribute | Category | Description | -| :--- | :--- | :--- | -| `[Table("name")]` | Mapping | Sets the collection name. Supports `Schema="s"` for `s.name` grouping. | -| `[Column("name")]` | Mapping | Maps property to a specific BSON field name. | -| `[Column(TypeName="...")]`| Mapping | Handles special types (e.g., `geopoint` for coordinate tuples). | -| `[Key]` | Identity | Explicitly marks the primary key (maps to `_id`). | -| `[NotMapped]` | Mapping | Excludes property from BSON serialization. | -| `[Required]` | Validation | Ensures string is not null/empty or nullable type is not null. | -| `[StringLength(max)]` | Validation | Validates string length (supports `MinimumLength`). | -| `[MaxLength(n)]` | Validation | Validates maximum string length. | -| `[MinLength(n)]` | Validation | Validates minimum string length. | -| `[Range(min, max)]` | Validation | Validates numeric values stay within the specified range. | - -> [!IMPORTANT] -> Validation attributes (`[Required]`, `[Range]`, etc.) throw a `System.ComponentModel.DataAnnotations.ValidationException` during serialization if rules are violated. - ---- - -## 📚 Documentation +# CBDD -For in-depth technical details, see the complete specification documents: - -- **[RFC.md](RFC.md)** - Full architectural specification covering storage engine, indexing, transactions, WAL protocol, and query processing -- **[C-BSON.md](C-BSON.md)** - Detailed wire format specification for CBDD's Compressed BSON format, including hex dumps and performance analysis +CBDD is an embedded, document-oriented database engine for .NET 10. It targets internal platform teams that need predictable ACID behavior, low-latency local persistence, and typed access patterns without running an external database server. -## ✅ Fitness Check +## Purpose And Business Context -Run the repository fitness suite locally: +CBDD provides a local data layer for services and tools that need transactional durability, deterministic startup, and high-throughput reads/writes. The primary business outcome is reducing operational overhead for workloads that do not require a networked database cluster. + +## Ownership And Support + +- Owning team: CBDD maintainers (repository owner: `@dohertj2`) +- Primary support path: open a Gitea issue in this repository with labels `incident` or `bug` +- Escalation path: follow `/Users/dohertj2/Desktop/CBDD/docs/runbook.md` and page the release maintainer listed in the active release PR + +## Architecture Overview + +CBDD has four primary layers: + +1. Storage and transaction engine (`/Users/dohertj2/Desktop/CBDD/src/CBDD.Core/Storage`, `/Users/dohertj2/Desktop/CBDD/src/CBDD.Core/Transactions`) +2. BSON serialization (`/Users/dohertj2/Desktop/CBDD/src/CBDD.Bson`) +3. Indexing and query execution (`/Users/dohertj2/Desktop/CBDD/src/CBDD.Core/Indexing`, `/Users/dohertj2/Desktop/CBDD/src/CBDD.Core/Query`) +4. 
Source-generated mapping (`/Users/dohertj2/Desktop/CBDD/src/CBDD.SourceGenerators`) + +Detailed architecture material: +- `/Users/dohertj2/Desktop/CBDD/docs/architecture.md` +- `/Users/dohertj2/Desktop/CBDD/RFC.md` +- `/Users/dohertj2/Desktop/CBDD/C-BSON.md` + +## Prerequisites + +- .NET SDK 10.x +- Bash (for repository scripts) +- Read/write permissions for the local working directory +- Gitea access for pull requests and release publishing + +## Setup And Local Run + +1. Clone the repository. +```bash +git clone https://gitea.dohertylan.com/dohertj2/CBDD.git +cd CBDD +``` +Expected outcome: local repository checkout with `CBDD.slnx` present. + +2. Restore dependencies. +```bash +dotnet restore +``` +Expected outcome: restore completes without package errors. + +3. Build the solution. +```bash +dotnet build CBDD.slnx -c Release +``` +Expected outcome: solution builds without compiler errors. + +4. Run tests. +```bash +dotnet test CBDD.slnx -c Release +``` +Expected outcome: all tests pass. + +5. Run the full repository fitness check. +```bash +bash scripts/fitness-check.sh +``` +Expected outcome: format, build, tests, coverage threshold, and package checks complete. + +## Configuration And Secrets + +- Default local usage requires only a filesystem path for the database file. +- Do not commit secrets in source, test fixtures, benchmark assets, or `.env` files. +- If publishing packages, keep feed credentials in CI secrets or local keychain-backed credential storage. +- Store environment-specific values outside the repository and inject them at runtime. + +## Build, Test, And Quality Gates + +Required pre-merge commands: + +```bash +dotnet format --verify-no-changes +dotnet build -t:Rebuild +dotnet test +bash scripts/coverage-check.sh +``` + +Optional full gate: ```bash bash scripts/fitness-check.sh ``` -It verifies formatting, build/test health, and package risk checks. - ---- - -## ðŸ“Ķ Quick Start - -### 1. Installation -``` -dotnet add package ZB.MOM.WW.CBDD -``` - -### 2. Basic Usage - -```csharp -// 1. Define your Entities -public class User -{ - public ObjectId Id { get; set; } - public string Name { get; set; } -} - -// 2. Define your DbContext (Source Generator will produce InitializeCollections) -public partial class MyDbContext : DocumentDbContext -{ - public DocumentCollection Users { get; set; } = null!; - - public MyDbContext(string path) : base(path) - { - InitializeCollections(); - } -} - -// 3. Use with Implicit Transactions (Recommended) -using var db = new MyDbContext("mydb.db"); - -// Operations are tracked automatically -db.Users.Insert(new User { Name = "Alice" }); -db.Users.Insert(new User { Name = "Bob" }); - -// Commit all changes at once -db.SaveChanges(); - -// 4. Query naturally with LINQ -var results = db.Users.AsQueryable() - .Where(u => u.Name.StartsWith("A")) - .AsEnumerable(); - -// 5. Or use explicit transactions for fine-grained control -using (var txn = db.BeginTransaction()) -{ - db.Users.Insert(new User { Name = "Charlie" }); - txn.Commit(); // Explicit commit -} -``` - ---- - -## 🗚ïļ Roadmap & Status - -We are actively building the core. Here is where we stand: - -- ✅ **Core Storage**: Paged I/O, WAL, Transactions with thread-safe concurrent access. -- ✅ **BSON Engine**: Zero-copy Reader/Writer with lowercase policy. -- ✅ **Indexing**: B-Tree implementation. -- ✅ **Vector Search**: HNSW implementation for Similarity Search. -- ✅ **Geospatial Indexing**: Optimized R-Tree with zero-allocation tuple API. 
-- ✅ **Query Engine**: Hybrid execution (Index/Scan + LINQ to Objects). -- ✅ **Advanced LINQ**: GroupBy, Joins, Aggregations, Complex Projections. -- ✅ **Async I/O**: Full `async`/`await` support with proper `CancellationToken` handling. -- ✅ **Source Generators**: Auto-map POCO/DDD classes with robust nested objects, collections, and ref struct support. - -## ðŸ”Ū Future Vision - -### 1. Advanced Querying & Specialized Indices -- **Graph Traversals**: - - Specialized index for "links" (Document IDs) for $O(1)$ navigation without full scans. - -### 2. CDC & Event Integration -- **BSON Change Stream**: "Log Miner" that decodes WAL entries and emits structured events. -- **Internal Dispatcher**: Keeps specialized indices updated automatically via CDC. - -### 3. Performance & Optimization -- **Projection Engine**: Read only specific fields from disk (via BSON offsets) without full document deserialization. -- **Portability**: Evaluate `.netstandard2.1` support for broader compatibility (Unity, MAUI, etc.). - ---- - -## ðŸĪ Contributing - -We welcome contributions! This is a great project to learn about database internals, B-Trees, and high-performance .NET. - -### How to Build -1. **Clone**: `git clone https://github.com/mrdevrobot/CBDD.git` -2. **Build**: `dotnet build` -3. **Test**: `dotnet test` (We have comprehensive tests for Storage, Indexing, and LINQ). -4. **Coverage Gate**: `bash scripts/coverage-check.sh` - -### Areas to Contribute -- **Missing LINQ Operators**: Help us implement additional `IQueryable` functions. -- **Benchmarks**: Help us prove `CBDD` is faster than the competition. -- **Documentation**: Examples, Guides, and Wiki. - ---- - -## 📝 License - -Licensed under the MIT License. Use it freely in personal and commercial projects. - - +## Deployment And Rollback + +CBDD is released as an internal package. + +- Deployment workflow: `/Users/dohertj2/Desktop/CBDD/docs/deployment.md` +- Rollback workflow: `/Users/dohertj2/Desktop/CBDD/docs/deployment.md#rollback-procedure` + +## Operations And Incident Response + +Operational procedures, diagnostics, and escalation are documented in: + +- `/Users/dohertj2/Desktop/CBDD/docs/runbook.md` +- `/Users/dohertj2/Desktop/CBDD/docs/troubleshooting.md` + +## Security And Compliance Posture + +- CBDD relies on host and process-level access controls. +- Sensitive payload classification and handling requirements are defined in `/Users/dohertj2/Desktop/CBDD/docs/security.md`. +- Role and approval requirements are defined in `/Users/dohertj2/Desktop/CBDD/docs/access.md`. + +## Troubleshooting + +Common issues and remediation: + +- Build/test environment failures: `/Users/dohertj2/Desktop/CBDD/docs/troubleshooting.md#build-and-test-failures` +- Data-file recovery procedures: `/Users/dohertj2/Desktop/CBDD/docs/troubleshooting.md#data-file-and-recovery-issues` +- Query/index behavior verification: `/Users/dohertj2/Desktop/CBDD/docs/troubleshooting.md#query-and-index-issues` + +## Change Governance + +- Use feature branches from `main`. +- Open pull requests with at least one reviewer approval before merge. +- Keep release notes in pull request descriptions and tagged release notes. +- Run documented quality gates before requesting review. 
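+## Basic Usage Sketch
+
+The sketch below shows the embedded usage pattern the architecture overview describes: define an entity, derive a partial context from `DocumentDbContext`, and persist documents through implicit transactions. It follows the quick-start pattern from the previous README; the entity, context, and file names are illustrative, and using directives may differ from the published package namespaces.
+
+```csharp
+// Illustrative sketch; adjust using directives to the published CBDD namespaces.
+public class User
+{
+    public ObjectId Id { get; set; }
+    public string Name { get; set; } = string.Empty;
+}
+
+// The source generator emits InitializeCollections for partial contexts.
+public partial class MyDbContext : DocumentDbContext
+{
+    public DocumentCollection<User> Users { get; set; } = null!;
+
+    public MyDbContext(string path) : base(path) => InitializeCollections();
+}
+
+public static class UsageExample
+{
+    public static void Run()
+    {
+        using var db = new MyDbContext("mydb.db");
+
+        db.Users.Insert(new User { Name = "Alice" });
+        db.SaveChanges(); // implicit transaction commits here
+
+        var results = db.Users.AsQueryable()
+            .Where(u => u.Name.StartsWith("A"))
+            .AsEnumerable();
+    }
+}
+```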
+ +## Documentation Index + +- Documentation home: `/Users/dohertj2/Desktop/CBDD/docs/README.md` +- Major feature inventory: `/Users/dohertj2/Desktop/CBDD/docs/features/README.md` +- Architecture decisions: `/Users/dohertj2/Desktop/CBDD/docs/adr/0001-storage-engine-and-source-generation.md` diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..95208fb --- /dev/null +++ b/docs/README.md @@ -0,0 +1,25 @@ +# CBDD Documentation + +This folder is the canonical source for internal operational and engineering documentation. + +## Core Documents + +- Architecture: [`architecture.md`](architecture.md) +- Deployment: [`deployment.md`](deployment.md) +- Operations runbook: [`runbook.md`](runbook.md) +- Security controls: [`security.md`](security.md) +- Access model: [`access.md`](access.md) +- Troubleshooting guide: [`troubleshooting.md`](troubleshooting.md) + +## Major Features + +- Feature inventory: [`features/README.md`](features/README.md) + +## Architecture Decisions + +- Initial ADR: [`adr/0001-storage-engine-and-source-generation.md`](adr/0001-storage-engine-and-source-generation.md) + +## Reference Specifications + +- Engine RFC: [`../RFC.md`](../RFC.md) +- C-BSON format spec: [`../C-BSON.md`](../C-BSON.md) diff --git a/docs/access.md b/docs/access.md new file mode 100644 index 0000000..2f5eb6b --- /dev/null +++ b/docs/access.md @@ -0,0 +1,32 @@ +# Access And Permissions + +## Roles + +- Maintainer: merge authority, release authority, incident ownership. +- Reviewer: approves pull requests and validates architecture/security impact. +- Contributor: proposes changes through pull requests. +- Consumer: integrates published package versions in downstream applications. + +## Least-Privilege Model + +- Limit maintainer privileges to required release and incident responders. +- Use reviewer role for routine code review and documentation updates. +- Restrict package publishing credentials to release maintainers. + +## Approval Workflow + +1. Contributor opens pull request. +2. Reviewer validates tests, documentation, and risk impact. +3. Maintainer approves merge for high-risk or release-impacting changes. +4. Release maintainer publishes approved release artifacts. + +## Periodic Access Review + +1. Review maintainer and publisher access quarterly. +2. Remove inactive accounts and obsolete credentials. +3. Confirm access ownership in repository settings and package feed controls. + +## Emergency Access + +- Temporary elevated access requires a tracked incident issue. +- Remove temporary access immediately after incident closure. diff --git a/docs/adr/0001-storage-engine-and-source-generation.md b/docs/adr/0001-storage-engine-and-source-generation.md new file mode 100644 index 0000000..dad0fe1 --- /dev/null +++ b/docs/adr/0001-storage-engine-and-source-generation.md @@ -0,0 +1,32 @@ +# 0001 Storage Engine And Source Generation + +## Status + +Accepted + +## Context + +CBDD targets embedded workloads where predictable latency and low operational overhead are priorities. Runtime reflection and remote database dependencies increase startup and runtime variance for this workload profile. + +## Decision + +1. Use an embedded storage engine with page-based persistence and WAL-backed transactions. +2. Use compile-time source generation for mapping instead of runtime reflection. +3. Keep query and indexing execution in-process for deterministic behavior. + +## Consequences + +Positive: +- Lower runtime allocation and startup overhead. 
+- Strong control over transaction and recovery behavior. +- Predictable deployment for local/offline workloads. + +Trade-offs: +- Greater maintenance burden for custom storage/query engine internals. +- Source generator complexity requires dedicated regression coverage. + +## Related Documents + +- [`../architecture.md`](../architecture.md) +- [`../runbook.md`](../runbook.md) +- [`../../RFC.md`](../../RFC.md) diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..0e7ea51 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,46 @@ +# Architecture + +## System Context + +CBDD is an embedded database library consumed in-process by .NET applications. The host application owns process lifecycle, filesystem placement, and operational policy. + +External dependencies: +- .NET runtime and SDK +- Local filesystem +- Optional CI and package registry for build/release + +## Containers And Major Components + +1. `CBDD.Core` +- Owns storage engine, transaction protocol, WAL, indexing, query planning, and CDC plumbing. + +2. `CBDD.Bson` +- Owns BSON document model and span-based serialization/deserialization primitives. + +3. `CBDD.SourceGenerators` +- Generates mapping code at compile time for entity serialization and collection initialization. + +4. Consumer application +- Defines entities, `DocumentDbContext` subclasses, and operational behavior. + +## Data Flow + +1. Consumer invokes collection API through `DocumentDbContext`. +2. Mapper layer serializes entities to BSON via generated mappers. +3. Storage engine writes page data and WAL entries. +4. Index subsystem updates primary and secondary indexes. +5. Transaction commit persists durable state and emits CDC notifications where enabled. +6. Query path evaluates expression plans and uses indexes or scan fallback. + +## Reliability Model + +- Write-ahead logging enforces durability before logical commit completion. +- Snapshot isolation supports concurrent reads with transactional correctness. +- Recovery logic replays WAL on restart to restore committed state. + +## Cross References + +- Operational procedures: [`runbook.md`](runbook.md) +- Deployment and rollback: [`deployment.md`](deployment.md) +- Security controls: [`security.md`](security.md) +- Detailed protocol reference: [`../RFC.md`](../RFC.md) diff --git a/docs/deployment.md b/docs/deployment.md new file mode 100644 index 0000000..8d8aab3 --- /dev/null +++ b/docs/deployment.md @@ -0,0 +1,60 @@ +# Deployment + +## Scope + +This workflow covers releasing CBDD as an internal package and promoting a validated version for downstream consumers. + +## Promotion Path + +1. Development validation on feature branch. +2. Merge to `main` after required quality gates. +3. Tag release and publish package artifact. +4. Consumer rollout to target services/tools. + +## Pre-Deployment Checklist + +1. Run repository fitness gate. +```bash +bash scripts/fitness-check.sh +``` +2. Verify no pending incidents in current release window. +3. Confirm release notes include behavioral changes and migration notes. +4. Confirm package version bump strategy. + +## Release Procedure + +1. Build release artifacts. +```bash +dotnet build CBDD.slnx -c Release +dotnet test CBDD.slnx -c Release +``` +2. Pack publishable projects. +```bash +dotnet pack src/CBDD/ZB.MOM.WW.CBDD.csproj -c Release -o nupkgs +``` +3. Publish package to approved internal feed. +4. Create release tag and attach release notes. +5. Notify downstream teams of version and rollout guidance. 
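+Step 3 above depends on the approved internal feed. A publish command typically looks like the following sketch; the feed URL and the API key variable are placeholders rather than real endpoints or credentials, and `<version>` is the version produced by the pack step.
+
+```bash
+# Illustrative only: substitute the approved internal feed URL and credential source.
+dotnet nuget push "nupkgs/ZB.MOM.WW.CBDD.<version>.nupkg" \
+  --source "https://nuget.example.internal/v3/index.json" \
+  --api-key "$NUGET_INTERNAL_API_KEY"
+```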
+ +## Post-Deployment Validation + +1. Install released package in a smoke-test consumer. +2. Validate insert, query, transaction commit, and recovery startup behavior. +3. Verify CDC, vector, and geospatial capabilities when used by consuming teams. + +## Rollback Procedure + +Trigger rollback when release validation fails or production consumers detect regression. + +1. Stop further promotions of the failing version. +2. Revert to previous known-good package version in consumer manifests. +3. If package registry supports unlisting/yanking, unlist the bad version. +4. Open incident issue with impact, timeline, and containment actions. +5. Prepare and validate patch release before re-promotion. + +## Emergency Change Path + +1. Create hotfix branch from last good tag. +2. Apply minimal fix and run full quality gates. +3. Require maintainer approval. +4. Publish patched version and communicate mandatory upgrade guidance. diff --git a/docs/features/README.md b/docs/features/README.md new file mode 100644 index 0000000..dff391d --- /dev/null +++ b/docs/features/README.md @@ -0,0 +1,10 @@ +# Feature Inventory + +The following documents are the canonical reference for major CBDD capabilities: + +- [Storage and transactions](storage-transactions.md) +- [Query and indexing](query-and-indexing.md) +- [Vector search](vector-search.md) +- [Geospatial search](geospatial-search.md) +- [Change data capture](change-data-capture.md) +- [Source-generated mapping](source-generated-mapping.md) diff --git a/docs/features/change-data-capture.md b/docs/features/change-data-capture.md new file mode 100644 index 0000000..ff1902e --- /dev/null +++ b/docs/features/change-data-capture.md @@ -0,0 +1,61 @@ +# Change Data Capture + +## Purpose And Business Outcome + +Expose transactional change events so consumers can react to committed inserts, updates, and deletes. + +## Scope And Non-Goals + +Scope: +- Collection-level change stream subscriptions +- Event publication after successful commit + +Non-goals: +- Cross-process event transport guarantees +- External message broker delivery semantics + +## User And System Workflows + +1. Consumer subscribes to a collection change stream. +2. Application performs data mutations. +3. On commit, CDC publishes events to active subscribers. +4. Subscriber handlers process entity and metadata payloads. + +## Interfaces And APIs + +- `Watch(...)` collection API +- `ChangeStreamObservable` +- `ChangeStreamDispatcher` +- `ChangeStreamEvent` + +## Permissions And Data Handling + +- CDC payloads can include document identifiers and entity data. +- Restrict subscription access to trusted application components. + +## Dependencies And Failure Modes + +Dependencies: +- Transaction commit lifecycle +- Subscriber channel health + +Failure modes: +- Subscriber backpressure or handler exceptions +- Event handling assumptions that conflict with rollback behavior + +## Monitoring, Alerts, And Troubleshooting + +- Review CDC behavior in integration and scalability tests. +- Follow [`../runbook.md`](../runbook.md) for incident response. +- Follow [`../security.md`](../security.md) for event payload handling controls. +- Use [`../troubleshooting.md`](../troubleshooting.md) for diagnosis guidance. + +## Rollout And Change Considerations + +- Behavioral CDC changes require explicit release-note callouts. +- Maintain compatibility expectations for event payload shape. + +## Validation Guidance + +- Run `CdcTests` and `CdcScalabilityTests` before release. 
+- Validate commit-only emission behavior in regression tests. diff --git a/docs/features/geospatial-search.md b/docs/features/geospatial-search.md new file mode 100644 index 0000000..9c5310b --- /dev/null +++ b/docs/features/geospatial-search.md @@ -0,0 +1,60 @@ +# Geospatial Search + +## Purpose And Business Outcome + +Support location-aware queries such as nearest-neighbor and bounding-box lookups for geospatial workloads. + +## Scope And Non-Goals + +Scope: +- Spatial index configuration +- Proximity and bounding-box query operations + +Non-goals: +- Full GIS projection transformations +- External map tile services + +## User And System Workflows + +1. Consumer configures spatial index for coordinate fields. +2. Coordinates are serialized and stored with entity payloads. +3. Query path evaluates `Near` or `Within` predicates. +4. Engine returns matching entities. + +## Interfaces And APIs + +- Spatial index APIs in model configuration +- Query helpers under `GeoSpatialExtensions` +- Index implementation in `RTreeIndex` + +## Permissions And Data Handling + +- Geolocation data may be regulated or privacy-sensitive. +- Apply least-privilege access and retention limits. + +## Dependencies And Failure Modes + +Dependencies: +- Correct coordinate format and units +- Spatial index consistency + +Failure modes: +- Invalid coordinate values +- Unexpected results from bounding definitions or radius units + +## Monitoring, Alerts, And Troubleshooting + +- Validate geospatial paths through dedicated stress and correctness tests. +- Use [`../runbook.md`](../runbook.md) for escalation. +- Follow [`../security.md`](../security.md) for geolocation data protection controls. +- Use [`../troubleshooting.md`](../troubleshooting.md#query-and-index-issues) for issue resolution. + +## Rollout And Change Considerations + +- Document coordinate and unit assumptions in release notes when behavior changes. +- Validate backward compatibility for persisted spatial index pages. + +## Validation Guidance + +- Run `GeospatialTests` and `GeospatialStressTests` before release. +- Include representative proximity/bounding queries in smoke checks. diff --git a/docs/features/query-and-indexing.md b/docs/features/query-and-indexing.md new file mode 100644 index 0000000..8bbf044 --- /dev/null +++ b/docs/features/query-and-indexing.md @@ -0,0 +1,62 @@ +# Query And Indexing + +## Purpose And Business Outcome + +Deliver predictable query correctness and performance using expression translation with index-aware execution. + +## Scope And Non-Goals + +Scope: +- LINQ query translation +- Primary and secondary index usage +- Scan fallback where index optimization is not applicable + +Non-goals: +- Full SQL compatibility +- Distributed query federation + +## User And System Workflows + +1. Consumer submits LINQ expression. +2. Query planner evaluates index opportunities. +3. Engine executes index-backed or scan path. +4. Results are materialized to consumer entities. + +## Interfaces And APIs + +- `AsQueryable()` and query provider components +- `CollectionIndexManager` +- Index implementations under `src/CBDD.Core/Indexing` + +## Permissions And Data Handling + +- Query access follows host application authorization policy. +- Indexed data inherits the same sensitivity classification as source payloads. 
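+For reference, the workflow described under "User And System Workflows" can be sketched as follows. This is illustrative only: it assumes an initialized context `db` with a `People` collection, and the entity fields are examples rather than part of the API. Predicates such as equality, range comparisons, and `StartsWith` are candidates for index-backed execution; other predicates fall back to the scan path with in-memory evaluation.
+
+```csharp
+// Hypothetical entity and collection names; index usage depends on configured indexes.
+var firstTenAdults = db.People.AsQueryable()
+    .Where(p => p.Age > 25 && p.Name.StartsWith("A")) // index-friendly predicates
+    .OrderBy(p => p.Age)
+    .Take(10)
+    .AsEnumerable(); // executed by the engine, not by LINQ to Objects alone
+```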
+ +## Dependencies And Failure Modes + +Dependencies: +- Valid index metadata and storage consistency +- Expression visitor correctness + +Failure modes: +- Incorrect predicate translation +- Missing/ineffective indexes +- Performance regressions due to scan-heavy paths + +## Monitoring, Alerts, And Troubleshooting + +- Track query-related regressions in automated tests. +- Use [`../runbook.md`](../runbook.md) for incident handling. +- Follow [`../security.md`](../security.md) for sensitive data and access constraints. +- Use [`../troubleshooting.md`](../troubleshooting.md#query-and-index-issues) for remediation. + +## Rollout And Change Considerations + +- Query planner/index behavior changes require benchmark comparison and regression coverage. +- Document breaking semantics in release notes. + +## Validation Guidance + +- Run query, index, and optimizer test suites in `tests/CBDD.Tests`. +- Confirm coverage gate with `bash scripts/coverage-check.sh`. diff --git a/docs/features/source-generated-mapping.md b/docs/features/source-generated-mapping.md new file mode 100644 index 0000000..773cc01 --- /dev/null +++ b/docs/features/source-generated-mapping.md @@ -0,0 +1,61 @@ +# Source-Generated Mapping + +## Purpose And Business Outcome + +Generate compile-time mapping code to reduce runtime overhead and reflection risk for serialization paths. + +## Scope And Non-Goals + +Scope: +- Entity metadata analysis +- Mapper source generation +- Collection initialization helpers + +Non-goals: +- Runtime dynamic mapping for unknown schemas +- Support for unsupported C# patterns outside generator design + +## User And System Workflows + +1. Consumer defines entities and context patterns. +2. Build invokes source generator. +3. Generated mapper code is compiled into target project. +4. Runtime serialization path uses generated code. + +## Interfaces And APIs + +- Source generator project under `src/CBDD.SourceGenerators` +- Attributes in BSON and data-annotation mapping surface +- Generated initialization methods for context collections + +## Permissions And Data Handling + +- Generated code can expose field-level mapping behavior. +- Repository write permissions should be limited to trusted contributors. + +## Dependencies And Failure Modes + +Dependencies: +- Roslyn source generator execution during build +- Entity schema conventions + +Failure modes: +- Missing generation due to invalid entity declarations +- Serialization mismatch caused by attribute/model changes + +## Monitoring, Alerts, And Troubleshooting + +- Monitor build output for generator diagnostics. +- Use [`../runbook.md`](../runbook.md) for escalation. +- Follow [`../security.md`](../security.md) for review and control expectations. +- Use [`../troubleshooting.md`](../troubleshooting.md#source-generation-issues) for remediation steps. + +## Rollout And Change Considerations + +- Generator behavioral changes require broad regression tests across entities. +- Document any new constraints or unsupported patterns in release notes. + +## Validation Guidance + +- Run source generator and serialization tests in `tests/CBDD.Tests`. +- Validate mapper generation with clean `dotnet build` in CI. 
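+As a reference point, the sketch below shows how standard data annotations shape the generated mapping. The entity and field names are illustrative; the attribute behavior (collection naming, field renaming, `_id` mapping, exclusion) follows the attribute support documented in the repository README, and unannotated properties follow the lowercase key policy.
+
+```csharp
+using System.ComponentModel.DataAnnotations;
+using System.ComponentModel.DataAnnotations.Schema;
+
+// Illustrative entity; namespaces for ObjectId and related types may differ.
+[Table("orders")]
+public class Order
+{
+    [Key]
+    public ObjectId Id { get; set; }                 // maps to _id
+
+    [Column("customer_name")]
+    [Required]
+    public string CustomerName { get; set; } = "";   // persisted as customer_name
+
+    public decimal Total { get; set; }               // persisted as "total" (lowercase policy)
+
+    [NotMapped]
+    public string CacheKey { get; set; } = "";       // excluded from BSON serialization
+}
+```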
diff --git a/docs/features/storage-transactions.md b/docs/features/storage-transactions.md new file mode 100644 index 0000000..88a027b --- /dev/null +++ b/docs/features/storage-transactions.md @@ -0,0 +1,63 @@ +# Storage And Transactions + +## Purpose And Business Outcome + +Provide durable, ACID-compliant local persistence for embedded workloads that need consistent commit and recovery semantics. + +## Scope And Non-Goals + +Scope: +- Page-based storage +- Write-ahead logging +- Transaction lifecycle and commit/rollback semantics + +Non-goals: +- Distributed transactions +- Multi-node replication + +## User And System Workflows + +1. Application writes through `DocumentDbContext`. +2. Engine records WAL entries. +3. Commit persists pages and marks transaction durable. +4. Recovery replays WAL to restore committed state after restart. + +## Interfaces And APIs + +- `DocumentDbContext` +- `Transaction` and `ITransaction` +- `WriteAheadLog` +- Storage engine modules under `src/CBDD.Core/Storage` + +## Permissions And Data Handling + +- Database files require host-managed filesystem access controls. +- Transaction data should be treated as sensitive if payloads contain regulated information. + +## Dependencies And Failure Modes + +Dependencies: +- Local filesystem I/O +- WAL and page file consistency + +Failure modes: +- Interrupted writes +- Corrupted WAL entries +- Invalid page metadata after unsafe process termination + +## Monitoring, Alerts, And Troubleshooting + +- Use CI/test failures and incident issues as primary signals. +- Follow [`../runbook.md`](../runbook.md) for triage. +- Follow [`../security.md`](../security.md) for data handling and control requirements. +- Use [`../troubleshooting.md`](../troubleshooting.md#data-file-and-recovery-issues) for recovery issues. + +## Rollout And Change Considerations + +- Any storage format or WAL behavior change requires migration and rollback validation. +- Release notes must document backward compatibility impact. + +## Validation Guidance + +- Run transaction and recovery tests in `tests/CBDD.Tests`. +- Execute `dotnet test CBDD.slnx -c Release` before merge. diff --git a/docs/features/vector-search.md b/docs/features/vector-search.md new file mode 100644 index 0000000..6d2f3e5 --- /dev/null +++ b/docs/features/vector-search.md @@ -0,0 +1,60 @@ +# Vector Search + +## Purpose And Business Outcome + +Enable similarity search for embedding-driven workloads directly in embedded storage. + +## Scope And Non-Goals + +Scope: +- Vector index configuration +- Approximate nearest-neighbor query execution + +Non-goals: +- External model training +- Cross-database vector federation + +## User And System Workflows + +1. Consumer registers vector index for embedding field. +2. Documents persist embeddings in collection payloads. +3. Query issues vector search request with `k` nearest neighbors. +4. Engine returns ranked matches. + +## Interfaces And APIs + +- Vector index configuration via model builder +- Query extensions under `VectorSearchExtensions` +- Index implementation in `VectorSearchIndex` + +## Permissions And Data Handling + +- Embeddings may contain sensitive semantic information. +- Apply host-level access restrictions and retention controls. 
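+The configuration and query workflow described above looks roughly like the sketch below, which mirrors the vector-search example from the previous README. The entity name, embedding field, and dimension value are illustrative, and `modelBuilder`, `db`, and `queryVector` come from consumer code; index parameters must match the embeddings actually stored.
+
+```csharp
+// Illustrative sketch with hypothetical entity and field names.
+// 1. Register a vector index for the embedding field during model configuration.
+modelBuilder.Entity<Item>()
+    .HasVectorIndex(x => x.Embedding, dimensions: 1536, metric: VectorMetric.Cosine);
+
+// 2. Query the k nearest neighbors for a query vector.
+var matches = db.Items.AsQueryable()
+    .VectorSearch(x => x.Embedding, queryVector, k: 5)
+    .ToList();
+```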
+ +## Dependencies And Failure Modes + +Dependencies: +- Correct embedding dimensionality +- Index parameter tuning for workload + +Failure modes: +- Dimension mismatch between data and query vectors +- Poor recall due to incorrect index configuration + +## Monitoring, Alerts, And Troubleshooting + +- Validate vector query quality during release smoke checks. +- Use [`../runbook.md`](../runbook.md) for incident handling. +- Follow [`../security.md`](../security.md) for embedding-data handling controls. +- Use [`../troubleshooting.md`](../troubleshooting.md#query-and-index-issues) for vector query remediation. + +## Rollout And Change Considerations + +- Treat vector index parameter changes as performance-sensitive releases. +- Document compatibility impact for existing persisted indexes. + +## Validation Guidance + +- Run vector search tests in `tests/CBDD.Tests/VectorSearchTests.cs`. +- Add benchmark runs for large-vector workloads before release. diff --git a/docs/runbook.md b/docs/runbook.md new file mode 100644 index 0000000..bbe7220 --- /dev/null +++ b/docs/runbook.md @@ -0,0 +1,56 @@ +# Runbook + +## Purpose + +This runbook provides standard operations, incident triage, escalation, and recovery procedures for CBDD maintainers. + +## Signals And Entry Points + +- CI failures on `main` +- Failing integration tests in consumer repositories +- Regression issues labeled `incident` +- Recovery or corruption reports from consumers + +## Alert Triage Procedure + +1. Capture incident context: version, environment, failing operation, and first failure timestamp. +2. Classify severity: +- `SEV-1`: data loss risk, persistent startup failure, or transaction correctness risk. +- `SEV-2`: feature-level regression without confirmed data loss. +- `SEV-3`: non-critical behavior or documentation defects. +3. Create or update the incident issue with owner and current mitigation status. +4. Reproduce with targeted tests in `/Users/dohertj2/Desktop/CBDD/tests/CBDD.Tests`. + +## Diagnostics + +1. Validate build and tests. +```bash +dotnet test CBDD.slnx -c Release +``` +2. Run coverage threshold gate when behavior changed in core paths. +```bash +bash scripts/coverage-check.sh +``` +3. For storage and recovery incidents, prioritize: +- `StorageEngine.Recovery` +- `WriteAheadLog` +- transaction protocol tests + +## Escalation Path + +1. Initial owner: maintainer on incident issue. +2. Escalate to release maintainer when severity is `SEV-1` or rollback is required. +3. Communicate status updates on each milestone: triage complete, mitigation active, fix merged, validation complete. + +## Recovery Actions + +1. Contain impact by pinning consumers to last known-good package version. +2. Apply rollback steps from [`deployment.md`](deployment.md#rollback-procedure). +3. Validate repaired build with targeted and full regression suites. +4. Publish fixed package and confirm consumer recovery. + +## Post-Incident Expectations + +1. Document root cause, blast radius, and timeline. +2. Add regression tests to prevent recurrence. +3. Record follow-up actions in issue tracker with owners and due dates. diff --git a/docs/security.md b/docs/security.md new file mode 100644 index 0000000..cda85d0 --- /dev/null +++ b/docs/security.md @@ -0,0 +1,34 @@ +# Security + +## Scope + +CBDD is an embedded data engine. Security controls are shared between the library and the host application that embeds it. + +## Authentication And Authorization Model + +- CBDD does not provide built-in user authentication. 
+- Authorization is enforced by the host process and filesystem permissions. +- Access to database files must be limited to trusted service identities. + +## Data Classification And Handling + +- Treat persisted database files as sensitive when they contain customer or regulated data. +- Do not store secrets in source, fixtures, or benchmark datasets. +- Apply environment-specific retention and backup controls outside this repository. + +## Storage And Cryptography Controls + +- CBDD enforces integrity through WAL and transactional semantics. +- Encryption at rest and key management are host responsibilities. +- If encryption is required, use filesystem or volume-level encryption managed by platform security controls. + +## Secure Coding Expectations + +1. Require code review for storage, WAL, indexing, query, and serialization changes. +2. Add targeted tests for all security-relevant behavior changes. +3. Run package vulnerability checks in fitness pipeline. + +## Incident Handling + +- Follow [`runbook.md`](runbook.md) for incident triage and escalation. +- Label security-impacting issues with `security` and prioritize immediate containment. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 0000000..2317698 --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,62 @@ +# Troubleshooting + +## Build And Test Failures + +### Symptom +`dotnet build` or `dotnet test` fails locally or in CI. + +### Checks +1. Verify .NET 10 SDK is installed. +2. Run `dotnet restore`. +3. Run `dotnet format --verify-no-changes`. +4. Re-run tests with `dotnet test CBDD.slnx -c Release`. + +### Resolution +- Fix reported compile/test failures before merge. +- For flaky tests, isolate affected test class and open an issue with reproduction details. + +## Data File And Recovery Issues + +### Symptom +Database startup fails or recovery path throws WAL/storage errors. + +### Checks +1. Capture exact exception and stack trace. +2. Reproduce with storage/recovery-focused tests. +3. Validate rollback path from [`deployment.md`](deployment.md#rollback-procedure). + +### Resolution +- Pin consumers to last known-good package. +- Apply fix and add regression coverage in recovery/transaction tests. + +## Query And Index Issues + +### Symptom +Unexpected query performance or incorrect query results. + +### Checks +1. Verify relevant indexes are configured for query predicates. +2. Reproduce with test cases in `tests/CBDD.Tests` for query/index modules. +3. Validate behavior for scan fallback and expression translation. + +### Resolution +- Add or adjust index definitions and query tests. +- Document any changed query semantics in release notes. + +## Source Generation Issues + +### Symptom +Generated mappers missing or serialization behavior differs from expectations. + +### Checks +1. Verify entity attributes and accessibility are valid for source generation. +2. Build solution to regenerate mapper output. +3. Validate related tests in source generator test coverage. + +### Resolution +- Update entity annotations or generator logic. +- Add focused regression tests for unsupported pattern handling. + +## Escalation + +If troubleshooting steps do not resolve the issue, follow incident escalation in [`runbook.md`](runbook.md). 
diff --git a/src/CBDD.Core/Collections/DocumentCollection.cs b/src/CBDD.Core/Collections/DocumentCollection.cs index d898682..e75159d 100755 --- a/src/CBDD.Core/Collections/DocumentCollection.cs +++ b/src/CBDD.Core/Collections/DocumentCollection.cs @@ -124,14 +124,28 @@ public partial class DocumentCollection : IDisposable where T : class _indexManager.SetPrimaryRootPageId(_primaryIndex.RootPageId); } - // Register keys used by the mapper to ensure they are available for compression - _storage.RegisterKeys(_mapper.UsedKeys); - } - - private void EnsureSchema() - { - var currentSchema = _mapper.GetSchema(); - var metadata = _indexManager.GetMetadata(); + // Register keys used by the mapper to ensure they are available for compression + _storage.RegisterKeys(_mapper.UsedKeys); + } + + private void RefreshPrimaryIndexRootFromMetadata() + { + _indexManager.RefreshFromStorageMetadata(); + + var primaryRootPageId = _indexManager.PrimaryRootPageId; + if (primaryRootPageId == 0) + return; + + if (primaryRootPageId != _primaryIndex.RootPageId) + { + _primaryIndex.SetRootPageId(primaryRootPageId); + } + } + + private void EnsureSchema() + { + var currentSchema = _mapper.GetSchema(); + var metadata = _indexManager.GetMetadata(); var persistedSchemas = _storage.GetSchemas(metadata.SchemaRootPageId); var latestPersisted = persistedSchemas.Count > 0 ? persistedSchemas[persistedSchemas.Count - 1] : null; @@ -363,12 +377,13 @@ public partial class DocumentCollection : IDisposable where T : class /// Rebuilds an index by scanning all existing documents and re-inserting them. /// Called automatically when creating a new index. /// - private void RebuildIndex(CollectionSecondaryIndex index) - { - var transaction = _transactionHolder.GetCurrentTransactionOrStart(); - // Iterate all documents in the collection via primary index - var minKey = new IndexKey(Array.Empty()); - var maxKey = new IndexKey(Enumerable.Repeat((byte)0xFF, 32).ToArray()); + private void RebuildIndex(CollectionSecondaryIndex index) + { + RefreshPrimaryIndexRootFromMetadata(); + var transaction = _transactionHolder.GetCurrentTransactionOrStart(); + // Iterate all documents in the collection via primary index + var minKey = new IndexKey(Array.Empty()); + var maxKey = new IndexKey(Enumerable.Repeat((byte)0xFF, 32).ToArray()); foreach (var entry in _primaryIndex.Range(minKey, maxKey, IndexDirection.Forward, transaction.TransactionId)) { @@ -967,6 +982,7 @@ public partial class DocumentCollection : IDisposable where T : class private void InsertDataCore(TId id, T entity, ReadOnlySpan docData) { + RefreshPrimaryIndexRootFromMetadata(); var transaction = _transactionHolder.GetCurrentTransactionOrStart(); var (storedPayloadOverride, storedPayloadFlags) = PreparePayloadForStorage(docData); ReadOnlySpan storedPayload = storedPayloadOverride is null ? docData : storedPayloadOverride; @@ -1005,11 +1021,12 @@ public partial class DocumentCollection : IDisposable where T : class /// ObjectId of the document /// Optional transaction for isolation (supports Read Your Own Writes) /// The document, or null if not found - public T? FindById(TId id) - { - var transaction = _transactionHolder.GetCurrentTransactionOrStart(); - try - { + public T? 
FindById(TId id) + { + RefreshPrimaryIndexRootFromMetadata(); + var transaction = _transactionHolder.GetCurrentTransactionOrStart(); + try + { var key = _mapper.ToIndexKey(id); if (!_primaryIndex.TryFind(key, out var location, transaction.TransactionId)) @@ -1031,11 +1048,12 @@ public partial class DocumentCollection : IDisposable where T : class /// /// Transaction for isolation (REQUIRED for consistent reads during concurrent writes) /// Enumerable of all documents - public IEnumerable FindAll() - { - var transaction = _transactionHolder.GetCurrentTransactionOrStart(); - var txnId = transaction?.TransactionId ?? 0; - var minKey = new IndexKey(Array.Empty()); + public IEnumerable FindAll() + { + RefreshPrimaryIndexRootFromMetadata(); + var transaction = _transactionHolder.GetCurrentTransactionOrStart(); + var txnId = transaction?.TransactionId ?? 0; + var minKey = new IndexKey(Array.Empty()); var maxKey = new IndexKey(Enumerable.Repeat((byte)0xFF, 32).ToArray()); foreach (var entry in _primaryIndex.Range(minKey, maxKey, IndexDirection.Forward, txnId)) @@ -1202,11 +1220,12 @@ public partial class DocumentCollection : IDisposable where T : class } } - private int UpdateBulkInternal(List entityList) - { - var transaction = _transactionHolder.GetCurrentTransactionOrStart(); - int updateCount = 0; - const int BATCH_SIZE = 50; + private int UpdateBulkInternal(List entityList) + { + RefreshPrimaryIndexRootFromMetadata(); + var transaction = _transactionHolder.GetCurrentTransactionOrStart(); + int updateCount = 0; + const int BATCH_SIZE = 50; for (int batchStart = 0; batchStart < entityList.Count; batchStart += BATCH_SIZE) { @@ -1272,6 +1291,7 @@ public partial class DocumentCollection : IDisposable where T : class private bool UpdateDataCore(TId id, T entity, ReadOnlySpan docData) { + RefreshPrimaryIndexRootFromMetadata(); var transaction = _transactionHolder.GetCurrentTransactionOrStart(); var key = _mapper.ToIndexKey(id); var (storedPayloadOverride, storedPayloadFlags) = PreparePayloadForStorage(docData); @@ -1438,11 +1458,12 @@ public partial class DocumentCollection : IDisposable where T : class return deleteCount; } - private bool DeleteCore(TId id, bool notifyCdc = true) - { - var transaction = _transactionHolder.GetCurrentTransactionOrStart(); - var key = _mapper.ToIndexKey(id); - if (!_primaryIndex.TryFind(key, out var location, transaction.TransactionId)) + private bool DeleteCore(TId id, bool notifyCdc = true) + { + RefreshPrimaryIndexRootFromMetadata(); + var transaction = _transactionHolder.GetCurrentTransactionOrStart(); + var key = _mapper.ToIndexKey(id); + if (!_primaryIndex.TryFind(key, out var location, transaction.TransactionId)) return false; // Notify secondary indexes BEFORE deleting document from storage @@ -1524,11 +1545,12 @@ public partial class DocumentCollection : IDisposable where T : class /// /// Optional transaction for isolation /// Number of documents - public int Count() - { - var transaction = _transactionHolder.GetCurrentTransactionOrStart(); - // Count all entries in primary index - // Use generic min/max keys for the index + public int Count() + { + RefreshPrimaryIndexRootFromMetadata(); + var transaction = _transactionHolder.GetCurrentTransactionOrStart(); + // Count all entries in primary index + // Use generic min/max keys for the index var minKey = IndexKey.MinKey; var maxKey = IndexKey.MaxKey; return _primaryIndex.Range(minKey, maxKey, IndexDirection.Forward, transaction.TransactionId).Count(); diff --git a/src/CBDD.Core/Indexing/BTreeIndex.cs 
b/src/CBDD.Core/Indexing/BTreeIndex.cs index a6e0dac..ac155b0 100755 --- a/src/CBDD.Core/Indexing/BTreeIndex.cs +++ b/src/CBDD.Core/Indexing/BTreeIndex.cs @@ -82,6 +82,18 @@ public sealed class BTreeIndex /// Gets the current root page identifier for the B+Tree. /// public uint RootPageId => _rootPageId; + + /// + /// Updates the in-memory root page identifier. + /// + /// The root page identifier to use for subsequent operations. + internal void SetRootPageId(uint rootPageId) + { + if (rootPageId == 0) + throw new ArgumentOutOfRangeException(nameof(rootPageId)); + + _rootPageId = rootPageId; + } /// /// Reads a page using StorageEngine for transaction isolation. diff --git a/src/CBDD.Core/Indexing/CollectionIndexManager.cs b/src/CBDD.Core/Indexing/CollectionIndexManager.cs index a95ebe9..ebf0cec 100755 --- a/src/CBDD.Core/Indexing/CollectionIndexManager.cs +++ b/src/CBDD.Core/Indexing/CollectionIndexManager.cs @@ -504,6 +504,37 @@ public sealed class CollectionIndexManager : IDisposable where T : class /// public uint PrimaryRootPageId => _metadata.PrimaryRootPageId; + /// + /// Refreshes cached metadata and index root bindings from storage. + /// + internal void RefreshFromStorageMetadata() + { + lock (_lock) + { + if (_disposed) + throw new ObjectDisposedException(nameof(CollectionIndexManager)); + + var latest = _storage.GetCollectionMetadata(_collectionName) ?? new CollectionMetadata { Name = _collectionName }; + if (MetadataEquals(_metadata, latest)) + return; + + foreach (var index in _indexes.Values) + { + try { index.Dispose(); } catch { /* Best effort */ } + } + + _indexes.Clear(); + _metadata = latest; + + foreach (var idxMeta in _metadata.Indexes) + { + var definition = RebuildDefinition(idxMeta.Name, idxMeta.PropertyPaths, idxMeta.IsUnique, idxMeta.Type, idxMeta.Dimensions, idxMeta.Metric); + var index = new CollectionSecondaryIndex(definition, _storage, _mapper, idxMeta.RootPageId); + _indexes[idxMeta.Name] = index; + } + } + } + /// /// Sets the root page identifier for the primary index. /// @@ -526,11 +557,52 @@ public sealed class CollectionIndexManager : IDisposable where T : class /// The collection metadata. public CollectionMetadata GetMetadata() => _metadata; - private void SaveMetadata() - { - UpdateMetadata(); - _storage.SaveCollectionMetadata(_metadata); - } + private void SaveMetadata() + { + UpdateMetadata(); + _storage.SaveCollectionMetadata(_metadata); + } + + private static bool MetadataEquals(CollectionMetadata left, CollectionMetadata right) + { + if (!string.Equals(left.Name, right.Name, StringComparison.OrdinalIgnoreCase)) + return false; + + if (left.PrimaryRootPageId != right.PrimaryRootPageId || + left.SchemaRootPageId != right.SchemaRootPageId || + left.Indexes.Count != right.Indexes.Count) + { + return false; + } + + for (var i = 0; i < left.Indexes.Count; i++) + { + var l = left.Indexes[i]; + var r = right.Indexes[i]; + if (!string.Equals(l.Name, r.Name, StringComparison.OrdinalIgnoreCase) || + l.RootPageId != r.RootPageId || + l.Type != r.Type || + l.IsUnique != r.IsUnique || + l.Dimensions != r.Dimensions || + l.Metric != r.Metric) + { + return false; + } + + var lPaths = l.PropertyPaths ?? Array.Empty(); + var rPaths = r.PropertyPaths ?? Array.Empty(); + if (lPaths.Length != rPaths.Length) + return false; + + for (var p = 0; p < lPaths.Length; p++) + { + if (!string.Equals(lPaths[p], rPaths[p], StringComparison.Ordinal)) + return false; + } + } + + return true; + } /// /// Releases resources used by the index manager. 
diff --git a/src/CBDD.Core/Storage/PageFile.cs b/src/CBDD.Core/Storage/PageFile.cs index 81d0b2c..6a9a16b 100755 --- a/src/CBDD.Core/Storage/PageFile.cs +++ b/src/CBDD.Core/Storage/PageFile.cs @@ -645,6 +645,39 @@ public sealed class PageFile : IDisposable } } + /// + /// Trims excess physical file capacity beyond the current logical page count. + /// + /// The number of bytes removed from the file. + public long TrimExcessCapacityToLogicalPageCount() + { + lock (_lock) + { + EnsureFileOpen(); + + var targetLengthBytes = (long)_nextPageId * _config.PageSize; + var currentLengthBytes = _fileStream!.Length; + if (currentLengthBytes <= targetLengthBytes) + return 0; + + _mappedFile?.Dispose(); + _mappedFile = null; + + _fileStream.SetLength(targetLengthBytes); + _fileStream.Flush(flushToDisk: true); + + _mappedFile = MemoryMappedFile.CreateFromFile( + _fileStream, + null, + targetLengthBytes, + _config.Access, + HandleInheritability.None, + leaveOpen: true); + + return currentLengthBytes - targetLengthBytes; + } + } + /// /// Defragments a slotted page in place by packing live slot payloads densely at the end of the page. /// diff --git a/src/CBDD.Core/Storage/StorageEngine.Maintenance.cs b/src/CBDD.Core/Storage/StorageEngine.Maintenance.cs index 23f6768..e88a333 100644 --- a/src/CBDD.Core/Storage/StorageEngine.Maintenance.cs +++ b/src/CBDD.Core/Storage/StorageEngine.Maintenance.cs @@ -1,8 +1,11 @@ using System.Text; using System.Text.Json; using System.Text.Json.Serialization; +using System.Buffers.Binary; using System.IO.MemoryMappedFiles; using ZB.MOM.WW.CBDD.Core.Indexing; +using ZB.MOM.WW.CBDD.Core.Indexing.Internal; +using ZB.MOM.WW.CBDD.Core.Transactions; namespace ZB.MOM.WW.CBDD.Core.Storage; @@ -535,6 +538,7 @@ public sealed partial class StorageEngine { CheckpointInternal(); _pageFile.Flush(); + ValidateCollectionMetadataAndPrimaryIndexPointers(ct); EnsureCompactionArtifactDirectory(tempPath); EnsureCompactionArtifactDirectory(backupPath); @@ -554,12 +558,14 @@ public sealed partial class StorageEngine { if (!File.Exists(tempPath)) { + ValidateCollectionMetadataAndPrimaryIndexPointers(ct); (work, tailResult) = RunCopySwapCompactionOnTemporaryFile(tempPath, options, ct); } DeleteFileIfExists(backupPath); _pageFile.SnapshotToFile(backupPath); _pageFile.ReplaceFromFile(tempPath); + ReloadDictionaryStateAfterCompactionSwap(); try { @@ -568,6 +574,7 @@ public sealed partial class StorageEngine catch { _pageFile.ReplaceFromFile(backupPath); + ReloadDictionaryStateAfterCompactionSwap(); throw; } @@ -1010,70 +1017,938 @@ public sealed partial class StorageEngine CompactionOptions options, CancellationToken ct) { - // Snapshot current storage first, then run maintenance operations on the temp copy. 
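+        // The rewritten path below no longer defragments the snapshot in place.
+        // Instead it rebuilds the database logically into a fresh temporary
+        // StorageEngine (dictionary, schemas, documents, and every index), so the
+        // file that is later swapped back in contains only live data.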
DeleteFileIfExists(tempPath); DeleteFileIfExists(GetCompactionTempWalPath(tempPath)); - _pageFile.SnapshotToFile(tempPath); - return RunPhysicalCompactionOnTemporaryFile(tempPath, options, ct); - } - - private (CompactionWork Work, TailTruncationResult TailResult) RunPhysicalCompactionOnTemporaryFile( - string tempPath, - CompactionOptions options, - CancellationToken ct) - { - var tempConfig = _pageFile.Config; - if (tempConfig.Access != MemoryMappedFileAccess.ReadWrite) - { - tempConfig = new PageFileConfig - { - PageSize = tempConfig.PageSize, - InitialFileSize = tempConfig.InitialFileSize, - Access = MemoryMappedFileAccess.ReadWrite - }; - } - - using var tempPageFile = new PageFile(tempPath, tempConfig); - tempPageFile.Open(); - var work = new CompactionWork(); - if (options.NormalizeFreeList) - { - work.FreeListPagesNormalized = tempPageFile.NormalizeFreeList(includeEmptyPages: true); - } + TailTruncationResult tailResult = default; - if (options.DefragmentSlottedPages) + var logicalSourceSizeBytes = Math.Max((long)_pageFile.NextPageId * _pageFile.PageSize, _pageFile.PageSize * 2L); + var tempConfig = CreateWritableCompactionPageFileConfig(_pageFile.Config, logicalSourceSizeBytes); + var tempMaintenance = CreateCompactionTempMaintenanceOptions(); + using var tempEngine = new StorageEngine( + tempPath, + tempConfig, + _compressionOptions, + tempMaintenance); + + tempEngine.WriteStorageFormatMetadata(_storageFormatMetadata); + _ = tempEngine._pageFile.NormalizeFreeList(includeEmptyPages: true); + CopyDictionaryMappingDeterministically(tempEngine, ct); + + var sourceCollections = GetAllCollectionMetadata(); + + var relocatedSourcePages = new HashSet(); + + foreach (var sourceMetadata in sourceCollections) { - var pageCount = tempPageFile.NextPageId; - for (uint pageId = 1; pageId < pageCount; pageId++) + ct.ThrowIfCancellationRequested(); + var collectionResult = RebuildCollectionForCompaction(tempEngine, sourceMetadata, ct); + tempEngine.SaveCollectionMetadata(collectionResult.Metadata); + + work.DocumentsRelocated += collectionResult.DocumentsRelocated; + foreach (var relocatedPageId in collectionResult.RelocatedSourcePageIds) { - ct.ThrowIfCancellationRequested(); - var result = tempPageFile.DefragmentSlottedPageWithStats(pageId); - if (result.Changed) - { - work.PagesDefragmented++; - work.BytesReclaimedByDefragmentation += result.ReclaimedBytes; - if (result.RelocatedSlotCount > 0) - { - work.DocumentsRelocated += result.RelocatedSlotCount; - work.PagesRelocated++; - } - } - - work.PagesScanned++; + relocatedSourcePages.Add(relocatedPageId); } } - TailTruncationResult tailResult = default; - if (options.EnableTailTruncation) + work.PagesRelocated = relocatedSourcePages.Count; + work.PagesScanned = Math.Max(0, (int)Math.Max(0, _pageFile.NextPageId - 1)); + + if (options.NormalizeFreeList) { - tailResult = tempPageFile.TruncateReclaimableTailPages(options.MinimumRetainedPages); + work.FreeListPagesNormalized = tempEngine._pageFile.NormalizeFreeList(includeEmptyPages: true); } - tempPageFile.Flush(); + if (options.EnableTailTruncation) + { + tailResult = tempEngine._pageFile.TruncateReclaimableTailPages(options.MinimumRetainedPages); + } + + _ = tempEngine._pageFile.TrimExcessCapacityToLogicalPageCount(); + tempEngine.CheckpointInternal(); + tempEngine._pageFile.Flush(); return (work, tailResult); } + private static PageFileConfig CreateWritableCompactionPageFileConfig(PageFileConfig currentConfig, long sourceFileSize) + { + var minimumFileSize = Math.Max(currentConfig.PageSize * 2L, 
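+            // minimumFileSize keeps the temporary compaction file at least two pages long;
+            // initialFileSize otherwise follows the source file's logical size, and the
+            // mapping is always forced to ReadWrite even when the source page file was
+            // opened read-only.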
0L); + var initialFileSize = Math.Max(sourceFileSize, minimumFileSize); + + if (currentConfig.Access == MemoryMappedFileAccess.ReadWrite) + { + return new PageFileConfig + { + PageSize = currentConfig.PageSize, + InitialFileSize = initialFileSize, + Access = currentConfig.Access + }; + } + + return new PageFileConfig + { + PageSize = currentConfig.PageSize, + InitialFileSize = initialFileSize, + Access = MemoryMappedFileAccess.ReadWrite + }; + } + + private MaintenanceOptions CreateCompactionTempMaintenanceOptions() + { + return new MaintenanceOptions + { + RunAtStartup = false, + MinFragmentationPercent = _maintenanceOptions.MinFragmentationPercent, + MinReclaimableBytes = _maintenanceOptions.MinReclaimableBytes, + MaxRunDuration = _maintenanceOptions.MaxRunDuration, + OnlineThrottlePagesPerBatch = _maintenanceOptions.OnlineThrottlePagesPerBatch, + OnlineThrottleDelay = _maintenanceOptions.OnlineThrottleDelay + }; + } + + private readonly struct CompactionCollectionRebuildResult + { + public CompactionCollectionRebuildResult( + CollectionMetadata metadata, + long documentsRelocated, + IReadOnlyCollection relocatedSourcePageIds) + { + Metadata = metadata; + DocumentsRelocated = documentsRelocated; + RelocatedSourcePageIds = relocatedSourcePageIds; + } + + public CollectionMetadata Metadata { get; } + public long DocumentsRelocated { get; } + public IReadOnlyCollection RelocatedSourcePageIds { get; } + } + + private sealed class CompactionDataWriterState + { + public Dictionary FreeSpaceByPage { get; } = new(); + public uint CurrentDataPageId { get; set; } + } + + private readonly struct CompactionVectorNode + { + public CompactionVectorNode(uint pageId, int nodeIndex, DocumentLocation location, float[] vector) + { + PageId = pageId; + NodeIndex = nodeIndex; + Location = location; + Vector = vector; + } + + public uint PageId { get; } + public int NodeIndex { get; } + public DocumentLocation Location { get; } + public float[] Vector { get; } + } + + private CompactionCollectionRebuildResult RebuildCollectionForCompaction( + StorageEngine tempEngine, + CollectionMetadata sourceMetadata, + CancellationToken ct) + { + var targetMetadata = new CollectionMetadata + { + Name = sourceMetadata.Name + }; + + targetMetadata.SchemaRootPageId = CopySchemaChainForCompaction(tempEngine, sourceMetadata.SchemaRootPageId, ct); + + if (sourceMetadata.PrimaryRootPageId == 0) + { + foreach (var sourceIndexMetadata in sourceMetadata.Indexes) + { + if (sourceIndexMetadata.RootPageId != 0) + { + throw new InvalidDataException( + $"Compaction validation failed for collection '{sourceMetadata.Name}': " + + $"secondary index '{sourceIndexMetadata.Name}' has a root but primary index root is missing."); + } + + targetMetadata.Indexes.Add(CloneIndexMetadata(sourceIndexMetadata, rootPageId: 0)); + } + + return new CompactionCollectionRebuildResult(targetMetadata, 0, Array.Empty()); + } + + var dataWriter = new CompactionDataWriterState(); + var relocatedSourcePages = new HashSet(); + var locationRemap = new Dictionary(); + + var resolvedPrimaryRoot = ResolveCurrentBTreeRootPageId( + sourceMetadata.PrimaryRootPageId, + sourceMetadata.Name, + "primary index"); + var sourcePrimary = new BTreeIndex(this, IndexOptions.CreateUnique("_id"), resolvedPrimaryRoot); + var targetPrimary = CreateCompactionBTreeAtRoot( + tempEngine, + IndexOptions.CreateUnique("_id"), + rootPageId: 0); + targetMetadata.PrimaryRootPageId = targetPrimary.RootPageId; + + long relocatedDocuments = 0; + using (var transaction = 
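+            // The per-collection rebuild (primary-index scan, payload copy, and
+            // secondary-index re-insertion) runs in a single transaction on the
+            // temporary engine and commits only after every index has been rebuilt.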
tempEngine.BeginTransaction()) + { + foreach (var primaryEntry in sourcePrimary.Range( + IndexKey.MinKey, + IndexKey.MaxKey, + IndexDirection.Forward, + transactionId: 0)) + { + ct.ThrowIfCancellationRequested(); + + if (!TryReadStoredPayload(primaryEntry.Location, out var storedPayload, out var isCompressed)) + { + throw new InvalidDataException( + $"Compaction rebuild failed for collection '{sourceMetadata.Name}': " + + $"cannot read source payload at (page {primaryEntry.Location.PageId}, slot {primaryEntry.Location.SlotIndex})."); + } + + var rebuiltLocation = InsertStoredPayloadForCompaction(tempEngine, storedPayload, isCompressed, dataWriter); + targetPrimary.Insert(primaryEntry.Key, rebuiltLocation, transaction.TransactionId); + locationRemap[primaryEntry.Location] = rebuiltLocation; + relocatedSourcePages.Add(primaryEntry.Location.PageId); + relocatedDocuments++; + } + + foreach (var sourceIndexMetadata in sourceMetadata.Indexes) + { + ct.ThrowIfCancellationRequested(); + var rebuiltIndexMetadata = RebuildSecondaryIndexForCompaction( + tempEngine, + sourceMetadata.Name, + sourceIndexMetadata, + locationRemap, + transaction, + ct); + targetMetadata.Indexes.Add(rebuiltIndexMetadata); + } + + transaction.Commit(); + } + + tempEngine.CheckpointInternal(); + tempEngine._pageFile.Flush(); + + return new CompactionCollectionRebuildResult(targetMetadata, relocatedDocuments, relocatedSourcePages.ToArray()); + } + + private static IndexMetadata CloneIndexMetadata(IndexMetadata source, uint rootPageId) + { + return new IndexMetadata + { + Name = source.Name, + IsUnique = source.IsUnique, + Type = source.Type, + PropertyPaths = source.PropertyPaths.ToArray(), + Dimensions = source.Dimensions, + Metric = source.Metric, + RootPageId = rootPageId + }; + } + + private uint CopySchemaChainForCompaction(StorageEngine tempEngine, uint sourceSchemaRootPageId, CancellationToken ct) + { + if (sourceSchemaRootPageId == 0) + return 0; + + var schemas = GetSchemas(sourceSchemaRootPageId); + uint rebuiltSchemaRoot = 0; + foreach (var schema in schemas) + { + ct.ThrowIfCancellationRequested(); + rebuiltSchemaRoot = tempEngine.AppendSchema(rebuiltSchemaRoot, schema); + } + + return rebuiltSchemaRoot; + } + + private IndexMetadata RebuildSecondaryIndexForCompaction( + StorageEngine tempEngine, + string collectionName, + IndexMetadata sourceIndexMetadata, + IReadOnlyDictionary locationRemap, + Transaction tempTransaction, + CancellationToken ct) + { + if (sourceIndexMetadata.RootPageId == 0) + return CloneIndexMetadata(sourceIndexMetadata, rootPageId: 0); + + switch (sourceIndexMetadata.Type) + { + case IndexType.BTree: + case IndexType.Unique: + { + var sourceOptions = BuildIndexOptionsForCompaction(sourceIndexMetadata); + var resolvedSourceRoot = ResolveCurrentBTreeRootPageId( + sourceIndexMetadata.RootPageId, + collectionName, + $"index '{sourceIndexMetadata.Name}'"); + var sourceIndex = new BTreeIndex(this, sourceOptions, resolvedSourceRoot); + var rebuiltIndex = CreateCompactionBTreeAtRoot( + tempEngine, + sourceOptions, + rootPageId: 0); + + foreach (var entry in sourceIndex.Range( + IndexKey.MinKey, + IndexKey.MaxKey, + IndexDirection.Forward, + transactionId: 0)) + { + ct.ThrowIfCancellationRequested(); + var mapped = RemapLocationOrThrow( + locationRemap, + entry.Location, + collectionName, + sourceIndexMetadata.Name); + rebuiltIndex.Insert(entry.Key, mapped, tempTransaction.TransactionId); + } + + return CloneIndexMetadata(sourceIndexMetadata, rebuiltIndex.RootPageId); + } + case IndexType.Spatial: + { 
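+                // Spatial indexes are rebuilt logically: the source R-Tree pages are walked,
+                // leaf entries (MBR plus document location) are collected, each location is
+                // remapped through the primary-index remap table, and the entries are
+                // re-inserted into a fresh R-Tree owned by the temporary engine.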
+ var options = BuildIndexOptionsForCompaction(sourceIndexMetadata); + using var rebuiltIndex = new RTreeIndex(tempEngine, options, rootPageId: 0); + foreach (var spatialEntry in EnumerateSpatialLeafEntriesForCompaction(sourceIndexMetadata.RootPageId, ct)) + { + var mapped = RemapLocationOrThrow( + locationRemap, + spatialEntry.Location, + collectionName, + sourceIndexMetadata.Name); + rebuiltIndex.Insert(spatialEntry.Mbr, mapped, tempTransaction); + } + + return CloneIndexMetadata(sourceIndexMetadata, rebuiltIndex.RootPageId); + } + case IndexType.Vector: + { + var options = ResolveVectorIndexOptionsForCompaction(sourceIndexMetadata); + var rebuiltIndex = new VectorSearchIndex(tempEngine, options, rootPageId: 0); + foreach (var vectorNode in EnumerateVectorNodesForCompaction(sourceIndexMetadata.RootPageId, options, ct)) + { + var mapped = RemapLocationOrThrow( + locationRemap, + vectorNode.Location, + collectionName, + sourceIndexMetadata.Name); + rebuiltIndex.Insert(vectorNode.Vector, mapped, tempTransaction); + } + + return CloneIndexMetadata(sourceIndexMetadata, rebuiltIndex.RootPageId); + } + case IndexType.Hash: + throw new InvalidDataException( + $"Compaction rebuild failed for collection '{collectionName}', index '{sourceIndexMetadata.Name}': " + + "hash index rebuild is not supported by logical compaction."); + default: + throw new InvalidDataException( + $"Compaction rebuild failed for collection '{collectionName}', index '{sourceIndexMetadata.Name}': " + + $"unsupported index type '{sourceIndexMetadata.Type}'."); + } + } + + private static DocumentLocation RemapLocationOrThrow( + IReadOnlyDictionary locationRemap, + in DocumentLocation sourceLocation, + string collectionName, + string indexName) + { + if (locationRemap.TryGetValue(sourceLocation, out var mapped)) + return mapped; + + throw new InvalidDataException( + $"Compaction rebuild failed for collection '{collectionName}', index '{indexName}': " + + $"source location (page {sourceLocation.PageId}, slot {sourceLocation.SlotIndex}) was not found in the rebuilt primary index."); + } + + private uint ResolveCurrentBTreeRootPageId(uint configuredRootPageId, string collectionName, string rootLabel) + { + if (configuredRootPageId == 0) + return 0; + + var current = configuredRootPageId; + var visited = new HashSet(); + var pageBuffer = new byte[_pageFile.PageSize]; + + while (visited.Add(current)) + { + if (current >= _pageFile.NextPageId) + { + throw new InvalidDataException( + $"Compaction validation failed for collection '{collectionName}': " + + $"{rootLabel} root page id {current} is out of range."); + } + + _pageFile.ReadPage(current, pageBuffer); + var pageHeader = PageHeader.ReadFrom(pageBuffer); + if (pageHeader.PageType != PageType.Index) + { + throw new InvalidDataException( + $"Compaction validation failed for collection '{collectionName}': " + + $"{rootLabel} root page id {current} has type {pageHeader.PageType}, expected {PageType.Index}."); + } + + var nodeHeader = BTreeNodeHeader.ReadFrom(pageBuffer.AsSpan(32)); + if (nodeHeader.ParentPageId == 0) + return current; + + current = nodeHeader.ParentPageId; + } + + throw new InvalidDataException( + $"Compaction validation failed for collection '{collectionName}': " + + $"{rootLabel} root page chain contains a cycle."); + } + + private static void EnsureRootPagesAllocatedForCompaction(StorageEngine target, IEnumerable rootPageIds) + { + var maxRootPageId = rootPageIds + .Where(pageId => pageId > 0) + .DefaultIfEmpty(0u) + .Max(); + + while (maxRootPageId > 0 && 
target._pageFile.NextPageId <= maxRootPageId) + { + target.AllocatePage(); + } + } + + private static BTreeIndex CreateCompactionBTreeAtRoot(StorageEngine target, IndexOptions options, uint rootPageId) + { + if (rootPageId == 0) + return new BTreeIndex(target, options, rootPageId: 0); + + if (target._pageFile.NextPageId <= rootPageId) + { + EnsureRootPagesAllocatedForCompaction(target, new[] { rootPageId }); + } + + InitializeBTreeRootPageForCompaction(target, rootPageId); + return new BTreeIndex(target, options, rootPageId); + } + + private static void InitializeBTreeRootPageForCompaction(StorageEngine target, uint rootPageId) + { + var pageBuffer = new byte[target.PageSize]; + var pageHeader = new PageHeader + { + PageId = rootPageId, + PageType = PageType.Index, + FreeBytes = (ushort)(target.PageSize - 32), + NextPageId = 0, + TransactionId = 0, + Checksum = 0 + }; + pageHeader.WriteTo(pageBuffer); + + var nodeHeader = new BTreeNodeHeader + { + PageId = rootPageId, + IsLeaf = true, + EntryCount = 0, + ParentPageId = 0, + NextLeafPageId = 0, + PrevLeafPageId = 0 + }; + nodeHeader.WriteTo(pageBuffer.AsSpan(32)); + target.WritePageImmediate(rootPageId, pageBuffer); + } + + private static IndexOptions BuildIndexOptionsForCompaction(IndexMetadata metadata) + { + var fields = metadata.PropertyPaths ?? Array.Empty(); + return metadata.Type switch + { + IndexType.Unique => IndexOptions.CreateUnique(fields), + IndexType.Spatial => IndexOptions.CreateSpatial(fields), + IndexType.Vector => IndexOptions.CreateVector( + dimensions: Math.Max(1, metadata.Dimensions), + metric: metadata.Metric, + m: 16, + ef: 200, + fields), + _ => metadata.IsUnique ? IndexOptions.CreateUnique(fields) : IndexOptions.CreateBTree(fields) + }; + } + + private IndexOptions ResolveVectorIndexOptionsForCompaction(IndexMetadata metadata) + { + var dimensions = metadata.Dimensions; + var maxM = 16; + + if (TryReadVectorPageLayout(metadata.RootPageId, out var pageDimensions, out var pageMaxM)) + { + if (pageDimensions > 0) + dimensions = pageDimensions; + if (pageMaxM > 0) + maxM = pageMaxM; + } + + if (dimensions <= 0) + { + throw new InvalidDataException( + $"Compaction rebuild failed for index '{metadata.Name}': vector dimensions are invalid."); + } + + return IndexOptions.CreateVector( + dimensions, + metric: metadata.Metric, + m: maxM, + ef: 200, + metadata.PropertyPaths ?? 
Array.Empty()); + } + + private bool TryReadVectorPageLayout(uint rootPageId, out int dimensions, out int maxM) + { + dimensions = 0; + maxM = 0; + if (rootPageId == 0 || rootPageId >= _pageFile.NextPageId) + return false; + + var buffer = new byte[_pageFile.PageSize]; + _pageFile.ReadPage(rootPageId, buffer); + var pageHeader = PageHeader.ReadFrom(buffer); + if (pageHeader.PageType != PageType.Vector) + return false; + + dimensions = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(32, 4)); + maxM = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(36, 4)); + return dimensions > 0 && maxM > 0; + } + + private IEnumerable<(GeoBox Mbr, DocumentLocation Location)> EnumerateSpatialLeafEntriesForCompaction( + uint rootPageId, + CancellationToken ct) + { + if (rootPageId == 0) + yield break; + + var pending = new SortedSet { rootPageId }; + var visitedPages = new HashSet(); + var pageBuffer = new byte[_pageFile.PageSize]; + + while (pending.Count > 0) + { + ct.ThrowIfCancellationRequested(); + var pageId = pending.Min; + pending.Remove(pageId); + + if (!visitedPages.Add(pageId)) + continue; + + if (pageId >= _pageFile.NextPageId) + { + throw new InvalidDataException( + $"Compaction rebuild failed: spatial page id {pageId} is out of range."); + } + + _pageFile.ReadPage(pageId, pageBuffer); + var pageHeader = PageHeader.ReadFrom(pageBuffer); + if (pageHeader.PageType != PageType.Spatial) + { + throw new InvalidDataException( + $"Compaction rebuild failed: page {pageId} in spatial index chain is not a spatial page."); + } + + var isLeaf = SpatialPage.GetIsLeaf(pageBuffer); + var entryCount = SpatialPage.GetEntryCount(pageBuffer); + + for (int i = 0; i < entryCount; i++) + { + SpatialPage.ReadEntry(pageBuffer, i, out var mbr, out var pointer); + if (isLeaf) + { + yield return (mbr, pointer); + } + else + { + if (pointer.PageId == 0 || pointer.PageId >= _pageFile.NextPageId) + { + throw new InvalidDataException( + $"Compaction rebuild failed: spatial child pointer page {pointer.PageId} is invalid."); + } + + if (!visitedPages.Contains(pointer.PageId)) + { + pending.Add(pointer.PageId); + } + } + } + } + } + + private IEnumerable EnumerateVectorNodesForCompaction( + uint rootPageId, + IndexOptions vectorOptions, + CancellationToken ct) + { + if (rootPageId == 0) + yield break; + + var dimensions = vectorOptions.Dimensions; + var maxM = Math.Max(1, vectorOptions.M); + var pending = new SortedSet { rootPageId }; + var visitedPages = new HashSet(); + var pageBuffer = new byte[_pageFile.PageSize]; + + while (pending.Count > 0) + { + ct.ThrowIfCancellationRequested(); + var pageId = pending.Min; + pending.Remove(pageId); + + if (!visitedPages.Add(pageId)) + continue; + + if (pageId >= _pageFile.NextPageId) + { + throw new InvalidDataException( + $"Compaction rebuild failed: vector page id {pageId} is out of range."); + } + + _pageFile.ReadPage(pageId, pageBuffer); + var pageHeader = PageHeader.ReadFrom(pageBuffer); + if (pageHeader.PageType != PageType.Vector) + { + throw new InvalidDataException( + $"Compaction rebuild failed: page {pageId} in vector index chain is not a vector page."); + } + + var nodeCount = VectorPage.GetNodeCount(pageBuffer); + if (nodeCount < 0 || nodeCount > VectorPage.GetMaxNodes(pageBuffer)) + { + throw new InvalidDataException( + $"Compaction rebuild failed: vector page {pageId} has an invalid node count ({nodeCount})."); + } + + for (var nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) + { + var vector = new float[dimensions]; + VectorPage.ReadNodeData(pageBuffer, 
nodeIndex, out var location, out var maxLevel, vector); + yield return new CompactionVectorNode(pageId, nodeIndex, location, vector); + + var levelUpperBound = Math.Min(15, Math.Max(0, maxLevel)); + for (var level = 0; level <= levelUpperBound; level++) + { + var links = VectorPage.GetLinksSpan(pageBuffer, nodeIndex, level, dimensions, maxM); + for (var offset = 0; offset + DocumentLocation.SerializedSize <= links.Length; offset += DocumentLocation.SerializedSize) + { + var link = DocumentLocation.ReadFrom(links.Slice(offset, DocumentLocation.SerializedSize)); + if (link.PageId == 0) + break; + + if (link.PageId >= _pageFile.NextPageId) + { + throw new InvalidDataException( + $"Compaction rebuild failed: vector link points to out-of-range page {link.PageId}."); + } + + if (!visitedPages.Contains(link.PageId)) + { + pending.Add(link.PageId); + } + } + } + } + } + } + + private static DocumentLocation InsertStoredPayloadForCompaction( + StorageEngine target, + ReadOnlySpan storedPayload, + bool isCompressed, + CompactionDataWriterState writerState) + { + var baseFlags = isCompressed ? SlotFlags.Compressed : SlotFlags.None; + var maxSinglePagePayload = target.PageSize - SlottedPageHeader.Size - SlotEntry.Size; + if (storedPayload.Length + SlotEntry.Size <= maxSinglePagePayload) + { + var pageId = FindDataPageWithSpaceForCompaction(target, writerState, storedPayload.Length + SlotEntry.Size); + if (pageId == 0) + { + pageId = AllocateDataPageForCompaction(target, writerState); + } + + var slotIndex = InsertPayloadIntoDataPageForCompaction(target, pageId, storedPayload, baseFlags, writerState); + return new DocumentLocation(pageId, slotIndex); + } + + const int overflowMetadataSize = 8; + var primaryChunkSize = maxSinglePagePayload - overflowMetadataSize; + if (primaryChunkSize <= 0) + { + throw new InvalidOperationException("Compaction rebuild cannot allocate overflow payload on current page size."); + } + + uint nextOverflowPageId = 0; + var overflowChunkSize = target.PageSize - SlottedPageHeader.Size; + var overflowBytes = storedPayload.Length - primaryChunkSize; + var fullPages = overflowBytes / overflowChunkSize; + var tailBytes = overflowBytes % overflowChunkSize; + + if (tailBytes > 0) + { + var tailOffset = primaryChunkSize + (fullPages * overflowChunkSize); + nextOverflowPageId = AllocateOverflowPageForCompaction(target, storedPayload.Slice(tailOffset, tailBytes), nextOverflowPageId); + } + + for (var i = fullPages - 1; i >= 0; i--) + { + var chunkOffset = primaryChunkSize + (i * overflowChunkSize); + nextOverflowPageId = AllocateOverflowPageForCompaction( + target, + storedPayload.Slice(chunkOffset, overflowChunkSize), + nextOverflowPageId); + } + + var primaryPayload = new byte[overflowMetadataSize + primaryChunkSize]; + BinaryPrimitives.WriteInt32LittleEndian(primaryPayload.AsSpan(0, 4), storedPayload.Length); + BinaryPrimitives.WriteUInt32LittleEndian(primaryPayload.AsSpan(4, 4), nextOverflowPageId); + storedPayload.Slice(0, primaryChunkSize).CopyTo(primaryPayload.AsSpan(8)); + + var primaryPageId = FindDataPageWithSpaceForCompaction(target, writerState, primaryPayload.Length + SlotEntry.Size); + if (primaryPageId == 0) + { + primaryPageId = AllocateDataPageForCompaction(target, writerState); + } + + var slotFlags = baseFlags | SlotFlags.HasOverflow; + var primarySlotIndex = InsertPayloadIntoDataPageForCompaction(target, primaryPageId, primaryPayload, slotFlags, writerState); + return new DocumentLocation(primaryPageId, primarySlotIndex); + } + + private static uint 
AllocateOverflowPageForCompaction(StorageEngine target, ReadOnlySpan payloadChunk, uint nextOverflowPageId) + { + var overflowPageId = target.AllocatePage(); + var pageBuffer = new byte[target.PageSize]; + var header = new SlottedPageHeader + { + PageId = overflowPageId, + PageType = PageType.Overflow, + SlotCount = 0, + FreeSpaceStart = SlottedPageHeader.Size, + FreeSpaceEnd = (ushort)target.PageSize, + NextOverflowPage = nextOverflowPageId, + TransactionId = 0 + }; + header.WriteTo(pageBuffer); + payloadChunk.CopyTo(pageBuffer.AsSpan(SlottedPageHeader.Size)); + target.WritePageImmediate(overflowPageId, pageBuffer); + return overflowPageId; + } + + private static uint FindDataPageWithSpaceForCompaction(StorageEngine target, CompactionDataWriterState writerState, int requiredBytes) + { + if (writerState.CurrentDataPageId != 0 && + writerState.FreeSpaceByPage.TryGetValue(writerState.CurrentDataPageId, out var currentFree) && + currentFree >= requiredBytes) + { + return writerState.CurrentDataPageId; + } + + foreach (var (pageId, freeBytes) in writerState.FreeSpaceByPage) + { + if (freeBytes >= requiredBytes) + return pageId; + } + + return 0; + } + + private static uint AllocateDataPageForCompaction(StorageEngine target, CompactionDataWriterState writerState) + { + var pageId = target.AllocatePage(); + var pageBuffer = new byte[target.PageSize]; + var header = new SlottedPageHeader + { + PageId = pageId, + PageType = PageType.Data, + SlotCount = 0, + FreeSpaceStart = SlottedPageHeader.Size, + FreeSpaceEnd = (ushort)target.PageSize, + NextOverflowPage = 0, + TransactionId = 0 + }; + header.WriteTo(pageBuffer); + target.WritePageImmediate(pageId, pageBuffer); + writerState.FreeSpaceByPage[pageId] = (ushort)header.AvailableFreeSpace; + writerState.CurrentDataPageId = pageId; + return pageId; + } + + private static ushort InsertPayloadIntoDataPageForCompaction( + StorageEngine target, + uint pageId, + ReadOnlySpan payload, + SlotFlags slotFlags, + CompactionDataWriterState writerState) + { + var pageBuffer = new byte[target.PageSize]; + target.ReadPage(pageId, transactionId: null, pageBuffer); + var header = SlottedPageHeader.ReadFrom(pageBuffer); + + if (header.PageType == PageType.Empty && header.FreeSpaceEnd == 0) + { + header = new SlottedPageHeader + { + PageId = pageId, + PageType = PageType.Data, + SlotCount = 0, + FreeSpaceStart = SlottedPageHeader.Size, + FreeSpaceEnd = (ushort)target.PageSize, + NextOverflowPage = 0, + TransactionId = 0 + }; + header.WriteTo(pageBuffer); + } + + var requiredSpace = payload.Length + SlotEntry.Size; + if (header.AvailableFreeSpace < requiredSpace) + { + throw new InvalidOperationException( + $"Compaction rebuild failed: not enough free space on page {pageId} (required {requiredSpace}, available {header.AvailableFreeSpace})."); + } + + var slotIndex = FindReusableSlotForCompaction(pageBuffer, ref header); + var payloadOffset = header.FreeSpaceEnd - payload.Length; + payload.CopyTo(pageBuffer.AsSpan(payloadOffset, payload.Length)); + + var slotOffset = SlottedPageHeader.Size + (slotIndex * SlotEntry.Size); + var slot = new SlotEntry + { + Offset = (ushort)payloadOffset, + Length = (ushort)payload.Length, + Flags = slotFlags + }; + slot.WriteTo(pageBuffer.AsSpan(slotOffset, SlotEntry.Size)); + + if (slotIndex >= header.SlotCount) + header.SlotCount = (ushort)(slotIndex + 1); + + header.FreeSpaceStart = (ushort)(SlottedPageHeader.Size + (header.SlotCount * SlotEntry.Size)); + header.FreeSpaceEnd = (ushort)payloadOffset; + header.WriteTo(pageBuffer); + + 
target.WritePageImmediate(pageId, pageBuffer); + writerState.FreeSpaceByPage[pageId] = (ushort)header.AvailableFreeSpace; + writerState.CurrentDataPageId = pageId; + return slotIndex; + } + + private static ushort FindReusableSlotForCompaction(Span pageBuffer, ref SlottedPageHeader header) + { + for (ushort i = 0; i < header.SlotCount; i++) + { + var slotOffset = SlottedPageHeader.Size + (i * SlotEntry.Size); + var slot = SlotEntry.ReadFrom(pageBuffer.Slice(slotOffset, SlotEntry.Size)); + if ((slot.Flags & SlotFlags.Deleted) != 0) + return i; + } + + return header.SlotCount; + } + + private void CopyDictionaryMappingDeterministically(StorageEngine target, CancellationToken ct) + { + var sourceEntries = _dictionaryReverseCache + .OrderBy(entry => entry.Key) + .ToArray(); + + ReinitializeDictionaryForCompaction(target, ct); + + ushort maxId = DictionaryPage.ReservedValuesEnd; + foreach (var entry in sourceEntries) + { + ct.ThrowIfCancellationRequested(); + var key = entry.Value.ToLowerInvariant(); + var id = entry.Key; + if (!target.InsertDictionaryEntryGlobal(key, id)) + { + throw new InvalidOperationException( + $"Compaction rebuild failed: unable to copy dictionary entry '{key}' with id {id}."); + } + + target._dictionaryCache[key] = id; + target._dictionaryReverseCache[id] = key; + if (id > maxId) + { + maxId = id; + } + } + + target._nextDictionaryId = (ushort)(maxId + 1); + } + + private static void ReinitializeDictionaryForCompaction(StorageEngine target, CancellationToken ct) + { + var chainPages = EnumerateDictionaryChainPages(target, target._dictionaryRootPageId, ct); + foreach (var pageId in chainPages) + { + target.FreePage(pageId); + } + + var newRootPageId = target.AllocatePage(); + var dictionaryRootBuffer = new byte[target.PageSize]; + DictionaryPage.Initialize(dictionaryRootBuffer, newRootPageId); + target.WritePageImmediate(newRootPageId, dictionaryRootBuffer); + + var fileHeaderBuffer = new byte[target.PageSize]; + target.ReadPage(0, transactionId: null, fileHeaderBuffer); + var fileHeader = PageHeader.ReadFrom(fileHeaderBuffer); + fileHeader.DictionaryRootPageId = newRootPageId; + fileHeader.WriteTo(fileHeaderBuffer); + target.WritePageImmediate(0, fileHeaderBuffer); + + target._dictionaryCache.Clear(); + target._dictionaryReverseCache.Clear(); + target._dictionaryRootPageId = newRootPageId; + } + + private static IReadOnlyList EnumerateDictionaryChainPages(StorageEngine target, uint rootPageId, CancellationToken ct) + { + var result = new List(); + if (rootPageId == 0) + return result; + + var visited = new HashSet(); + var pageBuffer = new byte[target.PageSize]; + var current = rootPageId; + + while (current != 0 && visited.Add(current)) + { + ct.ThrowIfCancellationRequested(); + if (current >= target._pageFile.NextPageId) + break; + + result.Add(current); + target._pageFile.ReadPage(current, pageBuffer); + var header = PageHeader.ReadFrom(pageBuffer); + current = header.NextPageId; + } + + return result; + } + + private void ReloadDictionaryStateAfterCompactionSwap() + { + lock (_dictionaryLock) + { + _dictionaryCache.Clear(); + _dictionaryReverseCache.Clear(); + _dictionaryRootPageId = 0; + _nextDictionaryId = DictionaryPage.ReservedValuesEnd + 1; + } + + InitializeDictionary(); + } + private bool TryReadCompactionMarker(string markerPath, out CompactionMarkerState? markerState) { markerState = null;