Initial import of the CBDDC codebase with docs and tests. Add a .NET-focused gitignore to keep generated artifacts out of source control.

2026-02-20 13:03:21 -05:00
commit 08bfc17218
218 changed files with 33910 additions and 0 deletions
--- a/docs/production-hardening.md
+++ b/docs/production-hardening.md
@@ -0,0 +1,267 @@
+# Production Hardening - Implementation Guide
+
+## Quick Reference
+
+### Configuration (appsettings.json)
+```json
+{
+  "CBDDC": {
+    "Network": {
+      "TcpPort": 5000,
+      "UdpPort": 6000,
+      "RetryAttempts": 3
+    },
+    "Persistence": {
+      "DatabasePath": "data/cbddc.db",
+      "EnableWalMode": true,
+      "CacheSizeMb": 50,
+      "EnableAutoBackup": true,
+      "BackupPath": "backups/"
+    },
+    "Sync": {
+      "EnableOfflineQueue": true,
+      "MaxQueueSize": 1000
+    }
+  }
+}
+```
+
+### DI Setup
+```csharp
+services.Configure<CBDDCOptions>(configuration.GetSection("CBDDC"));
+services.AddSingleton<RetryPolicy>();
+services.AddSingleton<DocumentCache>();
+services.AddSingleton<OfflineQueue>();
+services.AddSingleton<SyncStatusTracker>();
+services.AddCBDDCHosting(options =>
+{
+    options.Cluster.NodeId = "server-01";
+    options.Cluster.TcpPort = 5001;
+    options.Cluster.PeerConfirmationLagThresholdMs = 30_000;
+    options.Cluster.PeerConfirmationCriticalLagThresholdMs = 120_000;
+});
+```
+
+### Health Check
+```csharp
+app.MapHealthChecks("/health");
+```
+
+### Peer Confirmation Lag Thresholds
+
+`CBDDCHealthCheck` evaluates tracked peers using confirmation lag thresholds:
+
+- `PeerConfirmationLagThresholdMs` (default `30000`) marks peers as lagging and
+  returns `Degraded`.
+- `PeerConfirmationCriticalLagThresholdMs` (default `120000`) marks critical lag and
+  returns `Unhealthy`.
+
+Thresholds are clamped so critical is never lower than lag.
+
+### Health Status Interpretation
+
+For the `cbddc` health check:
+
+| Status | Meaning | Typical operator action |
+|------|-----------|-------------|
+| Healthy | All active tracked peers have confirmations and lag is within threshold. | No action required. |
+| Degraded | At least one tracked peer is lagging, or at least one tracked peer has no confirmation rows yet. | Investigate slow/unconfirmed peers, confirm whether any should be untracked/deprecated. |
+| Unhealthy | At least one tracked peer exceeds critical lag threshold, or persistence check throws. | Page on-call, verify storage/network path, and evaluate emergency peer de-tracking for permanently retired peers. |
+
+Health payload fields:
+
+- `trackedPeerCount`
+- `peersWithNoConfirmation`
+- `maxLagMs`
+- `laggingPeers`
+- `lastSuccessfulConfirmationUpdateByPeer`
+
+Use these fields to distinguish temporary lag from stale peer registrations.
+
+### Offline Queue
+```csharp
+// Enqueue during offline
+if (!isOnline)
+{
+    offlineQueue.Enqueue(new PendingOperation 
+    { 
+        Type = "put", 
+        Collection = "users", 
+        Key = "user1",
+        Data = user 
+    });
+}
+
+// Flush when back online
+var (successful, failed) = await offlineQueue.FlushAsync(async op => 
+{
+    var collection = database.Collection(op.Collection);
+    if (op.Type == "put" && op.Data != null)
+        await collection.Put(op.Key, op.Data);
+    else if (op.Type == "delete")
+        await collection.Delete(op.Key);
+});
+```
+
+### Document Cache
+```csharp
+var cache = new DocumentCache(maxSizeMb: 50);
+
+// Check cache first
+var cached = cache.Get("users", "user1");
+if (cached != null) return cached;
+
+// Load from database
+var doc = await store.GetDocumentAsync("users", "user1");
+if (doc != null) cache.Set("users", "user1", doc);
+```
+
+### SQLite Backup
+```csharp
+await store.BackupAsync("backups/backup-20260115.db");
+```
+
+### Retry Policy
+```csharp
+var retry = new RetryPolicy(logger, maxAttempts: 3, delayMs: 1000);
+
+await retry.ExecuteAsync(
+    () => tcpClient.ConnectAsync(endpoint),
+    "TCP Connect"
+);
+```
+
+### Error Handling
+
+Use specific exceptions for robust control flow:
+
+```csharp
+try
+{
+    await operation();
+}
+catch (DocumentNotFoundException ex)
+{
+    // Handle specific document missing case
+    logger.LogWarning("Document {Key} missing", ex.Key);
+}
+catch (CBDDCConcurrencyException ex)
+{
+    // Handle conflict (though LWW usually resolves it automatically)
+    logger.LogWarning("Concurrency conflict: {Message}", ex.Message);
+}
+catch (NetworkException ex)
+{
+    logger.LogError(ex, "Network operation failed");
+    syncTracker.RecordError(ex.Message, peerNodeId, ex.ErrorCode);
+}
+catch (PersistenceException ex) when (ex is DatabaseCorruptionException)
+{
+    logger.LogCritical(ex, "Database corruption detected!");
+    // Attempt recovery or alert admin
+}
+```
+
+## Error Codes
+
+| Code | Exception | Description |
+|------|-----------|-------------|
+| NETWORK_ERROR | NetworkException | Network operation failed |
+| PERSISTENCE_ERROR | PersistenceException | Database operation failed |
+| SYNC_ERROR | SyncException | Synchronization failed |
+| CONFIG_ERROR | ConfigurationException | Invalid configuration |
+| TIMEOUT_ERROR | TimeoutException | Operation timed out |
+
+## Logging Levels
+
+- **Trace**: Internal details (cache hits/misses)
+- **Debug**: Debugging info (sync operations)
+- **Information**: Normal events (peer discovered, backup created)
+- **Warning**: Recoverable errors (queue full, retry attempt, documents not found)
+- **Error**: Failures requiring attention (sync failed, corruption detected)
+- **Critical**: System failures (database initialization failed)
+
+## Best Practices
+
+1. **Always use structured logging**
+   ```csharp
+   _logger.LogInformation("User {UserId} synced {Count} documents", userId, count);
+   ```
+
+2. **Wrap network operations with retry policy**
+   ```csharp
+   await _retryPolicy.ExecuteAsync(() => client.ConnectAsync(), "Connect");
+   ```
+
+3. **Check cache before database**
+   ```csharp
+   var doc = _cache.Get(collection, key) ?? await _store.GetDocumentAsync(collection, key);
+   ```
+
+4. **Enable offline queue for LAN instability**
+   ```csharp
+   if (options.Sync.EnableOfflineQueue && !isOnline)
+       _offlineQueue.Enqueue(operation);
+   ```
+
+5. **Periodic health checks**
+   ```csharp
+   var timer = new Timer(async _ =>
+   {
+       var report = await healthCheckService.CheckHealthAsync(r => r.Name == "cbddc");
+       var entry = report.Entries["cbddc"];
+       if (entry.Status != HealthStatus.Healthy)
+       {
+           _logger.LogWarning(
+               "CBDDC health is {Status}. LaggingPeers={LaggingPeers} UnconfirmedPeers={UnconfirmedPeers}",
+               entry.Status,
+               entry.Data["laggingPeers"],
+               entry.Data["peersWithNoConfirmation"]);
+       }
+   }, null, TimeSpan.Zero, TimeSpan.FromMinutes(5));
+   ```
+
+## Deployment Checklist
+
+- [ ] Configuration file created (appsettings.json)
+- [ ] Log directory permissions set
+- [ ] Backup directory configured
+- [ ] Database file location specified
+- [ ] Network ports configured (firewall)
+- [ ] Health check endpoint tested
+- [ ] Offline queue tested
+- [ ] Backup/restore tested
+- [ ] Graceful shutdown tested
+
+## Troubleshooting
+
+### Database corruption
+```csharp
+try
+{
+    await store.CheckIntegrityAsync();
+}
+catch (DatabaseCorruptionException)
+{
+    // Restore from backup
+    File.Copy("backups/latest.db", options.Persistence.DatabasePath, overwrite: true);
+}
+```
+
+### Network issues
+```
+Check sync tracker:
+- Last sync time
+- Active peers
+- Recent errors
+```
+
+### Performance degradation
+```csharp
+var stats = cache.GetStatistics();
+if (stats.HitRate < 0.5)
+{
+    // Consider increasing cache size
+    options.Persistence.CacheSizeMb = 100;
+}
+```