Initial import of the CBDDC codebase with docs and tests. Add a .NET-focused gitignore to keep generated artifacts out of source control.
Some checks failed
CI / verify (push) Has been cancelled
Some checks failed
CI / verify (push) Has been cancelled
This commit is contained in:
267
docs/production-hardening.md
Executable file
267
docs/production-hardening.md
Executable file
@@ -0,0 +1,267 @@
|
||||
# Production Hardening - Implementation Guide
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### Configuration (appsettings.json)
|
||||
```json
|
||||
{
|
||||
"CBDDC": {
|
||||
"Network": {
|
||||
"TcpPort": 5000,
|
||||
"UdpPort": 6000,
|
||||
"RetryAttempts": 3
|
||||
},
|
||||
"Persistence": {
|
||||
"DatabasePath": "data/cbddc.db",
|
||||
"EnableWalMode": true,
|
||||
"CacheSizeMb": 50,
|
||||
"EnableAutoBackup": true,
|
||||
"BackupPath": "backups/"
|
||||
},
|
||||
"Sync": {
|
||||
"EnableOfflineQueue": true,
|
||||
"MaxQueueSize": 1000
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### DI Setup
|
||||
```csharp
|
||||
services.Configure<CBDDCOptions>(configuration.GetSection("CBDDC"));
|
||||
services.AddSingleton<RetryPolicy>();
|
||||
services.AddSingleton<DocumentCache>();
|
||||
services.AddSingleton<OfflineQueue>();
|
||||
services.AddSingleton<SyncStatusTracker>();
|
||||
services.AddCBDDCHosting(options =>
|
||||
{
|
||||
options.Cluster.NodeId = "server-01";
|
||||
options.Cluster.TcpPort = 5001;
|
||||
options.Cluster.PeerConfirmationLagThresholdMs = 30_000;
|
||||
options.Cluster.PeerConfirmationCriticalLagThresholdMs = 120_000;
|
||||
});
|
||||
```
|
||||
|
||||
### Health Check
|
||||
```csharp
|
||||
app.MapHealthChecks("/health");
|
||||
```
|
||||
|
||||
### Peer Confirmation Lag Thresholds
|
||||
|
||||
`CBDDCHealthCheck` evaluates tracked peers using confirmation lag thresholds:
|
||||
|
||||
- `PeerConfirmationLagThresholdMs` (default `30000`) marks peers as lagging and
|
||||
returns `Degraded`.
|
||||
- `PeerConfirmationCriticalLagThresholdMs` (default `120000`) marks critical lag and
|
||||
returns `Unhealthy`.
|
||||
|
||||
Thresholds are clamped so that the critical threshold is never lower than the lag threshold.
|
||||
|
||||
### Health Status Interpretation
|
||||
|
||||
For the `cbddc` health check:
|
||||
|
||||
| Status | Meaning | Typical operator action |
|
||||
|------|-----------|-------------|
|
||||
| Healthy | All active tracked peers have confirmations and lag is within threshold. | No action required. |
|
||||
| Degraded | At least one tracked peer is lagging, or at least one tracked peer has no confirmation rows yet. | Investigate slow/unconfirmed peers and confirm whether any should be untracked/deprecated. |
|
||||
| Unhealthy | At least one tracked peer exceeds the critical lag threshold, or the persistence check throws an exception. | Page on-call, verify storage/network path, and evaluate emergency peer de-tracking for permanently retired peers. |
|
||||
|
||||
Health payload fields:
|
||||
|
||||
- `trackedPeerCount`
|
||||
- `peersWithNoConfirmation`
|
||||
- `maxLagMs`
|
||||
- `laggingPeers`
|
||||
- `lastSuccessfulConfirmationUpdateByPeer`
|
||||
|
||||
Use these fields to distinguish temporary lag from stale peer registrations.
|
||||
|
||||
### Offline Queue
|
||||
```csharp
|
||||
// Enqueue operations while offline
|
||||
if (!isOnline)
|
||||
{
|
||||
offlineQueue.Enqueue(new PendingOperation
|
||||
{
|
||||
Type = "put",
|
||||
Collection = "users",
|
||||
Key = "user1",
|
||||
Data = user
|
||||
});
|
||||
}
|
||||
|
||||
// Flush when back online
|
||||
var (successful, failed) = await offlineQueue.FlushAsync(async op =>
|
||||
{
|
||||
var collection = database.Collection(op.Collection);
|
||||
if (op.Type == "put" && op.Data != null)
|
||||
await collection.Put(op.Key, op.Data);
|
||||
else if (op.Type == "delete")
|
||||
await collection.Delete(op.Key);
|
||||
});
|
||||
```
|
||||
|
||||
### Document Cache
|
||||
```csharp
|
||||
var cache = new DocumentCache(maxSizeMb: 50);
|
||||
|
||||
// Check cache first
|
||||
var cached = cache.Get("users", "user1");
|
||||
if (cached != null) return cached;
|
||||
|
||||
// Load from database
|
||||
var doc = await store.GetDocumentAsync("users", "user1");
|
||||
if (doc != null) cache.Set("users", "user1", doc);
|
||||
```
|
||||
|
||||
### SQLite Backup
|
||||
```csharp
|
||||
await store.BackupAsync("backups/backup-20260115.db");
|
||||
```
|
||||
|
||||
### Retry Policy
|
||||
```csharp
|
||||
var retry = new RetryPolicy(logger, maxAttempts: 3, delayMs: 1000);
|
||||
|
||||
await retry.ExecuteAsync(
|
||||
() => tcpClient.ConnectAsync(endpoint),
|
||||
"TCP Connect"
|
||||
);
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
Catch specific exception types so each failure mode gets appropriate handling:
|
||||
|
||||
```csharp
|
||||
try
|
||||
{
|
||||
await operation();
|
||||
}
|
||||
catch (DocumentNotFoundException ex)
|
||||
{
|
||||
// Handle specific document missing case
|
||||
logger.LogWarning("Document {Key} missing", ex.Key);
|
||||
}
|
||||
catch (CBDDCConcurrencyException ex)
|
||||
{
|
||||
// Handle conflict (though LWW usually resolves it automatically)
|
||||
logger.LogWarning("Concurrency conflict: {Message}", ex.Message);
|
||||
}
|
||||
catch (NetworkException ex)
|
||||
{
|
||||
logger.LogError(ex, "Network operation failed");
|
||||
syncTracker.RecordError(ex.Message, peerNodeId, ex.ErrorCode);
|
||||
}
|
||||
catch (PersistenceException ex) when (ex is DatabaseCorruptionException)
|
||||
{
|
||||
logger.LogCritical(ex, "Database corruption detected!");
|
||||
// Attempt recovery or alert admin
|
||||
}
|
||||
```
|
||||
|
||||
## Error Codes
|
||||
|
||||
| Code | Exception | Description |
|
||||
|------|-----------|-------------|
|
||||
| NETWORK_ERROR | NetworkException | Network operation failed |
|
||||
| PERSISTENCE_ERROR | PersistenceException | Database operation failed |
|
||||
| SYNC_ERROR | SyncException | Synchronization failed |
|
||||
| CONFIG_ERROR | ConfigurationException | Invalid configuration |
|
||||
| TIMEOUT_ERROR | TimeoutException | Operation timed out |
|
||||
|
||||
## Logging Levels
|
||||
|
||||
- **Trace**: Internal details (cache hits/misses)
|
||||
- **Debug**: Debugging info (sync operations)
|
||||
- **Information**: Normal events (peer discovered, backup created)
|
||||
- **Warning**: Recoverable errors (queue full, retry attempt, document not found)
|
||||
- **Error**: Failures requiring attention (sync failed, network operation failed)
|
||||
- **Critical**: System failures (database initialization failed, database corruption detected)
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always use structured logging**
|
||||
```csharp
|
||||
_logger.LogInformation("User {UserId} synced {Count} documents", userId, count);
|
||||
```
|
||||
|
||||
2. **Wrap network operations with retry policy**
|
||||
```csharp
|
||||
await _retryPolicy.ExecuteAsync(() => client.ConnectAsync(), "Connect");
|
||||
```
|
||||
|
||||
3. **Check cache before database**
|
||||
```csharp
|
||||
var doc = _cache.Get(collection, key) ?? await _store.GetDocumentAsync(collection, key);
|
||||
```
|
||||
|
||||
4. **Enable the offline queue to tolerate LAN instability**
|
||||
```csharp
|
||||
if (options.Sync.EnableOfflineQueue && !isOnline)
|
||||
_offlineQueue.Enqueue(operation);
|
||||
```
|
||||
|
||||
5. **Run periodic health checks**
|
||||
```csharp
|
||||
var timer = new Timer(async _ =>
|
||||
{
|
||||
var report = await healthCheckService.CheckHealthAsync(r => r.Name == "cbddc");
|
||||
var entry = report.Entries["cbddc"];
|
||||
if (entry.Status != HealthStatus.Healthy)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"CBDDC health is {Status}. LaggingPeers={LaggingPeers} UnconfirmedPeers={UnconfirmedPeers}",
|
||||
entry.Status,
|
||||
entry.Data["laggingPeers"],
|
||||
entry.Data["peersWithNoConfirmation"]);
|
||||
}
|
||||
}, null, TimeSpan.Zero, TimeSpan.FromMinutes(5));
|
||||
```
|
||||
|
||||
## Deployment Checklist
|
||||
|
||||
- [ ] Configuration file created (appsettings.json)
|
||||
- [ ] Log directory permissions set
|
||||
- [ ] Backup directory configured
|
||||
- [ ] Database file location specified
|
||||
- [ ] Network ports configured (firewall)
|
||||
- [ ] Health check endpoint tested
|
||||
- [ ] Offline queue tested
|
||||
- [ ] Backup/restore tested
|
||||
- [ ] Graceful shutdown tested
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Database corruption
|
||||
```csharp
|
||||
try
|
||||
{
|
||||
await store.CheckIntegrityAsync();
|
||||
}
|
||||
catch (DatabaseCorruptionException)
|
||||
{
|
||||
// Restore from backup
|
||||
File.Copy("backups/latest.db", options.Persistence.DatabasePath, overwrite: true);
|
||||
}
|
||||
```
|
||||
|
||||
### Network issues
|
||||
```
|
||||
Check sync tracker:
|
||||
- Last sync time
|
||||
- Active peers
|
||||
- Recent errors
|
||||
```
|
||||
|
||||
### Performance degradation
|
||||
```csharp
|
||||
var stats = cache.GetStatistics();
|
||||
if (stats.HitRate < 0.5)
|
||||
{
|
||||
// Consider increasing cache size
|
||||
options.Persistence.CacheSizeMb = 100;
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user