268 lines
7.2 KiB
Markdown
Executable File
268 lines
7.2 KiB
Markdown
Executable File
# Production Hardening - Implementation Guide
|
|
|
|
## Quick Reference
|
|
|
|
### Configuration (appsettings.json)
|
|
```json
|
|
{
|
|
"CBDDC": {
|
|
"Network": {
|
|
"TcpPort": 5000,
|
|
"UdpPort": 6000,
|
|
"RetryAttempts": 3
|
|
},
|
|
"Persistence": {
|
|
"DatabasePath": "data/cbddc.db",
|
|
"EnableWalMode": true,
|
|
"CacheSizeMb": 50,
|
|
"EnableAutoBackup": true,
|
|
"BackupPath": "backups/"
|
|
},
|
|
"Sync": {
|
|
"EnableOfflineQueue": true,
|
|
"MaxQueueSize": 1000
|
|
}
|
|
}
|
|
}
|
|
```
|
|
|
|
### DI Setup
|
|
```csharp
|
|
services.Configure<CBDDCOptions>(configuration.GetSection("CBDDC"));
|
|
services.AddSingleton<RetryPolicy>();
|
|
services.AddSingleton<DocumentCache>();
|
|
services.AddSingleton<OfflineQueue>();
|
|
services.AddSingleton<SyncStatusTracker>();
|
|
services.AddCBDDCHosting(options =>
|
|
{
|
|
options.Cluster.NodeId = "server-01";
|
|
options.Cluster.TcpPort = 5001;
|
|
options.Cluster.PeerConfirmationLagThresholdMs = 30_000;
|
|
options.Cluster.PeerConfirmationCriticalLagThresholdMs = 120_000;
|
|
});
|
|
```
|
|
|
|
### Health Check
|
|
```csharp
|
|
app.MapHealthChecks("/health");
|
|
```
|
|
|
|
### Peer Confirmation Lag Thresholds
|
|
|
|
`CBDDCHealthCheck` evaluates tracked peers using confirmation lag thresholds:
|
|
|
|
- `PeerConfirmationLagThresholdMs` (default `30000`) marks peers as lagging and
|
|
returns `Degraded`.
|
|
- `PeerConfirmationCriticalLagThresholdMs` (default `120000`) marks critical lag and
|
|
returns `Unhealthy`.
|
|
|
|
Thresholds are clamped so that the critical lag threshold is never lower than the lag threshold.
|
|
|
|
### Health Status Interpretation
|
|
|
|
For the `cbddc` health check:
|
|
|
|
| Status | Meaning | Typical operator action |
|
|
|------|-----------|-------------|
|
|
| Healthy | All active tracked peers have confirmations and lag is within threshold. | No action required. |
|
|
| Degraded | At least one tracked peer is lagging, or at least one tracked peer has no confirmation rows yet. | Investigate slow/unconfirmed peers and confirm whether any should be untracked/deprecated. |
|
|
| Unhealthy | At least one tracked peer exceeds the critical lag threshold, or the persistence check throws an exception. | Page on-call, verify storage/network path, and evaluate emergency peer de-tracking for permanently retired peers. |
|
|
|
|
Health payload fields:
|
|
|
|
- `trackedPeerCount`
|
|
- `peersWithNoConfirmation`
|
|
- `maxLagMs`
|
|
- `laggingPeers`
|
|
- `lastSuccessfulConfirmationUpdateByPeer`
|
|
|
|
Use these fields to distinguish temporary lag from stale peer registrations.
|
|
|
|
### Offline Queue
|
|
```csharp
|
|
// Enqueue during offline
|
|
if (!isOnline)
|
|
{
|
|
offlineQueue.Enqueue(new PendingOperation
|
|
{
|
|
Type = "put",
|
|
Collection = "users",
|
|
Key = "user1",
|
|
Data = user
|
|
});
|
|
}
|
|
|
|
// Flush when back online
|
|
var (successful, failed) = await offlineQueue.FlushAsync(async op =>
|
|
{
|
|
var collection = database.Collection(op.Collection);
|
|
if (op.Type == "put" && op.Data != null)
|
|
await collection.Put(op.Key, op.Data);
|
|
else if (op.Type == "delete")
|
|
await collection.Delete(op.Key);
|
|
});
|
|
```
|
|
|
|
### Document Cache
|
|
```csharp
|
|
var cache = new DocumentCache(maxSizeMb: 50);
|
|
|
|
// Check cache first
|
|
var cached = cache.Get("users", "user1");
|
|
if (cached != null) return cached;
|
|
|
|
// Load from database
|
|
var doc = await store.GetDocumentAsync("users", "user1");
|
|
if (doc != null) cache.Set("users", "user1", doc);
|
|
```
|
|
|
|
### SQLite Backup
|
|
```csharp
|
|
await store.BackupAsync("backups/backup-20260115.db");
|
|
```
|
|
|
|
### Retry Policy
|
|
```csharp
|
|
var retry = new RetryPolicy(logger, maxAttempts: 3, delayMs: 1000);
|
|
|
|
await retry.ExecuteAsync(
|
|
() => tcpClient.ConnectAsync(endpoint),
|
|
"TCP Connect"
|
|
);
|
|
```
|
|
|
|
### Error Handling
|
|
|
|
Catch specific exception types so that each failure mode gets its own handling:
|
|
|
|
```csharp
|
|
try
|
|
{
|
|
await operation();
|
|
}
|
|
catch (DocumentNotFoundException ex)
|
|
{
|
|
// Handle specific document missing case
|
|
logger.LogWarning("Document {Key} missing", ex.Key);
|
|
}
|
|
catch (CBDDCConcurrencyException ex)
|
|
{
|
|
// Handle conflict (though LWW usually resolves it automatically)
|
|
logger.LogWarning("Concurrency conflict: {Message}", ex.Message);
|
|
}
|
|
catch (NetworkException ex)
|
|
{
|
|
logger.LogError(ex, "Network operation failed");
|
|
syncTracker.RecordError(ex.Message, peerNodeId, ex.ErrorCode);
|
|
}
|
|
catch (PersistenceException ex) when (ex is DatabaseCorruptionException)
|
|
{
|
|
logger.LogCritical(ex, "Database corruption detected!");
|
|
// Attempt recovery or alert admin
|
|
}
|
|
```
|
|
|
|
## Error Codes
|
|
|
|
| Code | Exception | Description |
|
|
|------|-----------|-------------|
|
|
| NETWORK_ERROR | NetworkException | Network operation failed |
|
|
| PERSISTENCE_ERROR | PersistenceException | Database operation failed |
|
|
| SYNC_ERROR | SyncException | Synchronization failed |
|
|
| CONFIG_ERROR | ConfigurationException | Invalid configuration |
|
|
| TIMEOUT_ERROR | TimeoutException | Operation timed out |
|
|
|
|
## Logging Levels
|
|
|
|
- **Trace**: Internal details (cache hits/misses)
|
|
- **Debug**: Debugging info (sync operations)
|
|
- **Information**: Normal events (peer discovered, backup created)
|
|
- **Warning**: Recoverable errors (queue full, retry attempt, document not found)
|
|
- **Error**: Failures requiring attention (sync failed, network operation failed)
|
|
- **Critical**: System failures (database initialization failed, database corruption detected)
|
|
|
|
## Best Practices
|
|
|
|
1. **Always use structured logging**
|
|
```csharp
|
|
_logger.LogInformation("User {UserId} synced {Count} documents", userId, count);
|
|
```
|
|
|
|
2. **Wrap network operations with retry policy**
|
|
```csharp
|
|
await _retryPolicy.ExecuteAsync(() => client.ConnectAsync(), "Connect");
|
|
```
|
|
|
|
3. **Check cache before database**
|
|
```csharp
|
|
var doc = _cache.Get(collection, key) ?? await _store.GetDocumentAsync(collection, key);
|
|
```
|
|
|
|
4. **Enable the offline queue to tolerate LAN instability**
|
|
```csharp
|
|
if (options.Sync.EnableOfflineQueue && !isOnline)
|
|
_offlineQueue.Enqueue(operation);
|
|
```
|
|
|
|
5. **Run periodic health checks**
|
|
```csharp
|
|
var timer = new Timer(async _ =>
|
|
{
|
|
var report = await healthCheckService.CheckHealthAsync(r => r.Name == "cbddc");
|
|
var entry = report.Entries["cbddc"];
|
|
if (entry.Status != HealthStatus.Healthy)
|
|
{
|
|
_logger.LogWarning(
|
|
"CBDDC health is {Status}. LaggingPeers={LaggingPeers} UnconfirmedPeers={UnconfirmedPeers}",
|
|
entry.Status,
|
|
entry.Data["laggingPeers"],
|
|
entry.Data["peersWithNoConfirmation"]);
|
|
}
|
|
}, null, TimeSpan.Zero, TimeSpan.FromMinutes(5));
|
|
```
|
|
|
|
## Deployment Checklist
|
|
|
|
- [ ] Configuration file created (appsettings.json)
|
|
- [ ] Log directory permissions set
|
|
- [ ] Backup directory configured
|
|
- [ ] Database file location specified
|
|
- [ ] Network ports configured (firewall)
|
|
- [ ] Health check endpoint tested
|
|
- [ ] Offline queue tested
|
|
- [ ] Backup/restore tested
|
|
- [ ] Graceful shutdown tested
|
|
|
|
## Troubleshooting
|
|
|
|
### Database corruption
|
|
```csharp
|
|
try
|
|
{
|
|
await store.CheckIntegrityAsync();
|
|
}
|
|
catch (DatabaseCorruptionException)
|
|
{
|
|
// Restore from backup
|
|
File.Copy("backups/latest.db", options.Persistence.DatabasePath, overwrite: true);
|
|
}
|
|
```
|
|
|
|
### Network issues
|
|
```
|
|
Check sync tracker:
|
|
- Last sync time
|
|
- Active peers
|
|
- Recent errors
|
|
```
|
|
|
|
### Performance degradation
|
|
```csharp
|
|
var stats = cache.GetStatistics();
|
|
if (stats.HitRate < 0.5)
|
|
{
|
|
// Consider increasing cache size
|
|
options.Persistence.CacheSizeMb = 100;
|
|
}
|
|
```
|