b92fea15d4
- Configuration-004: NodePermissions stored as int to match the EF HasConversion<int>() in OtOpcUaConfigDbContext.ConfigureNodeAcl. - Configuration-005: serialise LiteDbConfigCache.PutAsync so concurrent Put for the same (ClusterId, GenerationId) cannot duplicate rows. - Configuration-007: rethrow OperationCanceledException from GenerationApplier.ApplyPass when the caller's token is cancelled. - Configuration-010: scrub secrets and drop the full exception object from the ResilientConfigReader fallback warning log. - Configuration-011: pin the previously-uncovered GenerationApplier cancellation and path-length / publish-validation paths. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
127 lines
6.1 KiB
C#
127 lines
6.1 KiB
C#
using System.Text.RegularExpressions;
|
||
using Microsoft.Extensions.Logging;
|
||
using Polly;
|
||
using Polly.Retry;
|
||
using Polly.Timeout;
|
||
|
||
namespace ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
|
||
|
||
/// <summary>
/// Wraps a central-DB fetch function with Phase 6.1 Stream D.2 resilience:
/// <b>timeout 2 s → retry 3× jittered → fallback to sealed cache</b>. Maintains the
/// <see cref="StaleConfigFlag"/> — fresh on central-DB success, stale on cache fallback.
/// </summary>
/// <remarks>
/// <para>Read-path only per plan. The write path (draft save, publish) bypasses this
/// wrapper entirely and fails hard on DB outage so inconsistent writes never land.</para>
///
/// <para>Fallback is triggered by <b>any exception</b> the fetch raises (central-DB
/// unreachable, SqlException, timeout), with one exception: an
/// <see cref="OperationCanceledException"/> observed while the caller's own token is
/// cancelled propagates unchanged. If the sealed cache also fails (no pointer,
/// corrupt file, etc.), <see cref="GenerationCacheUnavailableException"/> surfaces — caller
/// must fail the current request (InitializeAsync for a driver, etc.).</para>
/// </remarks>
public sealed class ResilientConfigReader
{
    private readonly GenerationSealedCache _cache;
    private readonly StaleConfigFlag _staleFlag;
    private readonly ResiliencePipeline _pipeline;
    private readonly ILogger<ResilientConfigReader> _logger;

    /// <summary>
    /// Builds the resilience pipeline (timeout, then optional retry) used by
    /// <see cref="ReadAsync{T}"/>.
    /// </summary>
    /// <param name="cache">Sealed cache read when the central fetch fails.</param>
    /// <param name="staleFlag">Flag marked fresh on central success and stale on cache fallback.</param>
    /// <param name="logger">Receives the (scrubbed) fallback warning.</param>
    /// <param name="timeout">Timeout for the timeout strategy; 2 seconds when <c>null</c>.</param>
    /// <param name="retryCount">
    /// Maximum retry attempts. A value of zero or less adds no retry strategy at all.
    /// </param>
    public ResilientConfigReader(
        GenerationSealedCache cache,
        StaleConfigFlag staleFlag,
        ILogger<ResilientConfigReader> logger,
        TimeSpan? timeout = null,
        int retryCount = 3)
    {
        _cache = cache;
        _staleFlag = staleFlag;
        _logger = logger;

        // NOTE(review): Polly v8 treats the first strategy added as the outermost one, so this
        // timeout appears to bound the whole retry sequence rather than each individual
        // attempt — confirm that is the intended budget.
        var builder = new ResiliencePipelineBuilder()
            .AddTimeout(new TimeoutStrategyOptions { Timeout = timeout ?? TimeSpan.FromSeconds(2) });

        if (retryCount > 0)
        {
            builder.AddRetry(new RetryStrategyOptions
            {
                MaxRetryAttempts = retryCount,
                BackoffType = DelayBackoffType.Exponential,
                UseJitter = true,
                Delay = TimeSpan.FromMilliseconds(100),
                MaxDelay = TimeSpan.FromSeconds(1),
                // Handle ALL exceptions including OperationCanceledException. A SQL command-level
                // timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
                // when the caller's token is NOT cancelled, and must be retried just like any other
                // transient error. Polly itself checks the cancellation token between retries and
                // stops with OperationCanceledException on genuine caller cancellation regardless of
                // this predicate.
                ShouldHandle = new PredicateBuilder().Handle<Exception>(),
            });
        }

        _pipeline = builder.Build();
    }

    /// <summary>
    /// Configuration-010: redact connection-string fragments (Password, User Id, Pwd, etc.)
    /// that a caller's exception message could carry. Conservative regex pass — anything
    /// matching <c>Key=Value</c> with a known credential key gets its value replaced.
    /// </summary>
    /// <remarks>
    /// The value is matched as, in order of preference: a complete double-quoted value, a
    /// complete single-quoted value (a doubled quote inside either counts as an escaped quote),
    /// or an unquoted run ending at the first <c>;</c>, <c>,</c>, <c>)</c> or whitespace.
    /// Matching quoted values first keeps a secret such as <c>Password="a;b c"</c> from being
    /// cut at the embedded delimiter and partially leaked. An unterminated quote falls through
    /// to the unquoted rule.
    /// </remarks>
    private static readonly Regex SecretsRegex = new(
        @"(?ix)\b(Password|Pwd|User\s*Id|Uid|AccessToken|Authorization|Api[-_]?Key)\s*=\s*" +
        @"(?:""(?:[^""]|"""")*""|'(?:[^']|'')*'|[^;,)\s]*)",
        RegexOptions.Compiled);

    /// <summary>
    /// Replaces every credential-looking <c>Key=Value</c> fragment in
    /// <paramref name="message"/> with a fixed redaction marker.
    /// </summary>
    /// <param name="message">Text to sanitise; may be <c>null</c>.</param>
    /// <returns>
    /// The sanitised text, or an empty string when <paramref name="message"/> is
    /// <c>null</c> or empty.
    /// </returns>
    internal static string ScrubSecrets(string? message)
    {
        if (string.IsNullOrEmpty(message))
        {
            return message ?? string.Empty;
        }

        // Replace the entire matched fragment (key + value) with a redaction marker so the
        // key name itself doesn't leak — log scrapers grep for "Password=" too.
        return SecretsRegex.Replace(message, "[redacted credential]");
    }

    /// <summary>
    /// Execute <paramref name="centralFetch"/> through the resilience pipeline. On full failure
    /// (post-retry), reads the sealed cache for <paramref name="clusterId"/> and passes the
    /// snapshot to <paramref name="fromSnapshot"/> to extract the requested shape.
    /// </summary>
    /// <typeparam name="T">Shape produced by both the central fetch and the snapshot projection.</typeparam>
    /// <param name="clusterId">Cluster whose sealed cache is read on fallback; must be non-blank.</param>
    /// <param name="centralFetch">Central-DB fetch executed through the pipeline.</param>
    /// <param name="fromSnapshot">Projects the cached snapshot to <typeparamref name="T"/> on fallback.</param>
    /// <param name="cancellationToken">Caller's token, passed to both the pipeline and the cache read.</param>
    /// <returns>
    /// The central fetch result (flag marked fresh), or the projected cache snapshot
    /// (flag marked stale).
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="clusterId"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="centralFetch"/> or <paramref name="fromSnapshot"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// The pipeline threw it while <paramref name="cancellationToken"/> was cancelled.
    /// </exception>
    public async ValueTask<T> ReadAsync<T>(
        string clusterId,
        Func<CancellationToken, ValueTask<T>> centralFetch,
        Func<GenerationSnapshot, T> fromSnapshot,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(clusterId);
        ArgumentNullException.ThrowIfNull(centralFetch);
        ArgumentNullException.ThrowIfNull(fromSnapshot);

        try
        {
            var result = await _pipeline.ExecuteAsync(centralFetch, cancellationToken).ConfigureAwait(false);
            _staleFlag.MarkFresh();
            return result;
        }
        // Catch all exceptions that are NOT genuine caller cancellations. A SQL command-level
        // timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
        // but the caller's token is NOT cancelled — we must fall back to the sealed cache for
        // that case, not propagate. Only rethrow if the caller actually requested cancellation.
        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            // Configuration-010: do NOT pass the raw exception object — it carries the stack
            // and inner-exception chain, and SqlException/wrapping delegates can surface
            // connection-string fragments (Password=…, User Id=…) embedded in messages.
            // Log only the exception type and a scrubbed message so secrets stay out of logs.
            _logger.LogWarning(
                "Central-DB read failed after retries ({ExceptionType}: {SanitizedMessage}); falling back to sealed cache for cluster {ClusterId}",
                ex.GetType().Name,
                ScrubSecrets(ex.Message),
                clusterId);

            // GenerationCacheUnavailableException surfaces intentionally — fails the caller's
            // operation. StaleConfigFlag stays unchanged; the flag only flips when we actually
            // served a cache snapshot.
            var snapshot = await _cache.ReadCurrentAsync(clusterId, cancellationToken).ConfigureAwait(false);
            _staleFlag.MarkStale();
            return fromSnapshot(snapshot);
        }
    }
}
|