using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Polly;
using Polly.Retry;
using Polly.Timeout;
namespace ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
///
/// Wraps a central-DB fetch function with Phase 6.1 Stream D.2 resilience:
/// timeout 2 s → retry 3× jittered → fallback to sealed cache. Maintains the
/// — fresh on central-DB success, stale on cache fallback.
///
///
/// Read-path only per plan. The write path (draft save, publish) bypasses this
/// wrapper entirely and fails hard on DB outage so inconsistent writes never land.
///
/// Fallback is triggered by any exception the fetch raises (central-DB
/// unreachable, SqlException, timeout). If the sealed cache also fails (no pointer,
/// corrupt file, etc.), surfaces — caller
/// must fail the current request (InitializeAsync for a driver, etc.).
///
public sealed class ResilientConfigReader
{
private readonly GenerationSealedCache _cache;
private readonly StaleConfigFlag _staleFlag;
private readonly ResiliencePipeline _pipeline;
private readonly ILogger _logger;
public ResilientConfigReader(
GenerationSealedCache cache,
StaleConfigFlag staleFlag,
ILogger logger,
TimeSpan? timeout = null,
int retryCount = 3)
{
_cache = cache;
_staleFlag = staleFlag;
_logger = logger;
var builder = new ResiliencePipelineBuilder()
.AddTimeout(new TimeoutStrategyOptions { Timeout = timeout ?? TimeSpan.FromSeconds(2) });
if (retryCount > 0)
{
builder.AddRetry(new RetryStrategyOptions
{
MaxRetryAttempts = retryCount,
BackoffType = DelayBackoffType.Exponential,
UseJitter = true,
Delay = TimeSpan.FromMilliseconds(100),
MaxDelay = TimeSpan.FromSeconds(1),
// Handle ALL exceptions including OperationCanceledException. A SQL command-level
// timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
// when the caller's token is NOT cancelled, and must be retried just like any other
// transient error. Polly itself checks the cancellation token between retries and
// stops with OperationCanceledException on genuine caller cancellation regardless of
// this predicate.
ShouldHandle = new PredicateBuilder().Handle(),
});
}
_pipeline = builder.Build();
}
///
/// Configuration-010: redact connection-string fragments (Password, User Id, Pwd, etc.)
/// that a caller's exception message could carry. Conservative regex pass — anything
/// matching Key=Value with a known credential key gets its value replaced.
///
private static readonly Regex SecretsRegex = new(
@"(?ix)\b(Password|Pwd|User\s*Id|Uid|AccessToken|Authorization|Api[-_]?Key)\s*=\s*[^;,)\s]*",
RegexOptions.Compiled);
internal static string ScrubSecrets(string? message)
{
if (string.IsNullOrEmpty(message)) return message ?? string.Empty;
// Replace the entire matched fragment (key + value) with a redaction marker so the
// key name itself doesn't leak — log scrapers grep for "Password=" too.
return SecretsRegex.Replace(message, "[redacted credential]");
}
///
/// Execute through the resilience pipeline. On full failure
/// (post-retry), reads the sealed cache for and passes the
/// snapshot to to extract the requested shape.
///
public async ValueTask ReadAsync(
string clusterId,
Func> centralFetch,
Func fromSnapshot,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(clusterId);
ArgumentNullException.ThrowIfNull(centralFetch);
ArgumentNullException.ThrowIfNull(fromSnapshot);
try
{
var result = await _pipeline.ExecuteAsync(centralFetch, cancellationToken).ConfigureAwait(false);
_staleFlag.MarkFresh();
return result;
}
// Catch all exceptions that are NOT genuine caller cancellations. A SQL command-level
// timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
// but the caller's token is NOT cancelled — we must fall back to the sealed cache for
// that case, not propagate. Only rethrow if the caller actually requested cancellation.
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
// Configuration-010: do NOT pass the raw exception object — it carries the stack
// and inner-exception chain, and SqlException/wrapping delegates can surface
// connection-string fragments (Password=…, User Id=…) embedded in messages.
// Log only the exception type and a scrubbed message so secrets stay out of logs.
_logger.LogWarning(
"Central-DB read failed after retries ({ExceptionType}: {SanitizedMessage}); falling back to sealed cache for cluster {ClusterId}",
ex.GetType().Name,
ScrubSecrets(ex.Message),
clusterId);
// GenerationCacheUnavailableException surfaces intentionally — fails the caller's
// operation. StaleConfigFlag stays unchanged; the flag only flips when we actually
// served a cache snapshot.
var snapshot = await _cache.ReadCurrentAsync(clusterId, cancellationToken).ConfigureAwait(false);
_staleFlag.MarkStale();
return fromSnapshot(snapshot);
}
}
}