Files
lmxopcua/src/Core/ZB.MOM.WW.OtOpcUa.Configuration/LocalCache/ResilientConfigReader.cs
T
Joseph Doherty b92fea15d4 fix(configuration): resolve Low code-review findings (Configuration-004,005,007,010,011)
- Configuration-004: NodePermissions stored as int to match the EF
  HasConversion<int>() in OtOpcUaConfigDbContext.ConfigureNodeAcl.
- Configuration-005: serialise LiteDbConfigCache.PutAsync so concurrent
  Put for the same (ClusterId, GenerationId) cannot duplicate rows.
- Configuration-007: rethrow OperationCanceledException from
  GenerationApplier.ApplyPass when the caller's token is cancelled.
- Configuration-010: scrub secrets and drop the full exception object
  from the ResilientConfigReader fallback warning log.
- Configuration-011: pin the previously-uncovered GenerationApplier
  cancellation and path-length / publish-validation paths.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 05:38:18 -04:00

127 lines
6.1 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Polly;
using Polly.Retry;
using Polly.Timeout;
namespace ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
/// <summary>
/// Wraps a central-DB fetch function with Phase 6.1 Stream D.2 resilience:
/// <b>timeout 2 s → retry 3× jittered → fallback to sealed cache</b>. Maintains the
/// <see cref="StaleConfigFlag"/> — fresh on central-DB success, stale on cache fallback.
/// </summary>
/// <remarks>
/// <para>Read-path only per plan. The write path (draft save, publish) bypasses this
/// wrapper entirely and fails hard on DB outage so inconsistent writes never land.</para>
///
/// <para>Fallback is triggered by <b>any exception</b> the fetch raises (central-DB
/// unreachable, SqlException, timeout). If the sealed cache also fails (no pointer,
/// corrupt file, etc.), <see cref="GenerationCacheUnavailableException"/> surfaces — caller
/// must fail the current request (InitializeAsync for a driver, etc.).</para>
/// </remarks>
public sealed class ResilientConfigReader
{
private readonly GenerationSealedCache _cache;
private readonly StaleConfigFlag _staleFlag;
private readonly ResiliencePipeline _pipeline;
private readonly ILogger<ResilientConfigReader> _logger;
public ResilientConfigReader(
GenerationSealedCache cache,
StaleConfigFlag staleFlag,
ILogger<ResilientConfigReader> logger,
TimeSpan? timeout = null,
int retryCount = 3)
{
_cache = cache;
_staleFlag = staleFlag;
_logger = logger;
var builder = new ResiliencePipelineBuilder()
.AddTimeout(new TimeoutStrategyOptions { Timeout = timeout ?? TimeSpan.FromSeconds(2) });
if (retryCount > 0)
{
builder.AddRetry(new RetryStrategyOptions
{
MaxRetryAttempts = retryCount,
BackoffType = DelayBackoffType.Exponential,
UseJitter = true,
Delay = TimeSpan.FromMilliseconds(100),
MaxDelay = TimeSpan.FromSeconds(1),
// Handle ALL exceptions including OperationCanceledException. A SQL command-level
// timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
// when the caller's token is NOT cancelled, and must be retried just like any other
// transient error. Polly itself checks the cancellation token between retries and
// stops with OperationCanceledException on genuine caller cancellation regardless of
// this predicate.
ShouldHandle = new PredicateBuilder().Handle<Exception>(),
});
}
_pipeline = builder.Build();
}
/// <summary>
/// Configuration-010: redact connection-string fragments (Password, User Id, Pwd, etc.)
/// that a caller's exception message could carry. Conservative regex pass — anything
/// matching <c>Key=Value</c> with a known credential key gets its value replaced.
/// </summary>
private static readonly Regex SecretsRegex = new(
@"(?ix)\b(Password|Pwd|User\s*Id|Uid|AccessToken|Authorization|Api[-_]?Key)\s*=\s*[^;,)\s]*",
RegexOptions.Compiled);
internal static string ScrubSecrets(string? message)
{
if (string.IsNullOrEmpty(message)) return message ?? string.Empty;
// Replace the entire matched fragment (key + value) with a redaction marker so the
// key name itself doesn't leak — log scrapers grep for "Password=" too.
return SecretsRegex.Replace(message, "[redacted credential]");
}
/// <summary>
/// Execute <paramref name="centralFetch"/> through the resilience pipeline. On full failure
/// (post-retry), reads the sealed cache for <paramref name="clusterId"/> and passes the
/// snapshot to <paramref name="fromSnapshot"/> to extract the requested shape.
/// </summary>
public async ValueTask<T> ReadAsync<T>(
string clusterId,
Func<CancellationToken, ValueTask<T>> centralFetch,
Func<GenerationSnapshot, T> fromSnapshot,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(clusterId);
ArgumentNullException.ThrowIfNull(centralFetch);
ArgumentNullException.ThrowIfNull(fromSnapshot);
try
{
var result = await _pipeline.ExecuteAsync(centralFetch, cancellationToken).ConfigureAwait(false);
_staleFlag.MarkFresh();
return result;
}
// Catch all exceptions that are NOT genuine caller cancellations. A SQL command-level
// timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
// but the caller's token is NOT cancelled — we must fall back to the sealed cache for
// that case, not propagate. Only rethrow if the caller actually requested cancellation.
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
// Configuration-010: do NOT pass the raw exception object — it carries the stack
// and inner-exception chain, and SqlException/wrapping delegates can surface
// connection-string fragments (Password=…, User Id=…) embedded in messages.
// Log only the exception type and a scrubbed message so secrets stay out of logs.
_logger.LogWarning(
"Central-DB read failed after retries ({ExceptionType}: {SanitizedMessage}); falling back to sealed cache for cluster {ClusterId}",
ex.GetType().Name,
ScrubSecrets(ex.Message),
clusterId);
// GenerationCacheUnavailableException surfaces intentionally — fails the caller's
// operation. StaleConfigFlag stays unchanged; the flag only flips when we actually
// served a cache snapshot.
var snapshot = await _cache.ReadCurrentAsync(clusterId, cancellationToken).ConfigureAwait(false);
_staleFlag.MarkStale();
return fromSnapshot(snapshot);
}
}
}