fix(configuration): resolve Low code-review findings (Configuration-004,005,007,010,011)

- Configuration-004: NodePermissions stored as int to match the EF
  HasConversion<int>() in OtOpcUaConfigDbContext.ConfigureNodeAcl.
- Configuration-005: serialise LiteDbConfigCache.PutAsync so concurrent
  calls for the same (ClusterId, GenerationId) cannot insert duplicate rows.
- Configuration-007: rethrow OperationCanceledException from
  GenerationApplier.ApplyPass when the caller's token is cancelled.
- Configuration-010: scrub secrets and drop the full exception object
  from the ResilientConfigReader fallback warning log.
- Configuration-011: pin the previously-uncovered GenerationApplier
  cancellation and path-length / publish-validation paths.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 05:38:18 -04:00
parent 8be6afbda4
commit b92fea15d4
10 changed files with 327 additions and 27 deletions

View File

@@ -19,6 +19,10 @@ public sealed class GenerationApplier(ApplyCallbacks callbacks) : IGenerationApp
foreach (var kind in new[] { ChangeKind.Added, ChangeKind.Modified })
{
// Honour cancellation between passes — a caller can abort the apply between Removed
// and Added phases even if individual callbacks don't observe the token themselves
// (Configuration-007).
ct.ThrowIfCancellationRequested();
await ApplyPass(diff.Namespaces, kind, callbacks.OnNamespace, errors, ct);
await ApplyPass(diff.Drivers, kind, callbacks.OnDriver, errors, ct);
await ApplyPass(diff.Devices, kind, callbacks.OnDevice, errors, ct);
@@ -42,6 +46,12 @@ public sealed class GenerationApplier(ApplyCallbacks callbacks) : IGenerationApp
foreach (var change in changes.Where(c => c.Kind == kind))
{
try { await callback(change, ct); }
// Configuration-007: cancellation must propagate, not be silently recorded as an
// entity error. Distinguish caller cancellation (token signalled) from any
// OperationCanceledException raised independently of the caller's token, which we
// still want to surface as an entity error so a single misbehaving callback does
// not crash the entire apply.
catch (OperationCanceledException) when (ct.IsCancellationRequested) { throw; }
catch (Exception ex) { errors.Add($"{typeof(T).Name} {change.Kind} '{change.LogicalId}': {ex.Message}"); }
}
}

View File

@@ -5,7 +5,7 @@ namespace ZB.MOM.WW.OtOpcUa.Configuration.Enums;
/// Stored as <c>int</c> bitmask in <see cref="Entities.NodeAcl.PermissionFlags"/>.
/// </summary>
[Flags]
public enum NodePermissions : uint
public enum NodePermissions : int
{
None = 0,

View File

@@ -4,6 +4,13 @@ namespace ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
/// Per-node local cache of the most-recently-applied generation(s). Used to bootstrap the
/// address space when the central DB is unreachable (decision #79 — degraded-but-running).
/// </summary>
/// <remarks>
/// <para><b>Concurrency contract:</b> implementations must serialize writes — specifically,
/// <see cref="PutAsync"/> for the same <c>(ClusterId, GenerationId)</c> from concurrent
/// callers must not produce duplicate rows. Reads may run concurrently with reads and writes.
/// The <see cref="LiteDbConfigCache"/> implementation enforces this via an instance-level
/// <see cref="SemaphoreSlim"/> around the find-then-insert/update window.</para>
/// </remarks>
public interface ILocalConfigCache
{
Task<GenerationSnapshot?> GetMostRecentAsync(string clusterId, CancellationToken ct = default);

View File

@@ -13,6 +13,12 @@ public sealed class LiteDbConfigCache : ILocalConfigCache, IDisposable
private const string CollectionName = "generations";
private readonly LiteDatabase _db;
private readonly ILiteCollection<GenerationSnapshot> _col;
// PutAsync is a find-then-insert/update; without serialization, two concurrent puts for the
// same (ClusterId, GenerationId) can both observe `existing is null` and both Insert,
// producing duplicate rows (Configuration-005). Serialize writes through this semaphore so
// the read-modify-write block is atomic for a given instance. LiteDB itself only locks the
// page-level write, not the find-then-insert window.
private readonly SemaphoreSlim _writeGate = new(initialCount: 1, maxCount: 1);
public LiteDbConfigCache(string dbPath)
{
@@ -47,23 +53,32 @@ public sealed class LiteDbConfigCache : ILocalConfigCache, IDisposable
return Task.FromResult<GenerationSnapshot?>(snapshot);
}
public Task PutAsync(GenerationSnapshot snapshot, CancellationToken ct = default)
public async Task PutAsync(GenerationSnapshot snapshot, CancellationToken ct = default)
{
ct.ThrowIfCancellationRequested();
// upsert by (ClusterId, GenerationId) — replace in place if already cached
var existing = _col
.Find(s => s.ClusterId == snapshot.ClusterId && s.GenerationId == snapshot.GenerationId)
.FirstOrDefault();
if (existing is null)
_col.Insert(snapshot);
else
// Serialize the find-then-insert/update so concurrent callers do not observe a stale
// `existing is null` and both Insert (Configuration-005). LiteDB's per-call lock is
// not enough — the read and the write are independent calls.
await _writeGate.WaitAsync(ct).ConfigureAwait(false);
try
{
snapshot.Id = existing.Id;
_col.Update(snapshot);
}
// upsert by (ClusterId, GenerationId) — replace in place if already cached
var existing = _col
.Find(s => s.ClusterId == snapshot.ClusterId && s.GenerationId == snapshot.GenerationId)
.FirstOrDefault();
return Task.CompletedTask;
if (existing is null)
_col.Insert(snapshot);
else
{
snapshot.Id = existing.Id;
_col.Update(snapshot);
}
}
finally
{
_writeGate.Release();
}
}
public Task PruneOldGenerationsAsync(string clusterId, int keepLatest = 10, CancellationToken ct = default)
@@ -82,7 +97,11 @@ public sealed class LiteDbConfigCache : ILocalConfigCache, IDisposable
return Task.CompletedTask;
}
public void Dispose() => _db.Dispose();
/// <summary>
/// Releases the resources owned by this cache instance: the write-gate semaphore is
/// disposed first, then the underlying LiteDB database handle.
/// </summary>
public void Dispose()
{
_writeGate.Dispose();
_db.Dispose();
}
}
public sealed class LocalConfigCacheCorruptException(string message, Exception inner)

View File

@@ -1,3 +1,4 @@
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Polly;
using Polly.Retry;
@@ -61,6 +62,23 @@ public sealed class ResilientConfigReader
_pipeline = builder.Build();
}
/// <summary>
/// Configuration-010: matches connection-string style <c>Key=Value</c> fragments whose key
/// is a known credential name (Password, Pwd, User Id, Uid, AccessToken, Authorization,
/// ApiKey), together with the value. The value may be double-quoted, single-quoted, or
/// bare: a quoted value is consumed up to its closing quote so that an embedded <c>;</c>,
/// <c>,</c>, <c>)</c> or whitespace cannot leave the tail of a secret behind. A bare (or
/// unterminated-quote) value still ends at the first <c>;</c>, <c>,</c>, <c>)</c> or
/// whitespace. Matching is case-insensitive and culture-invariant so the key names are
/// recognised regardless of the process culture (e.g. Turkish dotted/dotless I).
/// </summary>
private static readonly Regex SecretsRegex = new(
    @"(?ix)\b(Password|Pwd|User\s*Id|Uid|AccessToken|Authorization|Api[-_]?Key)\s*=\s*(?:""[^""]*""|'[^']*'|[^;,)\s]*)",
    RegexOptions.Compiled | RegexOptions.CultureInvariant);

/// <summary>
/// Replaces every credential fragment found in <paramref name="message"/> with a fixed
/// redaction marker so the text is safe to write to logs.
/// </summary>
/// <param name="message">Text to sanitise; may be <c>null</c> or empty.</param>
/// <returns>
/// The scrubbed text, or an empty string when <paramref name="message"/> is <c>null</c>
/// or empty.
/// </returns>
internal static string ScrubSecrets(string? message)
{
    if (string.IsNullOrEmpty(message)) return string.Empty;

    // Replace the entire matched fragment (key + value) with a redaction marker so the
    // key name itself doesn't leak — log scrapers grep for "Password=" too.
    return SecretsRegex.Replace(message, "[redacted credential]");
}
/// <summary>
/// Execute <paramref name="centralFetch"/> through the resilience pipeline. On full failure
/// (post-retry), reads the sealed cache for <paramref name="clusterId"/> and passes the
@@ -88,7 +106,15 @@ public sealed class ResilientConfigReader
// that case, not propagate. Only rethrow if the caller actually requested cancellation.
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
_logger.LogWarning(ex, "Central-DB read failed after retries; falling back to sealed cache for cluster {ClusterId}", clusterId);
// Configuration-010: do NOT pass the raw exception object — it carries the stack
// and inner-exception chain, and SqlException/wrapping delegates can surface
// connection-string fragments (Password=…, User Id=…) embedded in messages.
// Log only the exception type and a scrubbed message so secrets stay out of logs.
_logger.LogWarning(
"Central-DB read failed after retries ({ExceptionType}: {SanitizedMessage}); falling back to sealed cache for cluster {ClusterId}",
ex.GetType().Name,
ScrubSecrets(ex.Message),
clusterId);
// GenerationCacheUnavailableException surfaces intentionally — fails the caller's
// operation. StaleConfigFlag stays unchanged; the flag only flips when we actually
// served a cache snapshot.