fix(commons): OverrideCsvParser — preserve literal mid-field quotes, error on unterminated quoted field (T16 CSV)

This commit is contained in:
Joseph Doherty
2026-06-18 02:13:10 -04:00
parent c799f41d53
commit 6a6f8949b9
2 changed files with 125 additions and 14 deletions
@@ -58,7 +58,11 @@ public static class OverrideCsvParser
if (string.IsNullOrWhiteSpace(rawLine))
continue;
var fields = SplitFields(rawLine);
if (!SplitFields(rawLine, out var fields))
{
errors.Add($"Line {lineNumber}: Unterminated quoted field.");
continue;
}
if (!headerSeen)
{
@@ -130,16 +134,30 @@ public static class OverrideCsvParser
private static string? NullIfEmpty(string field) => field.Length == 0 ? null : field;
/// <summary>
/// RFC-4180-ish field splitter for a single physical line: quoted fields may
/// embed commas and doubled quotes (<c>""</c> → <c>"</c>); unquoted fields are
/// whitespace-trimmed.
/// RFC-4180-ish field splitter for a single physical line. Quoting rules:
/// <list type="bullet">
/// <item>A field is <i>quoted</i> only if its first non-whitespace char is a
/// <c>"</c> (leading whitespace before the opening quote is allowed and
/// ignored). Inside a quoted field, commas are literal and <c>""</c> is an
/// escaped literal <c>"</c>; the closing <c>"</c> must be the last non-whitespace
/// char of the field (trailing whitespace after the close is allowed and
/// ignored).</item>
/// <item>A <c>"</c> appearing anywhere else in an unquoted field (i.e. after
/// non-whitespace content) is a <b>literal</b> character and is preserved.</item>
/// <item>Unquoted fields are whitespace-trimmed; quoted field values are kept
/// verbatim.</item>
/// </list>
/// Returns <c>true</c> with the split <paramref name="fields"/> on success;
/// returns <c>false</c> when a quoted field is opened but never closed before
/// end-of-line (the caller emits a per-line "unterminated" error).
/// </summary>
private static List<string> SplitFields(string line)
private static bool SplitFields(string line, out List<string> fields)
{
var fields = new List<string>();
fields = new List<string>();
var field = new System.Text.StringBuilder();
var inQuotes = false;
var quoted = false; // this field was (at least partly) quoted → preserve whitespace
var inQuotes = false; // currently between an opening and closing quote
var quoted = false; // this field opened with a quote → keep value verbatim
var sawContent = false; // any non-whitespace char seen in the current field yet
for (var i = 0; i < line.Length; i++)
{
@@ -157,7 +175,7 @@ public static class OverrideCsvParser
}
else
{
inQuotes = false;
inQuotes = false; // closing quote; only trailing whitespace may follow
}
}
else
@@ -170,23 +188,40 @@ public static class OverrideCsvParser
switch (c)
{
case '"':
inQuotes = true;
quoted = true;
break;
case ',':
fields.Add(Finalize(field, quoted));
field.Clear();
inQuotes = false;
quoted = false;
sawContent = false;
break;
case '"' when !sawContent:
// Opening quote: first non-whitespace char of the field. Any
// leading whitespace seen so far is part of the (ignored) prefix.
field.Clear();
inQuotes = true;
quoted = true;
sawContent = true;
break;
default:
// After a quoted field has closed, only whitespace may appear
// before the next delimiter — it is ignored, not appended.
if (quoted)
break;
// A '"' here (sawContent already true) falls through as a literal.
if (!char.IsWhiteSpace(c))
sawContent = true;
field.Append(c);
break;
}
}
if (inQuotes)
return false; // opened a quoted field that was never closed
fields.Add(Finalize(field, quoted));
return fields;
return true;
}
private static string Finalize(System.Text.StringBuilder field, bool quoted)