fix(commons): OverrideCsvParser — preserve literal mid-field quotes, error on unterminated quoted field (T16 CSV)
This commit is contained in:
@@ -58,7 +58,11 @@ public static class OverrideCsvParser
|
||||
if (string.IsNullOrWhiteSpace(rawLine))
|
||||
continue;
|
||||
|
||||
var fields = SplitFields(rawLine);
|
||||
if (!SplitFields(rawLine, out var fields))
|
||||
{
|
||||
errors.Add($"Line {lineNumber}: Unterminated quoted field.");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!headerSeen)
|
||||
{
|
||||
@@ -130,16 +134,30 @@ public static class OverrideCsvParser
|
||||
private static string? NullIfEmpty(string field) => field.Length == 0 ? null : field;
|
||||
|
||||
/// <summary>
|
||||
/// RFC-4180-ish field splitter for a single physical line: quoted fields may
|
||||
/// embed commas and doubled quotes (<c>""</c> → <c>"</c>); unquoted fields are
|
||||
/// whitespace-trimmed.
|
||||
/// RFC-4180-ish field splitter for a single physical line. Quoting rules:
|
||||
/// <list type="bullet">
|
||||
/// <item>A field is <i>quoted</i> only if its first non-whitespace char is a
|
||||
/// <c>"</c> (leading whitespace before the opening quote is allowed and
|
||||
/// ignored). Inside a quoted field, commas are literal and <c>""</c> is an
|
||||
/// escape for one literal <c>"</c>; the closing <c>"</c> must be the last non-whitespace
|
||||
/// char of the field (trailing whitespace after the close is allowed and
|
||||
/// ignored).</item>
|
||||
/// <item>A <c>"</c> appearing anywhere else in an unquoted field (i.e. after
|
||||
/// non-whitespace content) is a <b>literal</b> character and is preserved.</item>
|
||||
/// <item>Unquoted fields are whitespace-trimmed; quoted field values are kept
|
||||
/// verbatim.</item>
|
||||
/// </list>
|
||||
/// Returns <c>true</c> with the split <paramref name="fields"/> on success;
|
||||
/// returns <c>false</c> when a quoted field is opened but never closed before
|
||||
/// end-of-line (the caller emits a per-line "unterminated" error).
|
||||
/// </summary>
|
||||
private static List<string> SplitFields(string line)
|
||||
private static bool SplitFields(string line, out List<string> fields)
|
||||
{
|
||||
var fields = new List<string>();
|
||||
fields = new List<string>();
|
||||
var field = new System.Text.StringBuilder();
|
||||
var inQuotes = false;
|
||||
var quoted = false; // this field was (at least partly) quoted → preserve whitespace
|
||||
var inQuotes = false; // currently between an opening and closing quote
|
||||
var quoted = false; // this field opened with a quote → keep value verbatim
|
||||
var sawContent = false; // any non-whitespace char seen in the current field yet
|
||||
|
||||
for (var i = 0; i < line.Length; i++)
|
||||
{
|
||||
@@ -157,7 +175,7 @@ public static class OverrideCsvParser
|
||||
}
|
||||
else
|
||||
{
|
||||
inQuotes = false;
|
||||
inQuotes = false; // closing quote; only trailing whitespace may follow
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -170,23 +188,40 @@ public static class OverrideCsvParser
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case '"':
|
||||
inQuotes = true;
|
||||
quoted = true;
|
||||
break;
|
||||
case ',':
|
||||
fields.Add(Finalize(field, quoted));
|
||||
field.Clear();
|
||||
inQuotes = false;
|
||||
quoted = false;
|
||||
sawContent = false;
|
||||
break;
|
||||
case '"' when !sawContent:
|
||||
// Opening quote: first non-whitespace char of the field. Any
|
||||
// leading whitespace seen so far is part of the (ignored) prefix.
|
||||
field.Clear();
|
||||
inQuotes = true;
|
||||
quoted = true;
|
||||
sawContent = true;
|
||||
break;
|
||||
default:
|
||||
// After a quoted field has closed, only whitespace may appear
|
||||
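// before the next delimiter — it is ignored, not appended.
// NOTE(review): the check below skips every character once the field is
// marked quoted, so non-whitespace text after the closing quote appears to
// be dropped silently rather than reported — confirm this is intended.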
|
||||
if (quoted)
|
||||
break;
|
||||
|
||||
// A '"' here (sawContent already true) falls through as a literal.
|
||||
if (!char.IsWhiteSpace(c))
|
||||
sawContent = true;
|
||||
field.Append(c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (inQuotes)
|
||||
return false; // opened a quoted field that was never closed
|
||||
|
||||
fields.Add(Finalize(field, quoted));
|
||||
return fields;
|
||||
return true;
|
||||
}
|
||||
|
||||
private static string Finalize(System.Text.StringBuilder field, bool quoted)
|
||||
|
||||
@@ -169,4 +169,80 @@ public class OverrideCsvParserTests
|
||||
Assert.Equal("42", result.Rows[0].Value);
|
||||
Assert.Equal(" spaced ", result.Rows[1].Value);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Parse_MidFieldUnquotedQuote_IsPreservedAsLiteral()
|
||||
{
|
||||
// Scenario: a '"' that does NOT open a field (it follows non-whitespace content
|
||||
// in an unquoted field) is a literal character — 'va"lue' must survive intact.
// Arrange: a header line plus one data row whose Value column is va"lue.
|
||||
const string csv = "AttributeName,Value,ElementType\nName,va\"lue,Type\n";
|
||||
|
||||
// Act: parse the whole document.
var result = OverrideCsvParser.Parse(csv);
|
||||
|
||||
// Assert: no errors, exactly one row, and the quote is still inside Value
// while the neighbouring columns are unaffected.
Assert.Empty(result.Errors);
|
||||
var row = Assert.Single(result.Rows);
|
||||
Assert.Equal("Name", row.AttributeName);
|
||||
Assert.Equal("va\"lue", row.Value);
|
||||
Assert.Equal("Type", row.ElementType);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Parse_UnterminatedQuotedField_ProducesLineNumberedErrorAndExcludesRow()
|
||||
{
|
||||
// Scenario: a quote opens the field but is never closed before end-of-line →
// the line is malformed and must be reported instead of producing a row.
// Arrange: the bad row is the second physical line (the header is the first).
|
||||
const string csv = "AttributeName,Value\nName,\"unclosed\n";
|
||||
|
||||
// Act: parse the whole document.
var result = OverrideCsvParser.Parse(csv);
|
||||
|
||||
// Assert: the malformed line yields no row and exactly one error.
Assert.Empty(result.Rows);
|
||||
var error = Assert.Single(result.Errors);
|
||||
// The error text is expected to carry the line number of the bad row.
// NOTE(review): Contains("2") is a loose match — any '2' in the message
// satisfies it; consider matching the line-number phrase instead.
Assert.Contains("2", error);
|
||||
// Case-insensitive so the check does not depend on the message's capitalisation.
Assert.Contains("Unterminated", error, StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Parse_WellFormedQuotedFieldWithComma_StillParses()
|
||||
{
|
||||
// Regression guard alongside the unterminated-quote fix: a properly closed
|
||||
// quoted field embedding a comma must still round-trip.
// Arrange: the Value column is the quoted text "a,b"; the embedded comma must
// not be treated as a column separator.
|
||||
const string csv = "AttributeName,Value,ElementType\nName,\"a,b\",Type\n";
|
||||
|
||||
// Act: parse the whole document.
var result = OverrideCsvParser.Parse(csv);
|
||||
|
||||
// Assert: no errors, one row, three columns; Value holds a,b without the
// surrounding quotes.
Assert.Empty(result.Errors);
|
||||
var row = Assert.Single(result.Rows);
|
||||
Assert.Equal("Name", row.AttributeName);
|
||||
Assert.Equal("a,b", row.Value);
|
||||
Assert.Equal("Type", row.ElementType);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Parse_EmptyQuotedField_YieldsNullValue()
|
||||
{
|
||||
// Scenario: a bare "" is an empty quoted field; empty → null per the
// empty-field rule (presumably the parser's NullIfEmpty helper — the mapping
// from field to Value is not visible from this test).
|
||||
const string csv = "AttributeName,Value,ElementType\nName,\"\",Type\n";
|
||||
|
||||
// Act: parse the whole document.
var result = OverrideCsvParser.Parse(csv);
|
||||
|
||||
// Assert: the row is accepted without errors, Value is null (not an empty
// string), and the columns on either side are still populated.
Assert.Empty(result.Errors);
|
||||
var row = Assert.Single(result.Rows);
|
||||
Assert.Equal("Name", row.AttributeName);
|
||||
Assert.Null(row.Value);
|
||||
Assert.Equal("Type", row.ElementType);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Parse_QuotedFieldWithTrailingWhitespaceAfterClose_IsAccepted()
|
||||
{
|
||||
// Scenario: the closing quote may be followed by ignorable trailing whitespace
|
||||
// before the delimiter; the field value itself is preserved verbatim.
// Arrange: note the single space between the closing quote and the comma.
|
||||
const string csv = "AttributeName,Value,ElementType\nName,\"a,b\" ,Type\n";
|
||||
|
||||
// Act: parse the whole document.
var result = OverrideCsvParser.Parse(csv);
|
||||
|
||||
// Assert: no errors and one row; the trailing space is not part of Value and
// the following column is still split off correctly.
Assert.Empty(result.Errors);
|
||||
var row = Assert.Single(result.Rows);
|
||||
Assert.Equal("a,b", row.Value);
|
||||
Assert.Equal("Type", row.ElementType);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user