fix(commons): OverrideCsvParser — preserve literal mid-field quotes, error on unterminated quoted field (T16 CSV)
This commit is contained in:
@@ -58,7 +58,11 @@ public static class OverrideCsvParser
|
|||||||
if (string.IsNullOrWhiteSpace(rawLine))
|
if (string.IsNullOrWhiteSpace(rawLine))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
var fields = SplitFields(rawLine);
|
if (!SplitFields(rawLine, out var fields))
|
||||||
|
{
|
||||||
|
errors.Add($"Line {lineNumber}: Unterminated quoted field.");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (!headerSeen)
|
if (!headerSeen)
|
||||||
{
|
{
|
||||||
@@ -130,16 +134,30 @@ public static class OverrideCsvParser
|
|||||||
private static string? NullIfEmpty(string field) => field.Length == 0 ? null : field;
|
private static string? NullIfEmpty(string field) => field.Length == 0 ? null : field;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// RFC-4180-ish field splitter for a single physical line: quoted fields may
|
/// RFC-4180-ish field splitter for a single physical line. Quoting rules:
|
||||||
/// embed commas and doubled quotes (<c>""</c> → <c>"</c>); unquoted fields are
|
/// <list type="bullet">
|
||||||
/// whitespace-trimmed.
|
/// <item>A field is <i>quoted</i> only if its first non-whitespace char is a
|
||||||
|
/// <c>"</c> (leading whitespace before the opening quote is allowed and
|
||||||
|
/// ignored). Inside a quoted field, commas are literal and <c>""</c> is an
|
||||||
|
/// escaped single quote; the closing <c>"</c> must be the last non-whitespace
|
||||||
|
/// char of the field (trailing whitespace after the close is allowed and
|
||||||
|
/// ignored).</item>
|
||||||
|
/// <item>A <c>"</c> appearing anywhere else in an unquoted field (i.e. after
|
||||||
|
/// non-whitespace content) is a <b>literal</b> character and is preserved.</item>
|
||||||
|
/// <item>Unquoted fields are whitespace-trimmed; quoted field values are kept
|
||||||
|
/// verbatim.</item>
|
||||||
|
/// </list>
|
||||||
|
/// Returns <c>true</c> with the split <paramref name="fields"/> on success;
|
||||||
|
/// returns <c>false</c> when a quoted field is opened but never closed before
|
||||||
|
/// end-of-line (the caller emits a per-line "unterminated" error).
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private static List<string> SplitFields(string line)
|
private static bool SplitFields(string line, out List<string> fields)
|
||||||
{
|
{
|
||||||
var fields = new List<string>();
|
fields = new List<string>();
|
||||||
var field = new System.Text.StringBuilder();
|
var field = new System.Text.StringBuilder();
|
||||||
var inQuotes = false;
|
var inQuotes = false; // currently between an opening and closing quote
|
||||||
var quoted = false; // this field was (at least partly) quoted → preserve whitespace
|
var quoted = false; // this field opened with a quote → keep value verbatim
|
||||||
|
var sawContent = false; // any non-whitespace char seen in the current field yet
|
||||||
|
|
||||||
for (var i = 0; i < line.Length; i++)
|
for (var i = 0; i < line.Length; i++)
|
||||||
{
|
{
|
||||||
@@ -157,7 +175,7 @@ public static class OverrideCsvParser
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
inQuotes = false;
|
inQuotes = false; // closing quote; only trailing whitespace may follow
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -170,23 +188,40 @@ public static class OverrideCsvParser
|
|||||||
|
|
||||||
switch (c)
|
switch (c)
|
||||||
{
|
{
|
||||||
case '"':
|
|
||||||
inQuotes = true;
|
|
||||||
quoted = true;
|
|
||||||
break;
|
|
||||||
case ',':
|
case ',':
|
||||||
fields.Add(Finalize(field, quoted));
|
fields.Add(Finalize(field, quoted));
|
||||||
field.Clear();
|
field.Clear();
|
||||||
|
inQuotes = false;
|
||||||
quoted = false;
|
quoted = false;
|
||||||
|
sawContent = false;
|
||||||
|
break;
|
||||||
|
case '"' when !sawContent:
|
||||||
|
// Opening quote: first non-whitespace char of the field. Any
|
||||||
|
// leading whitespace seen so far is part of the (ignored) prefix.
|
||||||
|
field.Clear();
|
||||||
|
inQuotes = true;
|
||||||
|
quoted = true;
|
||||||
|
sawContent = true;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
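// After a quoted field has closed, only whitespace is expected before
|
||||||
|
// the next delimiter. NOTE(review): as written, ANY character here (not just whitespace) is dropped silently rather than reported — confirm this is intended.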
|
||||||
|
if (quoted)
|
||||||
|
break;
|
||||||
|
|
||||||
|
// A '"' here (sawContent already true) falls through as a literal.
|
||||||
|
if (!char.IsWhiteSpace(c))
|
||||||
|
sawContent = true;
|
||||||
field.Append(c);
|
field.Append(c);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (inQuotes)
|
||||||
|
return false; // opened a quoted field that was never closed
|
||||||
|
|
||||||
fields.Add(Finalize(field, quoted));
|
fields.Add(Finalize(field, quoted));
|
||||||
return fields;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string Finalize(System.Text.StringBuilder field, bool quoted)
|
private static string Finalize(System.Text.StringBuilder field, bool quoted)
|
||||||
|
|||||||
@@ -169,4 +169,80 @@ public class OverrideCsvParserTests
|
|||||||
Assert.Equal("42", result.Rows[0].Value);
|
Assert.Equal("42", result.Rows[0].Value);
|
||||||
Assert.Equal(" spaced ", result.Rows[1].Value);
|
Assert.Equal(" spaced ", result.Rows[1].Value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Parse_MidFieldUnquotedQuote_IsPreservedAsLiteral()
|
||||||
|
{
|
||||||
|
// A '"' that does NOT open a field (it follows non-whitespace content in an
|
||||||
|
// unquoted field) is a literal character — 'va"lue' must survive intact.
|
||||||
|
const string csv = "AttributeName,Value,ElementType\nName,va\"lue,Type\n";
|
||||||
|
|
||||||
|
var result = OverrideCsvParser.Parse(csv);
|
||||||
|
|
||||||
|
Assert.Empty(result.Errors);
|
||||||
|
var row = Assert.Single(result.Rows);
|
||||||
|
Assert.Equal("Name", row.AttributeName);
|
||||||
|
Assert.Equal("va\"lue", row.Value);
|
||||||
|
Assert.Equal("Type", row.ElementType);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Parse_UnterminatedQuotedField_ProducesLineNumberedErrorAndExcludesRow()
|
||||||
|
{
|
||||||
|
// A quote opens the field but is never closed before end-of-line → malformed.
|
||||||
|
const string csv = "AttributeName,Value\nName,\"unclosed\n";
|
||||||
|
|
||||||
|
var result = OverrideCsvParser.Parse(csv);
|
||||||
|
|
||||||
|
Assert.Empty(result.Rows);
|
||||||
|
var error = Assert.Single(result.Errors);
|
||||||
|
Assert.Contains("2", error);
|
||||||
|
Assert.Contains("Unterminated", error, StringComparison.OrdinalIgnoreCase);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Parse_WellFormedQuotedFieldWithComma_StillParses()
|
||||||
|
{
|
||||||
|
// Regression guard alongside the unterminated-quote fix: a properly closed
|
||||||
|
// quoted field embedding a comma must still round-trip.
|
||||||
|
const string csv = "AttributeName,Value,ElementType\nName,\"a,b\",Type\n";
|
||||||
|
|
||||||
|
var result = OverrideCsvParser.Parse(csv);
|
||||||
|
|
||||||
|
Assert.Empty(result.Errors);
|
||||||
|
var row = Assert.Single(result.Rows);
|
||||||
|
Assert.Equal("Name", row.AttributeName);
|
||||||
|
Assert.Equal("a,b", row.Value);
|
||||||
|
Assert.Equal("Type", row.ElementType);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Parse_EmptyQuotedField_YieldsNullValue()
|
||||||
|
{
|
||||||
|
// A bare "" is an empty quoted field; empty → null per the empty-field rule.
|
||||||
|
const string csv = "AttributeName,Value,ElementType\nName,\"\",Type\n";
|
||||||
|
|
||||||
|
var result = OverrideCsvParser.Parse(csv);
|
||||||
|
|
||||||
|
Assert.Empty(result.Errors);
|
||||||
|
var row = Assert.Single(result.Rows);
|
||||||
|
Assert.Equal("Name", row.AttributeName);
|
||||||
|
Assert.Null(row.Value);
|
||||||
|
Assert.Equal("Type", row.ElementType);
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void Parse_QuotedFieldWithTrailingWhitespaceAfterClose_IsAccepted()
|
||||||
|
{
|
||||||
|
// The closing quote may be followed by ignorable trailing whitespace before
|
||||||
|
// the delimiter; the field value itself is preserved verbatim.
|
||||||
|
const string csv = "AttributeName,Value,ElementType\nName,\"a,b\" ,Type\n";
|
||||||
|
|
||||||
|
var result = OverrideCsvParser.Parse(csv);
|
||||||
|
|
||||||
|
Assert.Empty(result.Errors);
|
||||||
|
var row = Assert.Single(result.Rows);
|
||||||
|
Assert.Equal("a,b", row.Value);
|
||||||
|
Assert.Equal("Type", row.ElementType);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user