fix: read file size after streams are closed in converter
This commit is contained in:
@@ -102,47 +102,49 @@ await Parallel.ForEachAsync(jsonFiles, options, async (jsonFile, cancellationTok
|
||||
dataTable.Columns.Add(colName, colType);
|
||||
}
|
||||
|
||||
// Stream JSON and write to protobuf in batches
|
||||
await using var inputFs = new FileStream(jsonFile, FileMode.Open, FileAccess.Read, FileShare.Read, 256 * 1024, FileOptions.SequentialScan | FileOptions.Asynchronous);
|
||||
await using var decompressStream = new DecompressionStream(inputFs);
|
||||
await using var outputFs = new FileStream(outputFile, FileMode.Create, FileAccess.Write, FileShare.None, 256 * 1024, FileOptions.Asynchronous);
|
||||
await using var compressStream = new CompressionStream(outputFs, level: 10);
|
||||
|
||||
int rowCount = 0;
|
||||
int batchCount = 0;
|
||||
|
||||
// True streaming: DeserializeAsyncEnumerable streams each array element without loading entire JSON
|
||||
var jsonOptions = new JsonSerializerOptions { PropertyNameCaseInsensitive = true };
|
||||
await foreach (var element in JsonSerializer.DeserializeAsyncEnumerable<JsonElement>(
|
||||
decompressStream,
|
||||
jsonOptions,
|
||||
cancellationToken))
|
||||
// Stream JSON and write to protobuf in batches
|
||||
// Use an explicit block scope so the streams are disposed and the output file is closed before its size is read
|
||||
{
|
||||
var row = dataTable.NewRow();
|
||||
ReadJsonElement(element, row, dataTable);
|
||||
dataTable.Rows.Add(row);
|
||||
rowCount++;
|
||||
await using var inputFs = new FileStream(jsonFile, FileMode.Open, FileAccess.Read, FileShare.Read, 256 * 1024, FileOptions.SequentialScan | FileOptions.Asynchronous);
|
||||
await using var decompressStream = new DecompressionStream(inputFs);
|
||||
await using var outputFs = new FileStream(outputFile, FileMode.Create, FileAccess.Write, FileShare.None, 256 * 1024, FileOptions.Asynchronous);
|
||||
await using var compressStream = new CompressionStream(outputFs, level: 10);
|
||||
|
||||
// Write batch when we hit the batch size
|
||||
if (dataTable.Rows.Count >= BatchSize)
|
||||
// True streaming: DeserializeAsyncEnumerable streams each array element without loading entire JSON
|
||||
var jsonOptions = new JsonSerializerOptions { PropertyNameCaseInsensitive = true };
|
||||
await foreach (var element in JsonSerializer.DeserializeAsyncEnumerable<JsonElement>(
|
||||
decompressStream,
|
||||
jsonOptions,
|
||||
cancellationToken))
|
||||
{
|
||||
var row = dataTable.NewRow();
|
||||
ReadJsonElement(element, row, dataTable);
|
||||
dataTable.Rows.Add(row);
|
||||
rowCount++;
|
||||
|
||||
// Write batch when we hit the batch size
|
||||
if (dataTable.Rows.Count >= BatchSize)
|
||||
{
|
||||
using var reader = dataTable.CreateDataReader();
|
||||
DataSerializer.Serialize(compressStream, reader);
|
||||
dataTable.Clear();
|
||||
batchCount++;
|
||||
}
|
||||
}
|
||||
|
||||
// Write remaining rows
|
||||
if (dataTable.Rows.Count > 0)
|
||||
{
|
||||
using var reader = dataTable.CreateDataReader();
|
||||
DataSerializer.Serialize(compressStream, reader);
|
||||
dataTable.Clear();
|
||||
batchCount++;
|
||||
}
|
||||
}
|
||||
|
||||
// Write remaining rows
|
||||
if (dataTable.Rows.Count > 0)
|
||||
{
|
||||
using var reader = dataTable.CreateDataReader();
|
||||
DataSerializer.Serialize(compressStream, reader);
|
||||
batchCount++;
|
||||
}
|
||||
|
||||
await compressStream.FlushAsync(cancellationToken);
|
||||
} // Streams closed here
|
||||
|
||||
// Read file size after streams are fully closed
|
||||
var newSize = new FileInfo(outputFile).Length;
|
||||
Interlocked.Add(ref totalNewSize, newSize);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user