Обнаружил для себя, что распарсить многострочный CSV в PHP не так просто, как казалось. Встроенные функции вроде
str_getcsv и прочих работают только с одной строкой, но у нас их, как правило, много. Варианты вроде разбиения текста через
explode или чтения файла через
file не учитывают того, что внутри поля CSV, заключённого в кавычки, вполне может быть перевод строки, например:
"str1
str2"
Воспользовавшись комментарием
https://www.php.net/manual/ru/function.str-getcsv.php#113220, соорудил простой класс для парсинга многострочного CSV:
<?php
declare(strict_types=1);
// https://www.php.net/manual/ru/function.str-getcsv.php#113220
/**
 * Parser for multi-line CSV text held in a single string.
 *
 * Unlike a naive "explode on LF, then on comma" approach, fields wrapped in double quotes may
 * contain commas, line breaks and doubled quotes (""), all of which are preserved as data.
 *
 * Strategy: every CSV-special character found inside a quoted section is first replaced with a
 * two-byte escape sequence that starts with CR. The text is then split on LF and comma, and
 * finally the escape sequences are turned back into the original characters.
 */
final readonly class CsvParser
{
    /**
     * Parse CSV text into a two-dimensional array of rows and fields.
     *
     * Line endings outside quotes (CRLF, CR, LF) are all treated as row separators. One trailing
     * line break at the very end of the input is ignored; any additional trailing line breaks
     * produce rows with a single empty field. An empty input string yields no rows.
     *
     * @param string $str complete CSV text
     *
     * @return array<int, string[]> list of rows, each row being a list of field values
     *
     * @throws \RuntimeException when the PCRE engine fails while scanning the input
     */
    public function parse(string $str): array
    {
        // An empty document has no rows; without this guard explode() would yield one phantom
        // row containing a single empty field.
        if ($str === '') {
            return [];
        }

        // Match all the non-quoted text followed by one quoted section (or the end of the string).
        // The callback receives the non-quoted text in $matches[1] and the text between the
        // quotes in $matches[3]; the surrounding quotes themselves are dropped.
        $escaped = \preg_replace_callback('/([^"]*)("((""|[^"])*)"|$)/', $this->parseQuotes(...), $str);
        if ($escaped === null) {
            // PCRE can give up on very large quoted fields (backtrack / JIT stack limits).
            // Report that explicitly instead of passing null further down.
            throw new \RuntimeException('Unable to parse CSV: ' . \preg_last_error_msg());
        }

        // Drop exactly one trailing LF so the final line terminator does not create an extra row.
        // A regex such as /\n$/ is not used here: "$" also matches just before a final newline,
        // so a global replace would strip two trailing newlines and lose a blank last row.
        if (\str_ends_with($escaped, "\n")) {
            $escaped = \substr($escaped, 0, -1);
        }

        // Every remaining LF is a real row separator.
        return \array_map($this->parseLine(...), \explode("\n", $escaped));
    }

    /**
     * Escape the CSV-special characters of one quoted section and normalise the unquoted text
     * that precedes it.
     *
     * @param array<int, string> $matches regex groups: [1] unquoted text, [3] quoted content
     *                                    (absent when the match ended at the end of the string)
     *
     * @return string unquoted text with LF-only line endings, followed by the escaped quoted content
     */
    private function parseQuotes(array $matches): string
    {
        // Anything inside the quotes that could later be mistaken for a row or field separator
        // must be masked. CR is used as the escape prefix because, after the normalisation below,
        // it never occurs in unquoted text. "\r" must be replaced first so that the CRs introduced
        // by the other replacements are not escaped a second time.
        $quoted = \str_replace(
            ["\r", "\n", '""', ','],
            ["\rR", "\rN", "\rQ", "\rC"],
            $matches[3] ?? '',
        );

        // Unquoted text is where the real splits happen. Collapse CRLF and bare CR to LF so that
        // the only CR bytes left in the whole string belong to escape sequences.
        $unquoted = \str_replace(["\r\n", "\r"], "\n", $matches[1]);

        return $unquoted . $quoted;
    }

    /**
     * Split one escaped line on commas and restore each resulting field.
     *
     * @return string[] field values of the row
     */
    private function parseLine(string $line): array
    {
        return \array_map($this->parseField(...), \explode(',', $line));
    }

    /**
     * Turn the CR-prefixed escape sequences of a field back into the characters they stand for.
     */
    private function parseField(string $field): string
    {
        // "\rR" is restored last: it is the only replacement that yields a CR, and doing it
        // earlier could create a bogus escape sequence together with the following character.
        return \str_replace(["\rC", "\rQ", "\rN", "\rR"], [',', '"', "\n", "\r"], $field);
    }
}