From 7517cdd008c5fa874ac18180f5835026b6368e2b Mon Sep 17 00:00:00 2001 From: oleibman Date: Sun, 17 May 2020 02:15:18 -0700 Subject: [PATCH] Improve Coverage for CSV (#1475) I believe that both CSV Reader and Writer are 100% covered now. There were some errors uncovered during development. The reader specifically permits encodings other than UTF-8 to be used. However, fgetcsv will not properly handle other encodings. I tried replacing it with fgets/iconv/strgetcsv, but that could not handle line breaks within a cell, even for UTF-8. This is, I'm sure, a very rare use case. I eventually handled it by using php://memory to hold the translated file contents for non-UTF8. There were no tests for this situation, and now there are (probably too many). "Contiguous" read was not handle correctly. There is a file in samples which uses it. It was designed to read a large sheet, and split it into three. The first sheet was corrrect, but the second and third were almost entirely empty. This has been corrected, and the sample code was adapted into a formal test with assertions to confirm that it works as designed. I made a minor documentation change. Unlike HTML, where you never need a BOM because you can declare the encoding in the file, a CSV with non-ASCII characters must explicitly include a BOM for Excel to handle it correctly. This was explained in the Reading CSV section, but was glossed over in the Writing CSV section, which I have updated. --- docs/topics/reading-and-writing-to-file.md | 8 +- .../Basic/13_CalculationCyclicFormulae.php | 2 +- src/PhpSpreadsheet/Reader/Csv.php | 84 +++++-------- src/PhpSpreadsheet/Writer/Csv.php | 7 +- .../Reader/CsvContiguousFilter.php | 57 +++++++++ .../Reader/CsvContiguousTest.php | 81 +++++++++++++ tests/PhpSpreadsheetTests/Reader/CsvTest.php | 110 ++++++++++++++++++ .../Writer/Csv/CsvWriteTest.php | 60 ++++++++++ tests/data/Reader/CSV/encoding.iso88591.csv | 2 + tests/data/Reader/CSV/encoding.utf16be.csv | Bin 0 -> 20 bytes tests/data/Reader/CSV/encoding.utf16le.csv | Bin 0 -> 20 bytes tests/data/Reader/CSV/encoding.utf32be.csv | Bin 0 -> 40 bytes tests/data/Reader/CSV/encoding.utf32le.csv | Bin 0 -> 40 bytes tests/data/Reader/CSV/encoding.utf8.csv | 2 + tests/data/Reader/CSV/encoding.utf8bom.csv | 2 + tests/data/Reader/CSV/sep.csv | 3 + .../CSV/utf16be.line_break_in_enclosure.csv | Bin 0 -> 818 bytes 17 files changed, 358 insertions(+), 60 deletions(-) create mode 100644 tests/PhpSpreadsheetTests/Reader/CsvContiguousFilter.php create mode 100644 tests/PhpSpreadsheetTests/Reader/CsvContiguousTest.php create mode 100644 tests/PhpSpreadsheetTests/Writer/Csv/CsvWriteTest.php create mode 100644 tests/data/Reader/CSV/encoding.iso88591.csv create mode 100644 tests/data/Reader/CSV/encoding.utf16be.csv create mode 100644 tests/data/Reader/CSV/encoding.utf16le.csv create mode 100644 tests/data/Reader/CSV/encoding.utf32be.csv create mode 100644 tests/data/Reader/CSV/encoding.utf32le.csv create mode 100644 tests/data/Reader/CSV/encoding.utf8.csv create mode 100644 tests/data/Reader/CSV/encoding.utf8bom.csv create mode 100644 tests/data/Reader/CSV/sep.csv create mode 100644 tests/data/Reader/CSV/utf16be.line_break_in_enclosure.csv diff --git a/docs/topics/reading-and-writing-to-file.md b/docs/topics/reading-and-writing-to-file.md index 3b6a037c..c0dc7c03 100644 --- a/docs/topics/reading-and-writing-to-file.md +++ b/docs/topics/reading-and-writing-to-file.md @@ -535,8 +535,12 @@ $writer->save("05featuredemo.csv"); #### Writing UTF-8 CSV files -A CSV file can be marked as UTF-8 by writing a BOM file header. This can -be enabled by using the following code: +CSV files are written in UTF-8. If they do not contain characters +outside the ASCII range, nothing else need be done. +However, if such characters are in the file, +it should explicitly include a BOM file header; +if it doesn't, Excel will not interpret those characters correctly. +This can be enabled by using the following code: ``` php $writer = new \PhpOffice\PhpSpreadsheet\Writer\Csv($spreadsheet); diff --git a/samples/Basic/13_CalculationCyclicFormulae.php b/samples/Basic/13_CalculationCyclicFormulae.php index 26e9784d..a446e56e 100644 --- a/samples/Basic/13_CalculationCyclicFormulae.php +++ b/samples/Basic/13_CalculationCyclicFormulae.php @@ -16,7 +16,7 @@ $spreadsheet->getActiveSheet()->setCellValue('A1', '=B1') ->setCellValue('B1', '=A1+1') ->setCellValue('B2', '=A2'); -Calculation::getInstance($spreadsheet)->cyclicFormulaCount = 100; +Calculation::getInstance($spreadsheet)->cyclicFormulaCount = 15; // Calculated data $helper->log('Calculated data'); diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php index 4d104595..2e485109 100644 --- a/src/PhpSpreadsheet/Reader/Csv.php +++ b/src/PhpSpreadsheet/Reader/Csv.php @@ -43,13 +43,6 @@ class Csv extends BaseReader */ private $contiguous = false; - /** - * Row counter for loading rows contiguously. - * - * @var int - */ - private $contiguousRow = -1; - /** * The character that can escape the enclosure. * @@ -101,28 +94,6 @@ class Csv extends BaseReader fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ? fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0); - break; - case 'UTF-16LE': - fgets($this->fileHandle, 3) == "\xFF\xFE" ? - fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0); - - break; - case 'UTF-16BE': - fgets($this->fileHandle, 3) == "\xFE\xFF" ? - fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0); - - break; - case 'UTF-32LE': - fgets($this->fileHandle, 5) == "\xFF\xFE\x00\x00" ? - fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0); - - break; - case 'UTF-32BE': - fgets($this->fileHandle, 5) == "\x00\x00\xFE\xFF" ? - fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0); - - break; - default: break; } } @@ -275,10 +246,7 @@ class Csv extends BaseReader public function listWorksheetInfo($pFilename) { // Open file - if (!$this->canRead($pFilename)) { - throw new Exception($pFilename . ' is an Invalid Spreadsheet file.'); - } - $this->openFile($pFilename); + $this->openFileOrMemory($pFilename); $fileHandle = $this->fileHandle; // Skip BOM, if any @@ -324,6 +292,24 @@ class Csv extends BaseReader return $this->loadIntoExisting($pFilename, $spreadsheet); } + private function openFileOrMemory($pFilename) + { + // Open file + $fhandle = $this->canRead($pFilename); + if (!$fhandle) { + throw new Exception($pFilename . ' is an Invalid Spreadsheet file.'); + } + $this->openFile($pFilename); + if ($this->inputEncoding !== 'UTF-8') { + fclose($this->fileHandle); + $entireFile = file_get_contents($pFilename); + $this->fileHandle = fopen('php://memory', 'r+'); + $data = StringHelper::convertEncoding($entireFile, 'UTF-8', $this->inputEncoding); + fwrite($this->fileHandle, $data); + rewind($this->fileHandle); + } + } + /** * Loads PhpSpreadsheet from file into PhpSpreadsheet instance. * @@ -338,10 +324,7 @@ class Csv extends BaseReader ini_set('auto_detect_line_endings', true); // Open file - if (!$this->canRead($pFilename)) { - throw new Exception($pFilename . ' is an Invalid Spreadsheet file.'); - } - $this->openFile($pFilename); + $this->openFileOrMemory($pFilename); $fileHandle = $this->fileHandle; // Skip BOM, if any @@ -357,22 +340,24 @@ class Csv extends BaseReader // Set our starting row based on whether we're in contiguous mode or not $currentRow = 1; - if ($this->contiguous) { - $currentRow = ($this->contiguousRow == -1) ? $sheet->getHighestRow() : $this->contiguousRow; - } + $outRow = 0; // Loop through each line of the file in turn while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure, $this->escapeCharacter)) !== false) { + $noOutputYet = true; $columnLetter = 'A'; foreach ($rowData as $rowDatum) { if ($rowDatum != '' && $this->readFilter->readCell($columnLetter, $currentRow)) { - // Convert encoding if necessary - if ($this->inputEncoding !== 'UTF-8') { - $rowDatum = StringHelper::convertEncoding($rowDatum, 'UTF-8', $this->inputEncoding); + if ($this->contiguous) { + if ($noOutputYet) { + $noOutputYet = false; + ++$outRow; + } + } else { + $outRow = $currentRow; } - // Set cell value - $sheet->getCell($columnLetter . $currentRow)->setValue($rowDatum); + $sheet->getCell($columnLetter . $outRow)->setValue($rowDatum); } ++$columnLetter; } @@ -382,10 +367,6 @@ class Csv extends BaseReader // Close file fclose($fileHandle); - if ($this->contiguous) { - $this->contiguousRow = $currentRow; - } - ini_set('auto_detect_line_endings', $lineEnding); // Return @@ -477,9 +458,6 @@ class Csv extends BaseReader public function setContiguous($contiguous) { $this->contiguous = (bool) $contiguous; - if (!$contiguous) { - $this->contiguousRow = -1; - } return $this; } @@ -530,7 +508,7 @@ class Csv extends BaseReader // Check if file exists try { $this->openFile($pFilename); - } catch (Exception $e) { + } catch (\InvalidArgumentException $e) { return false; } diff --git a/src/PhpSpreadsheet/Writer/Csv.php b/src/PhpSpreadsheet/Writer/Csv.php index 142f15b8..cabfe450 100644 --- a/src/PhpSpreadsheet/Writer/Csv.php +++ b/src/PhpSpreadsheet/Writer/Csv.php @@ -93,6 +93,8 @@ class Csv extends BaseWriter // Open file if (is_resource($pFilename)) { $fileHandle = $pFilename; + } elseif (!$pFilename) { + $fileHandle = false; } else { $fileHandle = fopen($pFilename, 'wb+'); } @@ -176,10 +178,7 @@ class Csv extends BaseWriter */ public function setEnclosure($pValue) { - if ($pValue == '') { - $pValue = null; - } - $this->enclosure = $pValue; + $this->enclosure = $pValue ? $pValue : '"'; return $this; } diff --git a/tests/PhpSpreadsheetTests/Reader/CsvContiguousFilter.php b/tests/PhpSpreadsheetTests/Reader/CsvContiguousFilter.php new file mode 100644 index 00000000..95f7e787 --- /dev/null +++ b/tests/PhpSpreadsheetTests/Reader/CsvContiguousFilter.php @@ -0,0 +1,57 @@ +startRow = $startRow; + $this->endRow = $startRow + $chunkSize; + } + + public function setFilterType($type) + { + $this->filterType = $type; + } + + public function filter1($row) + { + // Include rows 1-10, followed by 100-110, etc. + return $row % 100 <= 10; + } + + public function filter0($row) + { + // Only read the heading row, and the rows that are configured in $this->_startRow and $this->_endRow + if (($row == 1) || ($row >= $this->startRow && $row < $this->endRow)) { + return true; + } + + return false; + } + + public function readCell($column, $row, $worksheetName = '') + { + if ($this->filterType == 1) { + return $this->filter1($row); + } + + return $this->filter0($row); + } +} diff --git a/tests/PhpSpreadsheetTests/Reader/CsvContiguousTest.php b/tests/PhpSpreadsheetTests/Reader/CsvContiguousTest.php new file mode 100644 index 00000000..4fe2006d --- /dev/null +++ b/tests/PhpSpreadsheetTests/Reader/CsvContiguousTest.php @@ -0,0 +1,81 @@ +getContiguous()); + $reader->setReadFilter($chunkFilter) + ->setContiguous(true); + + // Instantiate a new PhpSpreadsheet object manually + $spreadsheet = new Spreadsheet(); + + // Set a sheet index + $sheet = 0; + // Loop to read our worksheet in "chunk size" blocks + /** $startRow is set to 2 initially because we always read the headings in row #1 * */ + for ($startRow = 2; $startRow <= 240; $startRow += $chunkSize) { + // Tell the Read Filter, the limits on which rows we want to read this iteration + $chunkFilter->setRows($startRow, $chunkSize); + + // Increment the worksheet index pointer for the Reader + $reader->setSheetIndex($sheet); + // Load only the rows that match our filter into a new worksheet in the PhpSpreadsheet Object + $reader->loadIntoExisting($this->inputFileName, $spreadsheet); + // Set the worksheet title (to reference the "sheet" of data that we've loaded) + // and increment the sheet index as well + $spreadsheet->getActiveSheet()->setTitle('Country Data #' . (++$sheet)); + } + + $sheet = $spreadsheet->getSheetByName('Country Data #1'); + self::assertEquals('Kabul', $sheet->getCell('A2')->getValue()); + $sheet = $spreadsheet->getSheetByName('Country Data #2'); + self::assertEquals('Lesotho', $sheet->getCell('B4')->getValue()); + $sheet = $spreadsheet->getSheetByName('Country Data #3'); + self::assertEquals(-20.1, $sheet->getCell('C6')->getValue()); + } + + public function testContiguous2() + { + // Create a new Reader of the type defined in $inputFileType + $reader = new Csv(); + + // Create a new Instance of our Read Filter + $chunkFilter = new CsvContiguousFilter(); + $chunkFilter->setFilterType(1); + + // Tell the Reader that we want to use the Read Filter that we've Instantiated + // and that we want to store it in contiguous rows/columns + $reader->setReadFilter($chunkFilter) + ->setContiguous(true); + + // Instantiate a new PhpSpreadsheet object manually + $spreadsheet = new Spreadsheet(); + + // Loop to read our worksheet in "chunk size" blocks + $reader->loadIntoExisting($this->inputFileName, $spreadsheet); + + $sheet = $spreadsheet->getActiveSheet(); + self::assertEquals('Kabul', $sheet->getCell('A2')->getValue()); + self::assertEquals('Kuwait', $sheet->getCell('B11')->getValue()); + } +} diff --git a/tests/PhpSpreadsheetTests/Reader/CsvTest.php b/tests/PhpSpreadsheetTests/Reader/CsvTest.php index be08f6a6..bb593ce8 100644 --- a/tests/PhpSpreadsheetTests/Reader/CsvTest.php +++ b/tests/PhpSpreadsheetTests/Reader/CsvTest.php @@ -3,6 +3,7 @@ namespace PhpOffice\PhpSpreadsheetTests\Reader; use PhpOffice\PhpSpreadsheet\Reader\Csv; +use PhpOffice\PhpSpreadsheet\Reader\Exception as ReaderException; use PHPUnit\Framework\TestCase; class CsvTest extends TestCase @@ -130,4 +131,113 @@ class CsvTest extends TestCase $this->assertSame('"', $reader->getEscapeCharacter()); $this->assertSame($expected, $worksheet->toArray()); } + + /** + * @dataProvider providerEncodings + * + * @param string $filename + * @param string $encoding + */ + public function testEncodings($filename, $encoding) + { + $reader = new Csv(); + $reader->setInputEncoding($encoding); + $spreadsheet = $reader->load($filename); + $sheet = $spreadsheet->getActiveSheet(); + self::assertEquals('Ã…', $sheet->getCell('A1')->getValue()); + } + + public function testInvalidWorkSheetInfo() + { + $this->expectException(ReaderException::class); + $reader = new Csv(); + $reader->listWorksheetInfo(''); + } + + /** + * @dataProvider providerEncodings + * + * @param string $filename + * @param string $encoding + */ + public function testWorkSheetInfo($filename, $encoding) + { + $reader = new Csv(); + $reader->setInputEncoding($encoding); + $info = $reader->listWorksheetInfo($filename); + self::assertEquals('Worksheet', $info[0]['worksheetName']); + self::assertEquals('B', $info[0]['lastColumnLetter']); + self::assertEquals(1, $info[0]['lastColumnIndex']); + self::assertEquals(2, $info[0]['totalRows']); + self::assertEquals(2, $info[0]['totalColumns']); + } + + public function providerEncodings() + { + return [ + ['data/Reader/CSV/encoding.iso88591.csv', 'ISO-8859-1'], + ['data/Reader/CSV/encoding.utf8.csv', 'UTF-8'], + ['data/Reader/CSV/encoding.utf8bom.csv', 'UTF-8'], + ['data/Reader/CSV/encoding.utf16be.csv', 'UTF-16BE'], + ['data/Reader/CSV/encoding.utf16le.csv', 'UTF-16LE'], + ['data/Reader/CSV/encoding.utf32be.csv', 'UTF-32BE'], + ['data/Reader/CSV/encoding.utf32le.csv', 'UTF-32LE'], + ]; + } + + public function testUtf16LineBreak() + { + $reader = new Csv(); + $reader->setInputEncoding('UTF-16BE'); + $spreadsheet = $reader->load('data/Reader/CSV/utf16be.line_break_in_enclosure.csv'); + $sheet = $spreadsheet->getActiveSheet(); + $expected = <<getCell('B3')->getValue()); + } + + public function testSeparatorLine() + { + $reader = new Csv(); + $reader->setSheetIndex(3); + $spreadsheet = $reader->load('data/Reader/CSV/sep.csv'); + self::assertEquals(';', $reader->getDelimiter()); + $sheet = $spreadsheet->getActiveSheet(); + self::assertEquals(3, $reader->getSheetIndex()); + self::assertEquals(3, $spreadsheet->getActiveSheetIndex()); + self::assertEquals('A', $sheet->getCell('A1')->getValue()); + self::assertEquals(1, $sheet->getCell('B1')->getValue()); + self::assertEquals(2, $sheet->getCell('A2')->getValue()); + self::assertEquals(3, $sheet->getCell('B2')->getValue()); + } + + public function testDefaultSettings() + { + $reader = new Csv(); + self::assertEquals('UTF-8', $reader->getInputEncoding()); + self::assertEquals('"', $reader->getEnclosure()); + $reader->setEnclosure('\''); + self::assertEquals('\'', $reader->getEnclosure()); + $reader->setEnclosure(''); + self::assertEquals('"', $reader->getEnclosure()); + } + + public function testReadEmptyFileName() + { + $this->expectException(ReaderException::class); + $reader = new Csv(); + $filename = ''; + $reader->load($filename); + } + + public function testReadNonexistentFileName() + { + $this->expectException(ReaderException::class); + $reader = new Csv(); + $reader->load('data/Reader/CSV/encoding.utf8.csvxxx'); + } } diff --git a/tests/PhpSpreadsheetTests/Writer/Csv/CsvWriteTest.php b/tests/PhpSpreadsheetTests/Writer/Csv/CsvWriteTest.php new file mode 100644 index 00000000..f55781da --- /dev/null +++ b/tests/PhpSpreadsheetTests/Writer/Csv/CsvWriteTest.php @@ -0,0 +1,60 @@ +getActiveSheet(); + $sheet->setCellValue('A1', 'First Sheet'); + $sheet = $spreadsheet->createSheet(); + $sheet->setCellValue('A1', 'Second Sheet'); + $sheet = $spreadsheet->createSheet(); + $sheet->setCellValue('A1', 'Third Sheet'); + $writer = new CsvWriter($spreadsheet); + $writer->setSheetIndex(1); + self::assertEquals(1, $writer->getSheetIndex()); + $filename = tempnam(File::sysGetTempDir(), 'phpspreadsheet-test'); + $writer->save($filename); + $reader = new CsvReader(); + $newspreadsheet = $reader->load($filename); + unlink($filename); + $sheet = $newspreadsheet->getActiveSheet(); + self::assertEquals('Second Sheet', $sheet->getCell('A1')->getValue()); + self::assertEquals(0, $newspreadsheet->getActiveSheetIndex()); + } + + public function testWriteEmptyFileName() + { + $this->expectException(WriterException::class); + $spreadsheet = new Spreadsheet(); + $writer = new CsvWriter($spreadsheet); + $filename = ''; + $writer->save($filename); + } + + public function testDefaultSettings() + { + $spreadsheet = new Spreadsheet(); + $writer = new CsvWriter($spreadsheet); + self::assertEquals('"', $writer->getEnclosure()); + $writer->setEnclosure('\''); + self::assertEquals('\'', $writer->getEnclosure()); + $writer->setEnclosure(''); + self::assertEquals('"', $writer->getEnclosure()); + self::assertEquals(PHP_EOL, $writer->getLineEnding()); + self::assertFalse($writer->getUseBOM()); + self::assertFalse($writer->getIncludeSeparatorLine()); + self::assertFalse($writer->getExcelCompatibility()); + self::assertEquals(0, $writer->getSheetIndex()); + } +} diff --git a/tests/data/Reader/CSV/encoding.iso88591.csv b/tests/data/Reader/CSV/encoding.iso88591.csv new file mode 100644 index 00000000..84ecfd91 --- /dev/null +++ b/tests/data/Reader/CSV/encoding.iso88591.csv @@ -0,0 +1,2 @@ +Å,1 +2,3 diff --git a/tests/data/Reader/CSV/encoding.utf16be.csv b/tests/data/Reader/CSV/encoding.utf16be.csv new file mode 100644 index 0000000000000000000000000000000000000000..8c9665ea01b13e99c5db4b27769138b2405fc7ab GIT binary patch literal 20 XcmZQL%Amtw$iU0M#b5+v8G~2=8M^`E literal 0 HcmV?d00001 diff --git a/tests/data/Reader/CSV/encoding.utf16le.csv b/tests/data/Reader/CSV/encoding.utf16le.csv new file mode 100644 index 0000000000000000000000000000000000000000..1299a8b9ae0a7fce544eeb57935b7c3081113483 GIT binary patch literal 20 WcmX@gpu=Fuz{|kJU<70tgINF@oB`qh literal 0 HcmV?d00001 diff --git a/tests/data/Reader/CSV/encoding.utf32be.csv b/tests/data/Reader/CSV/encoding.utf32be.csv new file mode 100644 index 0000000000000000000000000000000000000000..eaf28f699809360d614ebec0acfafd69c0cb4be8 GIT binary patch literal 40 hcmZQzU^vRaz@P)fhCs{<#9TmZ1jHaQV;}~L0RS{Xuyn0sB*ok!*n7KRm&Vyy1FE#{f z3UXF7v^+$6wyc>lVJFiO&CSFm7hl=ngD4Js()Q+~nv6