Code Coverage for Shared\CodePage (#1491)

While investigating something else in Shared, I noticed that CodePage
had poor test coverage and a high complexity rating. This change
addresses both; Scrutinizer would love it, although its interface on
GitHub seems broken at the moment (all PRs show "Waiting for External
Code Coverage").
This commit is contained in:
oleibman 2020-05-24 03:51:28 -07:00 committed by GitHub
parent 8ca7bfe53c
commit 84e03da5c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 146 additions and 119 deletions

View File

@ -6,6 +6,65 @@ use PhpOffice\PhpSpreadsheet\Exception as PhpSpreadsheetException;
class CodePage
{
/**
 * Microsoft code page identifiers mapped to the encoding names returned by numberToName().
 *
 * The commented-out entries (720 and 32769) are known code pages that have no
 * supported encoding name; numberToName() reports them as "not supported".
 *
 * @var array<int, string>
 */
private static $pageArray = [
0 => 'CP1252', // CodePage is not always correctly set when the xls file was saved by Apple's Numbers program
367 => 'ASCII', // ASCII
437 => 'CP437', // OEM US
//720 => 'notsupported', // OEM Arabic
737 => 'CP737', // OEM Greek
775 => 'CP775', // OEM Baltic
850 => 'CP850', // OEM Latin I
852 => 'CP852', // OEM Latin II (Central European)
855 => 'CP855', // OEM Cyrillic
857 => 'CP857', // OEM Turkish
858 => 'CP858', // OEM Multilingual Latin I with Euro
860 => 'CP860', // OEM Portuguese
861 => 'CP861', // OEM Icelandic
862 => 'CP862', // OEM Hebrew
863 => 'CP863', // OEM Canadian (French)
864 => 'CP864', // OEM Arabic
865 => 'CP865', // OEM Nordic
866 => 'CP866', // OEM Cyrillic (Russian)
869 => 'CP869', // OEM Greek (Modern)
874 => 'CP874', // ANSI Thai
932 => 'CP932', // ANSI Japanese Shift-JIS
936 => 'CP936', // ANSI Chinese Simplified GBK
949 => 'CP949', // ANSI Korean (Wansung)
950 => 'CP950', // ANSI Chinese Traditional BIG5
1200 => 'UTF-16LE', // UTF-16 (BIFF8)
1250 => 'CP1250', // ANSI Latin II (Central European)
1251 => 'CP1251', // ANSI Cyrillic
1252 => 'CP1252', // ANSI Latin I (BIFF4-BIFF7)
1253 => 'CP1253', // ANSI Greek
1254 => 'CP1254', // ANSI Turkish
1255 => 'CP1255', // ANSI Hebrew
1256 => 'CP1256', // ANSI Arabic
1257 => 'CP1257', // ANSI Baltic
1258 => 'CP1258', // ANSI Vietnamese
1361 => 'CP1361', // ANSI Korean (Johab)
10000 => 'MAC', // Apple Roman
10001 => 'CP932', // Macintosh Japanese
10002 => 'CP950', // Macintosh Chinese Traditional
10003 => 'CP1361', // Macintosh Korean
10004 => 'MACARABIC', // Apple Arabic
10005 => 'MACHEBREW', // Apple Hebrew
10006 => 'MACGREEK', // Macintosh Greek
10007 => 'MACCYRILLIC', // Macintosh Cyrillic
10008 => 'CP936', // Macintosh - Simplified Chinese (GB 2312)
10010 => 'MACROMANIA', // Macintosh Romania
10017 => 'MACUKRAINE', // Macintosh Ukraine
10021 => 'MACTHAI', // Macintosh Thai
10029 => 'MACCENTRALEUROPE', // Macintosh Central Europe
10079 => 'MACICELAND', // Macintosh Icelandic
10081 => 'MACTURKISH', // Macintosh Turkish
10082 => 'MACCROATIAN', // Macintosh Croatian
21010 => 'UTF-16LE', // UTF-16 (BIFF8) This isn't correct, but some Excel writer libraries erroneously use Codepage 21010 for UTF-16LE
32768 => 'MAC', // Apple Roman
//32769 => 'unsupported', // ANSI Latin I (BIFF2-BIFF3)
65000 => 'UTF-7', // Unicode (UTF-7)
65001 => 'UTF-8', // Unicode (UTF-8)
];
/**
* Convert Microsoft Code Page Identifier to Code Page Name which iconv
* and mbstring understands.
@ -14,123 +73,20 @@ class CodePage
*
* @return string Code Page Name
*/
public static function numberToName($codePage)
public static function numberToName(int $codePage): string
{
switch ($codePage) {
case 367:
return 'ASCII'; // ASCII
case 437:
return 'CP437'; // OEM US
case 720:
throw new PhpSpreadsheetException('Code page 720 not supported.'); // OEM Arabic
case 737:
return 'CP737'; // OEM Greek
case 775:
return 'CP775'; // OEM Baltic
case 850:
return 'CP850'; // OEM Latin I
case 852:
return 'CP852'; // OEM Latin II (Central European)
case 855:
return 'CP855'; // OEM Cyrillic
case 857:
return 'CP857'; // OEM Turkish
case 858:
return 'CP858'; // OEM Multilingual Latin I with Euro
case 860:
return 'CP860'; // OEM Portuguese
case 861:
return 'CP861'; // OEM Icelandic
case 862:
return 'CP862'; // OEM Hebrew
case 863:
return 'CP863'; // OEM Canadian (French)
case 864:
return 'CP864'; // OEM Arabic
case 865:
return 'CP865'; // OEM Nordic
case 866:
return 'CP866'; // OEM Cyrillic (Russian)
case 869:
return 'CP869'; // OEM Greek (Modern)
case 874:
return 'CP874'; // ANSI Thai
case 932:
return 'CP932'; // ANSI Japanese Shift-JIS
case 936:
return 'CP936'; // ANSI Chinese Simplified GBK
case 949:
return 'CP949'; // ANSI Korean (Wansung)
case 950:
return 'CP950'; // ANSI Chinese Traditional BIG5
case 1200:
return 'UTF-16LE'; // UTF-16 (BIFF8)
case 1250:
return 'CP1250'; // ANSI Latin II (Central European)
case 1251:
return 'CP1251'; // ANSI Cyrillic
case 0:
// CodePage is not always correctly set when the xls file was saved by Apple's Numbers program
case 1252:
return 'CP1252'; // ANSI Latin I (BIFF4-BIFF7)
case 1253:
return 'CP1253'; // ANSI Greek
case 1254:
return 'CP1254'; // ANSI Turkish
case 1255:
return 'CP1255'; // ANSI Hebrew
case 1256:
return 'CP1256'; // ANSI Arabic
case 1257:
return 'CP1257'; // ANSI Baltic
case 1258:
return 'CP1258'; // ANSI Vietnamese
case 1361:
return 'CP1361'; // ANSI Korean (Johab)
case 10000:
return 'MAC'; // Apple Roman
case 10001:
return 'CP932'; // Macintosh Japanese
case 10002:
return 'CP950'; // Macintosh Chinese Traditional
case 10003:
return 'CP1361'; // Macintosh Korean
case 10004:
return 'MACARABIC'; // Apple Arabic
case 10005:
return 'MACHEBREW'; // Apple Hebrew
case 10006:
return 'MACGREEK'; // Macintosh Greek
case 10007:
return 'MACCYRILLIC'; // Macintosh Cyrillic
case 10008:
return 'CP936'; // Macintosh - Simplified Chinese (GB 2312)
case 10010:
return 'MACROMANIA'; // Macintosh Romania
case 10017:
return 'MACUKRAINE'; // Macintosh Ukraine
case 10021:
return 'MACTHAI'; // Macintosh Thai
case 10029:
return 'MACCENTRALEUROPE'; // Macintosh Central Europe
case 10079:
return 'MACICELAND'; // Macintosh Icelandic
case 10081:
return 'MACTURKISH'; // Macintosh Turkish
case 10082:
return 'MACCROATIAN'; // Macintosh Croatian
case 21010:
return 'UTF-16LE'; // UTF-16 (BIFF8) This isn't correct, but some Excel writer libraries erroneously use Codepage 21010 for UTF-16LE
case 32768:
return 'MAC'; // Apple Roman
case 32769:
throw new PhpSpreadsheetException('Code page 32769 not supported.'); // ANSI Latin I (BIFF2-BIFF3)
case 65000:
return 'UTF-7'; // Unicode (UTF-7)
case 65001:
return 'UTF-8'; // Unicode (UTF-8)
// Supported code pages are resolved through the static lookup table.
if (array_key_exists($codePage, self::$pageArray)) {
return self::$pageArray[$codePage];
}
// Known code pages that have no supported encoding name get a dedicated message.
// NOTE(review): $codePage is declared int in the new signature, so === would behave the same here.
if ($codePage == 720 || $codePage == 32769) {
throw new PhpSpreadsheetException("Code page $codePage not supported."); // 720: OEM Arabic; 32769: ANSI Latin I (BIFF2-BIFF3)
}
// Any other value is not a recognised code page identifier.
throw new PhpSpreadsheetException('Unknown codepage: ' . $codePage);
}
/**
 * Return the complete table of supported code pages.
 *
 * @return array<int, string> Code page identifier => code page name
 */
public static function getEncodings(): array
{
return self::$pageArray;
}
}

View File

@ -24,6 +24,22 @@ class CodePageTest extends TestCase
return require 'tests/data/Shared/CodePage.php';
}
/**
 * Verify that every code page known to CodePage has a row in the data provider.
 */
public function testCoverage(): void
{
    // Start with every supported code page flagged as not yet tested.
    $seen = [];
    foreach (array_keys(CodePage::getEncodings()) as $codePage) {
        $seen[$codePage] = 0;
    }

    // The second element of each provider row is the code page number it exercises.
    foreach ($this->providerCodePage() as $row) {
        $seen[$row[1]] = 1;
    }

    // Any flag still at 0 means that code page has no provider row.
    foreach ($seen as $key => $flag) {
        self::assertEquals(1, $flag, "Codepage $key not tested");
    }
}
public function testNumberToNameWithInvalidCodePage(): void
{
$invalidCodePage = 12345;

View File

@ -1,6 +1,11 @@
<?php
return [
// ANSI Latin I (BIFF4-BIFF7)
[
'CP1252',
0,
],
// ASCII
[
'ASCII',
@ -127,11 +132,6 @@ return [
1251,
],
// ANSI Latin I (BIFF4-BIFF7)
[
'CP1252',
0,
],
// ANSI Latin I (BIFF4-BIFF7)
[
'CP1252',
1252,
@ -176,6 +176,31 @@ return [
'MAC',
10000,
],
// Macintosh Japanese
[
'CP932',
10001,
],
// Macintosh Chinese Traditional
[
'CP950',
10002,
],
// Macintosh Korean
[
'CP1361',
10003,
],
// Apple Arabic
[
'MACARABIC',
10004,
],
// Apple Hebrew
[
'MACHEBREW',
10005,
],
// Macintosh Greek
[
'MACGREEK',
@ -186,6 +211,26 @@ return [
'MACCYRILLIC',
10007,
],
// Macintosh - Simplified Chinese (GB 2312)
[
'CP936',
10008,
],
// Macintosh Romania
[
'MACROMANIA',
10010,
],
// Macintosh Ukraine
[
'MACUKRAINE',
10017,
],
// Macintosh Thai
[
'MACTHAI',
10021,
],
// Macintosh Central Europe
[
'MACCENTRALEUROPE',
@ -201,6 +246,16 @@ return [
'MACTURKISH',
10081,
],
// Macintosh Croatian
[
'MACCROATIAN',
10082,
],
// UTF-16 (BIFF8): code page 21010 is accepted because some Excel writer libraries erroneously use it for UTF-16LE
[
'UTF-16LE',
21010,
],
// Apple Roman
[
'MAC',