diff --git a/src/PhpSpreadsheet/Shared/CodePage.php b/src/PhpSpreadsheet/Shared/CodePage.php
index b395293c..97cbfbbe 100644
--- a/src/PhpSpreadsheet/Shared/CodePage.php
+++ b/src/PhpSpreadsheet/Shared/CodePage.php
@@ -6,6 +6,70 @@ use PhpOffice\PhpSpreadsheet\Exception as PhpSpreadsheetException;
 
 class CodePage
 {
+    /**
+     * Microsoft code page identifiers mapped to the code page names returned by numberToName().
+     *
+     * @var array<int, string>
+     */
+    private static $pageArray = [
+        0 => 'CP1252', // CodePage is not always correctly set when the xls file was saved by Apple's Numbers program
+        367 => 'ASCII', // ASCII
+        437 => 'CP437', // OEM US
+        //720 => 'notsupported', // OEM Arabic
+        737 => 'CP737', // OEM Greek
+        775 => 'CP775', // OEM Baltic
+        850 => 'CP850', // OEM Latin I
+        852 => 'CP852', // OEM Latin II (Central European)
+        855 => 'CP855', // OEM Cyrillic
+        857 => 'CP857', // OEM Turkish
+        858 => 'CP858', // OEM Multilingual Latin I with Euro
+        860 => 'CP860', // OEM Portuguese
+        861 => 'CP861', // OEM Icelandic
+        862 => 'CP862', // OEM Hebrew
+        863 => 'CP863', // OEM Canadian (French)
+        864 => 'CP864', // OEM Arabic
+        865 => 'CP865', // OEM Nordic
+        866 => 'CP866', // OEM Cyrillic (Russian)
+        869 => 'CP869', // OEM Greek (Modern)
+        874 => 'CP874', // ANSI Thai
+        932 => 'CP932', // ANSI Japanese Shift-JIS
+        936 => 'CP936', // ANSI Chinese Simplified GBK
+        949 => 'CP949', // ANSI Korean (Wansung)
+        950 => 'CP950', // ANSI Chinese Traditional BIG5
+        1200 => 'UTF-16LE', // UTF-16 (BIFF8)
+        1250 => 'CP1250', // ANSI Latin II (Central European)
+        1251 => 'CP1251', // ANSI Cyrillic
+        1252 => 'CP1252', // ANSI Latin I (BIFF4-BIFF7)
+        1253 => 'CP1253', // ANSI Greek
+        1254 => 'CP1254', // ANSI Turkish
+        1255 => 'CP1255', // ANSI Hebrew
+        1256 => 'CP1256', // ANSI Arabic
+        1257 => 'CP1257', // ANSI Baltic
+        1258 => 'CP1258', // ANSI Vietnamese
+        1361 => 'CP1361', // ANSI Korean (Johab)
+        10000 => 'MAC', // Apple Roman
+        10001 => 'CP932', // Macintosh Japanese
+        10002 => 'CP950', // Macintosh Chinese Traditional
+        10003 => 'CP1361', // Macintosh Korean
+        10004 => 'MACARABIC', // Apple Arabic
+        10005 => 'MACHEBREW', // Apple Hebrew
+        10006 => 'MACGREEK', // Macintosh Greek
+        10007 => 'MACCYRILLIC', // Macintosh Cyrillic
+        10008 => 'CP936', // Macintosh - Simplified Chinese (GB 2312)
+        10010 => 'MACROMANIA', // Macintosh Romania
+        10017 => 'MACUKRAINE', // Macintosh Ukraine
+        10021 => 'MACTHAI', // Macintosh Thai
+        10029 => 'MACCENTRALEUROPE', // Macintosh Central Europe
+        10079 => 'MACICELAND', // Macintosh Icelandic
+        10081 => 'MACTURKISH', // Macintosh Turkish
+        10082 => 'MACCROATIAN', // Macintosh Croatian
+        21010 => 'UTF-16LE', // UTF-16 (BIFF8) This isn't correct, but some Excel writer libraries erroneously use Codepage 21010 for UTF-16LE
+        32768 => 'MAC', // Apple Roman
+        //32769 => 'unsupported', // ANSI Latin I (BIFF2-BIFF3)
+        65000 => 'UTF-7', // Unicode (UTF-7)
+        65001 => 'UTF-8', // Unicode (UTF-8)
+    ];
+
     /**
      * Convert Microsoft Code Page Identifier to Code Page Name which iconv
      * and mbstring understands.
@@ -14,123 +78,26 @@ class CodePage
      *
      * @return string Code Page Name
      */
-    public static function numberToName($codePage)
+    public static function numberToName(int $codePage): string
     {
-        switch ($codePage) {
-            case 367:
-                return 'ASCII'; // ASCII
-            case 437:
-                return 'CP437'; // OEM US
-            case 720:
-                throw new PhpSpreadsheetException('Code page 720 not supported.'); // OEM Arabic
-            case 737:
-                return 'CP737'; // OEM Greek
-            case 775:
-                return 'CP775'; // OEM Baltic
-            case 850:
-                return 'CP850'; // OEM Latin I
-            case 852:
-                return 'CP852'; // OEM Latin II (Central European)
-            case 855:
-                return 'CP855'; // OEM Cyrillic
-            case 857:
-                return 'CP857'; // OEM Turkish
-            case 858:
-                return 'CP858'; // OEM Multilingual Latin I with Euro
-            case 860:
-                return 'CP860'; // OEM Portugese
-            case 861:
-                return 'CP861'; // OEM Icelandic
-            case 862:
-                return 'CP862'; // OEM Hebrew
-            case 863:
-                return 'CP863'; // OEM Canadian (French)
-            case 864:
-                return 'CP864'; // OEM Arabic
-            case 865:
-                return 'CP865'; // OEM Nordic
-            case 866:
-                return 'CP866'; // OEM Cyrillic (Russian)
-            case 869:
-                return 'CP869'; // OEM Greek (Modern)
-            case 874:
-                return 'CP874'; // ANSI Thai
-            case 932:
-                return 'CP932'; // ANSI Japanese Shift-JIS
-            case 936:
-                return 'CP936'; // ANSI Chinese Simplified GBK
-            case 949:
-                return 'CP949'; // ANSI Korean (Wansung)
-            case 950:
-                return 'CP950'; // ANSI Chinese Traditional BIG5
-            case 1200:
-                return 'UTF-16LE'; // UTF-16 (BIFF8)
-            case 1250:
-                return 'CP1250'; // ANSI Latin II (Central European)
-            case 1251:
-                return 'CP1251'; // ANSI Cyrillic
-            case 0:
-                // CodePage is not always correctly set when the xls file was saved by Apple's Numbers program
-            case 1252:
-                return 'CP1252'; // ANSI Latin I (BIFF4-BIFF7)
-            case 1253:
-                return 'CP1253'; // ANSI Greek
-            case 1254:
-                return 'CP1254'; // ANSI Turkish
-            case 1255:
-                return 'CP1255'; // ANSI Hebrew
-            case 1256:
-                return 'CP1256'; // ANSI Arabic
-            case 1257:
-                return 'CP1257'; // ANSI Baltic
-            case 1258:
-                return 'CP1258'; // ANSI Vietnamese
-            case 1361:
-                return 'CP1361'; // ANSI Korean (Johab)
-            case 10000:
-                return 'MAC'; // Apple Roman
-            case 10001:
-                return 'CP932'; // Macintosh Japanese
-            case 10002:
-                return 'CP950'; // Macintosh Chinese Traditional
-            case 10003:
-                return 'CP1361'; // Macintosh Korean
-            case 10004:
-                return 'MACARABIC'; // Apple Arabic
-            case 10005:
-                return 'MACHEBREW'; // Apple Hebrew
-            case 10006:
-                return 'MACGREEK'; // Macintosh Greek
-            case 10007:
-                return 'MACCYRILLIC'; // Macintosh Cyrillic
-            case 10008:
-                return 'CP936'; // Macintosh - Simplified Chinese (GB 2312)
-            case 10010:
-                return 'MACROMANIA'; // Macintosh Romania
-            case 10017:
-                return 'MACUKRAINE'; // Macintosh Ukraine
-            case 10021:
-                return 'MACTHAI'; // Macintosh Thai
-            case 10029:
-                return 'MACCENTRALEUROPE'; // Macintosh Central Europe
-            case 10079:
-                return 'MACICELAND'; // Macintosh Icelandic
-            case 10081:
-                return 'MACTURKISH'; // Macintosh Turkish
-            case 10082:
-                return 'MACCROATIAN'; // Macintosh Croatian
-            case 21010:
-                return 'UTF-16LE'; // UTF-16 (BIFF8) This isn't correct, but some Excel writer libraries erroneously use Codepage 21010 for UTF-16LE
-            case 32768:
-                return 'MAC'; // Apple Roman
-            case 32769:
-                throw new PhpSpreadsheetException('Code page 32769 not supported.'); // ANSI Latin I (BIFF2-BIFF3)
-            case 65000:
-                return 'UTF-7'; // Unicode (UTF-7)
-            case 65001:
-                return 'UTF-8'; // Unicode (UTF-8)
+        if (array_key_exists($codePage, self::$pageArray)) {
+            return self::$pageArray[$codePage];
+        }
+        // 720 (OEM Arabic) and 32769 (ANSI Latin I, BIFF2-BIFF3) are known identifiers with no mapped name
+        if ($codePage === 720 || $codePage === 32769) {
+            throw new PhpSpreadsheetException("Code page $codePage not supported.");
         }
 
         throw new PhpSpreadsheetException('Unknown codepage: ' . $codePage);
     }
+
+    /**
+     * Return the complete map of code page identifiers to code page names.
+     *
+     * @return array<int, string>
+     */
+    public static function getEncodings(): array
+    {
+        return self::$pageArray;
+    }
 }
diff --git a/tests/PhpSpreadsheetTests/Shared/CodePageTest.php b/tests/PhpSpreadsheetTests/Shared/CodePageTest.php
index b86f9015..2bdbda72 100644
--- a/tests/PhpSpreadsheetTests/Shared/CodePageTest.php
+++ b/tests/PhpSpreadsheetTests/Shared/CodePageTest.php
@@ -24,6 +24,26 @@ class CodePageTest extends TestCase
         return require 'tests/data/Shared/CodePage.php';
     }
 
+    /**
+     * Check that every code page returned by CodePage::getEncodings() appears in the provider data.
+     */
+    public function testCoverage(): void
+    {
+        $covered = [];
+        $expected = CodePage::getEncodings();
+        foreach ($expected as $key => $val) {
+            $covered[$key] = 0;
+        }
+        $tests = $this->providerCodePage();
+        foreach ($tests as $test) {
+            // index 1 of a provider row is used as the code page number (assumed from how $covered is keyed)
+            $covered[$test[1]] = 1;
+        }
+        foreach ($covered as $key => $val) {
+            self::assertSame(1, $val, "Codepage $key not tested");
+        }
+    }
+
     public function testNumberToNameWithInvalidCodePage(): void
     {
         $invalidCodePage = 12345;
diff --git a/tests/data/Shared/CodePage.php b/tests/data/Shared/CodePage.php
index 1cf09d88..82bb23e4 100644
--- a/tests/data/Shared/CodePage.php
+++ b/tests/data/Shared/CodePage.php
@@ -1,6 +1,11 @@