[Feature] Html reader improvements (#884)

* Extract character set, so we can convert to UTF-8 if required

* Set column width and row height when defined on tr/td

* Parse align and valign on td

* Specify number format of cell via html attribute

* Formatting of b, strong, i and em tags

* Inserting image in cell when using img tag in html

* Add applying inline styles: border, fonts, alignment, dimensions

* Add tests for applying inline styles
This commit is contained in:
Patrick Brouwers 2019-02-16 23:11:16 +01:00 committed by Mark Baker
parent 11575ef3c4
commit 1c99f4999c
4 changed files with 551 additions and 28 deletions

View File

@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com)
and this project adheres to [Semantic Versioning](https://semver.org).
## [Unreleased] -
### Added
- Added support for inline styles in Html reader (borders, alignment, width, height)
## [1.6.0] - 2019-01-02
### Added

View File

@ -12,6 +12,9 @@ use PhpOffice\PhpSpreadsheet\Spreadsheet;
use PhpOffice\PhpSpreadsheet\Style\Border;
use PhpOffice\PhpSpreadsheet\Style\Color;
use PhpOffice\PhpSpreadsheet\Style\Fill;
use PhpOffice\PhpSpreadsheet\Style\Font;
use PhpOffice\PhpSpreadsheet\Style\Style;
use PhpOffice\PhpSpreadsheet\Worksheet\Drawing;
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
/** PhpSpreadsheet root directory */
@ -96,6 +99,26 @@ class Html extends BaseReader
],
],
], // Bottom border
'strong' => [
'font' => [
'bold' => true,
],
], // Bold
'b' => [
'font' => [
'bold' => true,
],
], // Bold
'i' => [
'font' => [
'italic' => true,
],
], // Italic
'em' => [
'font' => [
'italic' => true,
],
], // Italic
];
protected $rowspan = [];
@ -295,11 +318,9 @@ class Html extends BaseReader
switch ($child->nodeName) {
case 'meta':
foreach ($attributeArray as $attributeName => $attributeValue) {
switch ($attributeName) {
case 'content':
// TODO
// Extract character set, so we can convert to UTF-8 if required
break;
if ($attributeName === 'charset') {
$this->setInputEncoding($attributeValue);
}
}
$this->processDomElement($child, $sheet, $row, $column, $cellContent);
@ -334,6 +355,10 @@ class Html extends BaseReader
$cellContent .= ' ';
}
if (isset($this->formats[$child->nodeName])) {
$sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
}
break;
case 'hr':
$this->flushCell($sheet, $column, $row, $cellContent);
@ -423,6 +448,10 @@ class Html extends BaseReader
$column = 'A';
}
break;
case 'img':
$this->insertImage($sheet, $column, $row, $attributeArray);
break;
case 'table':
$this->flushCell($sheet, $column, $row, $cellContent);
@ -448,6 +477,11 @@ class Html extends BaseReader
$column = $this->getTableStartColumn();
$cellContent = '';
$this->processDomElement($child, $sheet, $row, $column, $cellContent);
if (isset($attributeArray['height'])) {
$sheet->getRowDimension($row)->setRowHeight($attributeArray['height']);
}
++$row;
break;
@ -501,6 +535,27 @@ class Html extends BaseReader
]
);
}
if (isset($attributeArray['width'])) {
$sheet->getColumnDimension($column)->setWidth($attributeArray['width']);
}
if (isset($attributeArray['height'])) {
$sheet->getRowDimension($row)->setRowHeight($attributeArray['height']);
}
if (isset($attributeArray['align'])) {
$sheet->getStyle($column . $row)->getAlignment()->setHorizontal($attributeArray['align']);
}
if (isset($attributeArray['valign'])) {
$sheet->getStyle($column . $row)->getAlignment()->setVertical($attributeArray['valign']);
}
if (isset($attributeArray['data-format'])) {
$sheet->getStyle($column . $row)->getNumberFormat()->setFormatCode($attributeArray['data-format']);
}
++$column;
break;
@ -608,36 +663,271 @@ class Html extends BaseReader
return;
}
$supported_styles = ['background-color', 'color'];
$cellStyle = $sheet->getStyle($column . $row);
// add color styles (background & text) from dom element; currently supports td & th, using ONLY inline css style with RGB color
$styles = explode(';', $attributeArray['style']);
foreach ($styles as $st) {
$value = explode(':', $st);
$styleName = isset($value[0]) ? trim($value[0]) : null;
$styleValue = isset($value[1]) ? trim($value[1]) : null;
if (empty(trim($value[0])) || !in_array(trim($value[0]), $supported_styles)) {
if (!$styleName) {
continue;
}
//check if has #, so we can get clean hex
if (substr(trim($value[1]), 0, 1) == '#') {
$style_color = substr(trim($value[1]), 1);
}
if (empty($style_color)) {
continue;
}
switch (trim($value[0])) {
switch ($styleName) {
case 'background':
case 'background-color':
$sheet->getStyle($column . $row)->applyFromArray(['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => "{$style_color}"]]]);
$styleColor = $this->getStyleColor($styleValue);
if (!$styleColor) {
continue 2;
}
$cellStyle->applyFromArray(['fill' => ['fillType' => Fill::FILL_SOLID, 'color' => ['rgb' => $styleColor]]]);
break;
case 'color':
$sheet->getStyle($column . $row)->applyFromArray(['font' => ['color' => ['rgb' => "{$style_color}"]]]);
$styleColor = $this->getStyleColor($styleValue);
if (!$styleColor) {
continue 2;
}
$cellStyle->applyFromArray(['font' => ['color' => ['rgb' => $styleColor]]]);
break;
case 'border':
$this->setBorderStyle($cellStyle, $styleValue, 'allBorders');
break;
case 'border-top':
$this->setBorderStyle($cellStyle, $styleValue, 'top');
break;
case 'border-bottom':
$this->setBorderStyle($cellStyle, $styleValue, 'bottom');
break;
case 'border-left':
$this->setBorderStyle($cellStyle, $styleValue, 'left');
break;
case 'border-right':
$this->setBorderStyle($cellStyle, $styleValue, 'right');
break;
case 'font-size':
$cellStyle->getFont()->setSize(
(float) $styleValue
);
break;
case 'font-weight':
if ($styleValue === 'bold' || $styleValue >= 500) {
$cellStyle->getFont()->setBold(true);
}
break;
case 'font-style':
if ($styleValue === 'italic') {
$cellStyle->getFont()->setItalic(true);
}
break;
case 'font-family':
$cellStyle->getFont()->setName(str_replace('\'', '', $styleValue));
break;
case 'text-decoration':
switch ($styleValue) {
case 'underline':
$cellStyle->getFont()->setUnderline(Font::UNDERLINE_SINGLE);
break;
case 'line-through':
$cellStyle->getFont()->setStrikethrough(true);
break;
}
break;
case 'text-align':
$cellStyle->getAlignment()->setHorizontal($styleValue);
break;
case 'vertical-align':
$cellStyle->getAlignment()->setVertical($styleValue);
break;
case 'width':
$sheet->getColumnDimension($column)->setWidth(
str_replace('px', '', $styleValue)
);
break;
case 'height':
$sheet->getRowDimension($row)->setRowHeight(
str_replace('px', '', $styleValue)
);
break;
case 'word-wrap':
$cellStyle->getAlignment()->setWrapText(
$styleValue === 'break-word'
);
break;
case 'text-indent':
$cellStyle->getAlignment()->setIndent(
(int) str_replace(['px'], '', $styleValue)
);
break;
}
}
}
/**
 * Extract the hex digits from a colour written with a leading "#".
 *
 * Only values that start with "#" are recognised; the "#" is stripped and the
 * remainder is returned unchanged. Anything else yields null.
 *
 * @param null|string $value Raw CSS colour value; may be null when the style declaration had no value
 *
 * @return null|string Hex digits without the leading "#", or null when $value is not a "#"-prefixed string
 */
public function getStyleColor($value)
{
    // Declarations without a value reach this method as null; bail out before
    // handing a non-string to strpos().
    if (!is_string($value)) {
        return null;
    }

    if (strpos($value, '#') === 0) {
        return substr($value, 1);
    }

    return null;
}
/**
 * Insert the image referenced by an img element as a drawing anchored at the given cell.
 *
 * Does nothing when the element has no "src" attribute. After the drawing is
 * attached, the anchor cell's column width and row height are set from the
 * drawing's width and height.
 *
 * @param Worksheet $sheet
 * @param string $column
 * @param int $row
 * @param array $attributes Attributes of the img element ("src", and optionally "width", "height", "alt")
 *
 * @throws \PhpOffice\PhpSpreadsheet\Exception
 */
private function insertImage(Worksheet $sheet, $column, $row, array $attributes)
{
    if (!isset($attributes['src'])) {
        return;
    }

    $src = urldecode($attributes['src']);
    $width = isset($attributes['width']) ? (float) $attributes['width'] : null;
    $height = isset($attributes['height']) ? (float) $attributes['height'] : null;
    // "alt" is free text: keep it as a string. A float cast would turn any
    // non-numeric text into 0.0 and the name would never be applied.
    $name = isset($attributes['alt']) ? (string) $attributes['alt'] : null;

    $drawing = new Drawing();
    $drawing->setPath($src);
    $drawing->setWorksheet($sheet);
    $drawing->setCoordinates($column . $row);
    $drawing->setOffsetX(0);
    $drawing->setOffsetY(10);
    $drawing->setResizeProportional(true);

    if ($name !== null && $name !== '') {
        $drawing->setName($name);
    }

    if ($width) {
        $drawing->setWidth((int) $width);
    }

    if ($height) {
        $drawing->setHeight((int) $height);
    }

    // Resize the anchor cell from the drawing's dimensions.
    $sheet->getColumnDimension($column)->setWidth(
        $drawing->getWidth() / 6
    );

    $sheet->getRowDimension($row)->setRowHeight(
        $drawing->getHeight() * 0.9
    );
}
/**
 * Map html border style to PhpSpreadsheet border style.
 *
 * The keyword is matched case-insensitively and surrounding whitespace is
 * ignored. Non-string input and unknown keywords yield null.
 *
 * @param string $style Border keyword, e.g. "solid", "dashed", "double"
 *
 * @return null|string The matching Border::BORDER_* constant, or null when the keyword is not recognised
 */
public function getBorderStyle($style)
{
    // Strict string lookup: a loose switch comparison could let non-string
    // input (such as 0 or true) match the first case.
    if (!is_string($style)) {
        return null;
    }

    $borderStyles = [
        'solid' => Border::BORDER_THIN,
        'dashed' => Border::BORDER_DASHED,
        'dotted' => Border::BORDER_DOTTED,
        'medium' => Border::BORDER_MEDIUM,
        'thick' => Border::BORDER_THICK,
        'none' => Border::BORDER_NONE,
        'dash-dot' => Border::BORDER_DASHDOT,
        'dash-dot-dot' => Border::BORDER_DASHDOTDOT,
        'double' => Border::BORDER_DOUBLE,
        'hair' => Border::BORDER_HAIR,
        'medium-dash-dot' => Border::BORDER_MEDIUMDASHDOT,
        'medium-dash-dot-dot' => Border::BORDER_MEDIUMDASHDOTDOT,
        'medium-dashed' => Border::BORDER_MEDIUMDASHED,
        'slant-dash-dot' => Border::BORDER_SLANTDASHDOT,
    ];

    $keyword = strtolower(trim($style));

    return isset($borderStyles[$keyword]) ? $borderStyles[$keyword] : null;
}
/**
 * Apply a CSS border shorthand value to one border (or all borders) of a cell style.
 *
 * The value is read positionally as "<width> <style> <color>": the first
 * token is ignored, the second is mapped through getBorderStyle() and the
 * third through getStyleColor(). Missing tokens are treated as null instead
 * of raising undefined-offset notices.
 *
 * @param Style $cellStyle
 * @param string $styleValue Border shorthand, e.g. "1px solid #333333"
 * @param string $type Key under "borders" to apply to, e.g. "allBorders", "top", "bottom", "left", "right"
 */
private function setBorderStyle(Style $cellStyle, $styleValue, $type)
{
    // Split on runs of whitespace so repeated spaces or tabs do not shift the positions.
    $parts = preg_split('/\s+/', trim((string) $styleValue));
    $borderStyle = isset($parts[1]) ? $parts[1] : null;
    $color = isset($parts[2]) ? $parts[2] : null;

    $cellStyle->applyFromArray([
        'borders' => [
            $type => [
                'borderStyle' => $borderStyle !== null ? $this->getBorderStyle($borderStyle) : null,
                'color' => ['rgb' => $color !== null ? $this->getStyleColor($color) : null],
            ],
        ],
    ]);
}
}

View File

@ -3,6 +3,10 @@
namespace PhpOffice\PhpSpreadsheetTests\Reader;
use PhpOffice\PhpSpreadsheet\Reader\Html;
use PhpOffice\PhpSpreadsheet\Style\Alignment;
use PhpOffice\PhpSpreadsheet\Style\Border;
use PhpOffice\PhpSpreadsheet\Style\Font;
use PhpOffice\PhpSpreadsheet\Worksheet\Drawing;
use PHPUnit\Framework\TestCase;
class HtmlTest extends TestCase
@ -34,14 +38,13 @@ class HtmlTest extends TestCase
*/
public function testCanReadVerySmallFile($expected, $content)
{
$filename = tempnam(sys_get_temp_dir(), 'html');
file_put_contents($filename, $content);
$filename = $this->createHtml($content);
$reader = new Html();
$actual = $reader->canRead($filename);
unlink($filename);
self::assertSame($expected, $actual);
unlink($filename);
}
public function testBackgroundColorInRanding()
@ -51,14 +54,238 @@ class HtmlTest extends TestCase
<td style="background-color: #000000;color: #FFFFFF">Blue background</td>
</tr>
</table>';
$filename = tempnam(sys_get_temp_dir(), 'html');
file_put_contents($filename, $html);
$reader = new Html();
$spreadsheet = $reader->load($filename);
$filename = $this->createHtml($html);
$spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
$firstSheet = $spreadsheet->getSheet(0);
$style = $firstSheet->getCell('A1')->getStyle();
self::assertEquals('FFFFFF', $style->getFont()->getColor()->getRGB());
unlink($filename);
}
/**
 * Inline "border" and "border-<side>" declarations on td are applied to the cell's borders.
 */
public function testCanApplyInlineBordersStyles()
{
    $html = '<table>
<tr>
<td style="border: 1px solid #333333;">Thin border</td>
<td style="border-bottom: 1px solid #333333;">Border bottom</td>
<td style="border-top: 1px solid #333333;">Border top</td>
<td style="border-left: 1px solid #333333;">Border left</td>
<td style="border-right: 1px solid #333333;">Border right</td>
</tr>
</table>';
    $filename = $this->createHtml($html);

    // Clean up in finally so a failing assertion or load error does not leak the temporary file.
    try {
        $spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
        $firstSheet = $spreadsheet->getSheet(0);

        // A1 uses the "border" shorthand: all four sides are expected to be set.
        $borders = $firstSheet->getCell('A1')->getStyle()->getBorders();

        /** @var Border $border */
        foreach ([$borders->getTop(), $borders->getBottom(), $borders->getLeft(), $borders->getRight()] as $border) {
            self::assertEquals('333333', $border->getColor()->getRGB());
            self::assertEquals(Border::BORDER_THIN, $border->getBorderStyle());
        }

        // B1..E1 each declare a single side; map the cell to the getter for that side.
        $sideGetters = [
            'B1' => 'getBottom',
            'C1' => 'getTop',
            'D1' => 'getLeft',
            'E1' => 'getRight',
        ];

        foreach ($sideGetters as $coordinate => $getter) {
            $border = $firstSheet->getCell($coordinate)->getStyle()->getBorders()->{$getter}();
            self::assertEquals('333333', $border->getColor()->getRGB());
            self::assertEquals(Border::BORDER_THIN, $border->getBorderStyle());
        }
    } finally {
        unlink($filename);
    }
}
/**
 * Inline font declarations (size, family, weight, style, decoration) on td are applied to the cell's font.
 */
public function testCanApplyInlineFontStyles()
{
    $html = '<table>
<tr>
<td style="font-size: 16px;">16px</td>
<td style="font-family: \'Times New Roman\'">Times New Roman</td>
<td style="font-weight: bold;">Bold</td>
<td style="font-style: italic;">Italic</td>
<td style="text-decoration: underline;">Underline</td>
<td style="text-decoration: line-through;">Line through</td>
</tr>
</table>';
    $filename = $this->createHtml($html);

    // Clean up in finally so a failing assertion or load error does not leak the temporary file.
    try {
        $spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
        $firstSheet = $spreadsheet->getSheet(0);

        $font = $firstSheet->getCell('A1')->getStyle()->getFont();
        self::assertEquals(16, $font->getSize());

        $font = $firstSheet->getCell('B1')->getStyle()->getFont();
        self::assertEquals('Times New Roman', $font->getName());

        $font = $firstSheet->getCell('C1')->getStyle()->getFont();
        self::assertTrue($font->getBold());

        $font = $firstSheet->getCell('D1')->getStyle()->getFont();
        self::assertTrue($font->getItalic());

        $font = $firstSheet->getCell('E1')->getStyle()->getFont();
        self::assertEquals(Font::UNDERLINE_SINGLE, $font->getUnderline());

        $font = $firstSheet->getCell('F1')->getStyle()->getFont();
        self::assertTrue($font->getStrikethrough());
    } finally {
        unlink($filename);
    }
}
/**
 * A width given as a td attribute or as an inline "width" style sets the column width.
 */
public function testCanApplyInlineWidth()
{
    $html = '<table>
<tr>
<td width="50">50px</td>
<td style="width: 100px;">100px</td>
</tr>
</table>';
    $filename = $this->createHtml($html);

    // Clean up in finally so a failing assertion or load error does not leak the temporary file.
    try {
        $spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
        $firstSheet = $spreadsheet->getSheet(0);

        // Column A takes its width from the attribute, column B from the inline style.
        self::assertEquals(50, $firstSheet->getColumnDimension('A')->getWidth());
        self::assertEquals(100, $firstSheet->getColumnDimension('B')->getWidth());
    } finally {
        unlink($filename);
    }
}
/**
 * A height given as a td attribute or as an inline "height" style sets the row height.
 */
public function testCanApplyInlineHeight()
{
    $html = '<table>
<tr>
<td height="50">1</td>
</tr>
<tr>
<td style="height: 100px;">2</td>
</tr>
</table>';
    $filename = $this->createHtml($html);

    // Clean up in finally so a failing assertion or load error does not leak the temporary file.
    try {
        $spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
        $firstSheet = $spreadsheet->getSheet(0);

        // Row 1 takes its height from the attribute, row 2 from the inline style.
        self::assertEquals(50, $firstSheet->getRowDimension(1)->getRowHeight());
        self::assertEquals(100, $firstSheet->getRowDimension(2)->getRowHeight());
    } finally {
        unlink($filename);
    }
}
/**
 * Alignment given via align/valign attributes or inline styles (text-align,
 * vertical-align, text-indent, word-wrap) is applied to the cell's alignment.
 */
public function testCanApplyAlignment()
{
    $html = '<table>
<tr>
<td align="center">Center align</td>
<td valign="center">Center valign</td>
<td style="text-align: center;">Center align</td>
<td style="vertical-align: center;">Center valign</td>
<td style="text-indent: 10px;">Text indent</td>
<td style="word-wrap: break-word;">Wraptext</td>
</tr>
</table>';
    $filename = $this->createHtml($html);

    // Clean up in finally so a failing assertion or load error does not leak the temporary file.
    try {
        $spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
        $firstSheet = $spreadsheet->getSheet(0);

        $alignment = $firstSheet->getCell('A1')->getStyle()->getAlignment();
        self::assertEquals(Alignment::HORIZONTAL_CENTER, $alignment->getHorizontal());

        $alignment = $firstSheet->getCell('B1')->getStyle()->getAlignment();
        self::assertEquals(Alignment::VERTICAL_CENTER, $alignment->getVertical());

        $alignment = $firstSheet->getCell('C1')->getStyle()->getAlignment();
        self::assertEquals(Alignment::HORIZONTAL_CENTER, $alignment->getHorizontal());

        $alignment = $firstSheet->getCell('D1')->getStyle()->getAlignment();
        self::assertEquals(Alignment::VERTICAL_CENTER, $alignment->getVertical());

        $alignment = $firstSheet->getCell('E1')->getStyle()->getAlignment();
        self::assertEquals(10, $alignment->getIndent());

        $alignment = $firstSheet->getCell('F1')->getStyle()->getAlignment();
        self::assertTrue($alignment->getWrapText());
    } finally {
        unlink($filename);
    }
}
/**
 * The "data-format" attribute on td sets the cell's number format code.
 */
public function testCanApplyInlineDataFormat()
{
    $html = '<table>
<tr>
<td data-format="mmm-yy">2019-02-02 12:34:00</td>
</tr>
</table>';
    $filename = $this->createHtml($html);

    // Clean up in finally so a failing assertion or load error does not leak the temporary file.
    try {
        $spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
        $firstSheet = $spreadsheet->getSheet(0);

        $numberFormat = $firstSheet->getCell('A1')->getStyle()->getNumberFormat();
        self::assertEquals('mmm-yy', $numberFormat->getFormatCode());
    } finally {
        unlink($filename);
    }
}
/**
 * An img element inside a td is added to the sheet as a drawing anchored at that cell.
 */
public function testCanInsertImage()
{
    $imagePath = realpath(__DIR__ . '/../../data/Reader/HTML/image.jpg');
    $html = '<table>
<tr>
<td><img src="' . $imagePath . '" alt=""></td>
</tr>
</table>';
    $filename = $this->createHtml($html);

    // Clean up in finally so a failing assertion or load error does not leak the temporary file.
    try {
        $spreadsheet = $this->loadHtmlIntoSpreadsheet($filename);
        $firstSheet = $spreadsheet->getSheet(0);

        /** @var Drawing $drawing */
        $drawing = $firstSheet->getDrawingCollection()[0];
        self::assertEquals($imagePath, $drawing->getPath());
        self::assertEquals('A1', $drawing->getCoordinates());
    } finally {
        unlink($filename);
    }
}
/**
 * Write the given HTML to a new temporary file and return its path.
 *
 * The caller is responsible for deleting the file.
 *
 * @param string $html
 *
 * @throws \RuntimeException When the temporary file cannot be created or written
 *
 * @return string Path of the temporary file
 */
private function createHtml($html)
{
    $filename = tempnam(sys_get_temp_dir(), 'html');
    if ($filename === false) {
        throw new \RuntimeException('Unable to create a temporary HTML file.');
    }

    if (file_put_contents($filename, $html) === false) {
        // Do not leave an empty temporary file behind when the write fails.
        unlink($filename);

        throw new \RuntimeException('Unable to write temporary HTML file: ' . $filename);
    }

    return $filename;
}
/**
 * Load the file at the given path with a fresh Html reader.
 *
 * @param string $filename
 *
 * @return \PhpOffice\PhpSpreadsheet\Spreadsheet
 */
private function loadHtmlIntoSpreadsheet($filename)
{
    $reader = new Html();
    $spreadsheet = $reader->load($filename);

    return $spreadsheet;
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB