Fix discarding `html[lang]`

`DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag. This means that UTF-8-encoded HTML fragments, such as those coming from the JSON-LD `articleBody` field, would be parsed with the wrong encoding.

In f14428e4c0, we tried to resolve this by putting a `meta[charset]` tag at the start of the HTML fragment. Unfortunately, it turns out that this causes the parser to auto-insert an `html` element, losing the attributes of the original `html` tag.

Let’s try to insert the `meta[charset]` tag into the proper place in the HTML document.

We do not need to use the same trick with `JSLikeHTMLElement::__set`.
That expects smaller HTML fragments, not `html` documents, so creating `html` and `head` elements will not be a problem.

(cherry picked from commit efbbc86df9)

Had to strip type hints since we still target PHP 5.6.
pull/103/head
Jan Tojnar 1 year ago
parent 5afefcff34
commit f1c6297e3c
  1. 43
      src/Readability.php
  2. 42
      tests/ReadabilityTest.php

@ -1435,7 +1435,7 @@ class Readability implements LoggerAwareInterface
unset($tidy);
}
$this->html = '<meta charset="utf-8">' . (string) $this->html;
$this->html = self::ensureMetaCharset((string) $this->html);
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);
@ -1453,4 +1453,45 @@ class Readability implements LoggerAwareInterface
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
}
/**
 * Tries to insert a `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments, such as those coming from the JSON-LD `articleBody` field,
 * would be parsed with the wrong encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause
 * the parser to auto-insert an `html` element, losing the attributes of the original `html` tag.
 *
 * Placement rules, in order of preference:
 *  1. a `<meta … charset …>` tag already exists near the start: the document is returned untouched;
 *  2. a `<head>` opening tag exists: the tag is inserted right after it;
 *  3. an `<html>` opening tag exists: the tag is inserted right after it (the parser creates `<head>`);
 *  4. otherwise the tag is prepended to the fragment.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document with a `meta[charset]` declaration present
 */
private static function ensureMetaCharset($html)
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at first 1024 bytes since, according to HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    // `\s` after the tag name keeps look-alike names (e.g. <metadata>) from matching.
    if (1 === preg_match('/<meta\s[^>]*charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // Candidate opening tags after which the <meta> may be placed, most specific first.
    // The lookahead requires the tag name to end right there, so that `<header>`
    // is not mistaken for `<head>` (and likewise for any `<html…>` look-alike).
    $anchors = [
        // <head> tag was located, <meta> tags go there.
        '/<head(?=[\s>\/])[^>]*>/i',
        // <html> tag was located, let’s put it inside and have parser create <head>.
        '/<html(?=[\s>\/])[^>]*>/i',
    ];

    foreach ($anchors as $anchor) {
        if (1 === preg_match($anchor, $start, $found, PREG_OFFSET_CAPTURE)) {
            // Byte offset just past the matched opening tag. The match lies entirely
            // within $start, which is a prefix of $html, so the offset is valid for $html.
            $insertAt = $found[0][1] + strlen($found[0][0]);

            // Zero-length replacement == insertion; unlike preg_replace() this cannot yield null.
            return substr_replace($html, $charsetTag, $insertAt, 0);
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
}

@ -486,6 +486,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
}
/**
 * Data sets for testHtmlLang().
 *
 * Each entry holds the HTML source, the `lang` attribute value expected on the
 * parsed document element and, optionally, whether tidy should be used
 * (the test defaults this to true when omitted).
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public function dataForHtmlLang()
{
    // Content shared by every data set.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';
    $body = '<body>' . $article . '</body>';
    $htmlOpen = '<html lang="fr">';
    $htmlClose = '</html>';

    $cases = [];

    // Document that already declares its charset.
    $cases['meta'] = [
        $htmlOpen . '<head><meta charset="utf-8"></head>' . $body . $htmlClose,
        'fr',
    ];

    // Document with a <head> but no charset declaration.
    $cases['head'] = [
        $htmlOpen . '<head><title>Foo</title></head>' . $body . $htmlClose,
        'fr',
    ];

    // Document without <head>; tidy is disabled because it would add a <head> tag.
    $cases['headless'] = [
        $htmlOpen . $body . $htmlClose,
        'fr',
        false,
    ];

    // Bare fragment; tidy is disabled because it would add <html>.
    $cases['fragment'] = [
        $article,
        '',
        false,
    ];

    return $cases;
}
/**
 * Checks the `lang` attribute found on the document element after init().
 *
 * @dataProvider dataForHtmlLang
 *
 * @param string $html    HTML source handed to Readability
 * @param string $lang    expected value of the document element’s `lang` attribute
 * @param bool   $useTidy forwarded to getReadability()
 */
public function testHtmlLang($html, $lang, $useTidy = true)
{
    $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);

    $initialised = $readability->init();
    $this->assertTrue($initialised);

    $this->assertInstanceOf(\DOMDocument::class, $readability->dom);

    $actualLang = $readability->dom->documentElement->getAttribute('lang');
    $this->assertSame($lang, $actualLang);
}
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
{
$readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save