Fix discarding `html[lang]`

`DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag. This means that UTF-8-encoded HTML fragments, such as those coming from the JSON-LD `articleBody` field, would be parsed with an incorrect encoding.

In f14428e4c0, we tried to resolve it by putting a `meta[charset]` tag at the start of the HTML fragment. Unfortunately, it turns out that this causes the parser to auto-insert an `html` element, losing the attributes of the original `html` tag.

Let’s try to insert the `meta[charset]` tag into the proper place in the HTML document.

We do not need to use the same trick with `JSLikeHTMLElement::__set`.
That expects smaller HTML fragments, not `html` documents, so creating `html` and `head` elements will not be a problem.
pull/104/head
Jan Tojnar 1 year ago
parent 541fab34a0
commit efbbc86df9
  1. 43
      src/Readability.php
  2. 42
      tests/ReadabilityTest.php

@ -1419,7 +1419,7 @@ class Readability implements LoggerAwareInterface
unset($tidy);
}
$this->html = '<meta charset="utf-8">' . (string) $this->html;
$this->html = self::ensureMetaCharset((string) $this->html);
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);
@ -1507,4 +1507,45 @@ class Readability implements LoggerAwareInterface
)
&& !$node->hasAttribute('hidden');
}
/**
 * Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
 *
 * Insertion point, in order of preference:
 *  1. nowhere, when a `<meta … charset` declaration is already present;
 *  2. right after the opening `<head>` tag;
 *  3. right after the opening `<html>` tag;
 *  4. at the very start of the fragment.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document, with a `<meta charset="utf-8">` tag added unless one was already declared
 */
private static function ensureMetaCharset(string $html): string
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at first 1024 bytes since, according to HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // Prefer <head> (that is where <meta> tags belong); otherwise put the tag
    // directly inside <html> and let the parser create <head> around it.
    foreach (['head', 'html'] as $tagName) {
        // The lookahead requires the tag name to end right here, so that
        // e.g. <header> is not mistaken for <head>.
        $pattern = '/<' . $tagName . '(?=[\s\/>])[^>]*>/i';

        if (1 === preg_match($pattern, $start, $matches, PREG_OFFSET_CAPTURE)) {
            // $start is a prefix of $html, so byte offsets are valid in both.
            $insertAt = $matches[0][1] + strlen($matches[0][0]);

            return substr_replace($html, $charsetTag, $insertAt, 0);
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
}

@ -529,6 +529,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
}
}
/**
 * Provides HTML inputs for `testHtmlLang`.
 *
 * Each dataset is `[html, expected lang attribute, use tidy]`; the last item is optional.
 * The provider is static, as required by current PHPUnit versions.
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public static function dataForHtmlLang(): array
{
    // Article markup shared by every dataset.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    return [
        // Document that already declares its charset.
        'meta' => [
            '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
            'fr',
        ],
        // Document with a <head> but no charset declaration.
        'head' => [
            '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
            'fr',
        ],
        // Document without a <head>.
        'headless' => [
            '<html lang="fr"><body>' . $article . '</body></html>',
            'fr',
            // tidy would add <head> tag.
            false,
        ],
        // Bare fragment without <html>.
        'fragment' => [
            $article,
            '',
            // tidy would add <html>.
            false,
        ],
    ];
}
/**
 * Parses the given HTML with the libxml parser and checks the `lang`
 * attribute found on the resulting document element.
 *
 * @dataProvider dataForHtmlLang
 */
public function testHtmlLang(string $html, string $lang, bool $useTidy = true): void
{
    $subject = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);

    // Initialisation has to succeed before the DOM can be inspected.
    $this->assertTrue($subject->init());
    $this->assertInstanceOf(\DOMDocument::class, $subject->dom);

    $root = $subject->dom->documentElement;
    $this->assertSame($lang, $root->getAttribute('lang'));
}
private function getReadability(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
{
$readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save