Merge pull request #104 from jtojnar/html-shadowing

Fix discarding `html[lang]`
pull/105/head 2.0.6
Jérémy Benoist 1 year ago committed by GitHub
commit 7413a38ff0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 43
      src/Readability.php
  2. 73
      tests/ReadabilityTest.php

@ -1422,7 +1422,7 @@ class Readability implements LoggerAwareInterface
unset($tidy); unset($tidy);
} }
$this->html = '<meta charset="utf-8">' . (string) $this->html; $this->html = self::ensureMetaCharset((string) $this->html);
if ('html5lib' === $this->parser || 'html5' === $this->parser) { if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html); $this->dom = (new HTML5())->loadHTML($this->html);
@ -1510,4 +1510,45 @@ class Readability implements LoggerAwareInterface
) )
&& !$node->hasAttribute('hidden'); && !$node->hasAttribute('hidden');
} }
/**
 * Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
 *
 * Placement, in order of preference:
 *  1. nothing is changed when a `<meta ... charset` declaration is already present,
 *  2. right after the opening `<head>` tag,
 *  3. right after the opening `<html>` tag,
 *  4. at the very start of the document.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document with a `meta[charset]` declaration
 */
private static function ensureMetaCharset(string $html): string
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at first 1024 bytes since, according to HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // Opening tags the <meta> may be placed after, most preferred first.
    // The lookahead requires the tag name to end right there, so that `<header>`
    // is not mistaken for `<head>` (nor e.g. `<html-foo>` for `<html>`).
    $containerPatterns = [
        // <head> tag was located, <meta> tags go there.
        '/<head(?=[\s\/>])[^>]*>/i',
        // <html> tag was located, let’s put it inside and have parser create <head>.
        '/<html(?=[\s\/>])[^>]*>/i',
    ];

    foreach ($containerPatterns as $pattern) {
        if (1 !== preg_match($pattern, $start)) {
            continue;
        }

        $patched = preg_replace($pattern, '$0' . $charsetTag, $html, 1);

        // preg_replace() yields null on a PCRE error; try the next option in that case
        // rather than returning a non-string.
        if (null !== $patched) {
            return $patched;
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
} }

@ -24,7 +24,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability->init(); $readability->init();
$this->assertNull($readability->url); $this->assertNull($readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
} }
public function testConstructHtml5Parser(): void public function testConstructHtml5Parser(): void
@ -33,7 +33,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability->init(); $readability->init();
$this->assertSame('http://0.0.0.0', $readability->url); $this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame('<html/>', $readability->original_html); $this->assertSame('<html/>', $readability->original_html);
} }
@ -46,7 +46,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability->init(); $readability->init();
$this->assertSame('http://0.0.0.0', $readability->url); $this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame('<html/>', $readability->original_html); $this->assertSame('<html/>', $readability->original_html);
$this->assertTrue($readability->tidied); $this->assertTrue($readability->tidied);
} }
@ -60,7 +60,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertSame('', $readability->original_html); $this->assertSame('', $readability->original_html);
$this->assertFalse($readability->tidied); $this->assertFalse($readability->tidied);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
} }
public function testConstructSimpleWithoutTidy(): void public function testConstructSimpleWithoutTidy(): void
@ -69,7 +69,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability->init(); $readability->init();
$this->assertSame('http://0.0.0.0', $readability->url); $this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom); $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame('<html/>', $readability->original_html); $this->assertSame('<html/>', $readability->original_html);
$this->assertFalse($readability->tidied); $this->assertFalse($readability->tidied);
} }
@ -115,7 +115,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testInitDiv(): void public function testInitDiv(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -129,7 +128,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithFootnotes(): void public function testWithFootnotes(): void
{ {
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true; $readability->convertLinksToFootnotes = true;
$res = $readability->init(); $res = $readability->init();
@ -146,7 +144,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testStandardClean(): void public function testStandardClean(): void
{ {
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false; $readability->lightClean = false;
$res = $readability->init(); $res = $readability->init();
@ -163,7 +160,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithIframe(): void public function testWithIframe(): void
{ {
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0'); $readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -178,7 +174,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithArticle(): void public function testWithArticle(): void
{ {
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -193,7 +188,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithAside(): void public function testWithAside(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -208,7 +202,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClasses(): void public function testWithClasses(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -223,7 +216,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClassesWithoutLightClean(): void public function testWithClassesWithoutLightClean(): void
{ {
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false; $readability->lightClean = false;
$res = $readability->init(); $res = $readability->init();
@ -239,7 +231,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithTd(): void public function testWithTd(): void
{ {
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0'); $readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -252,7 +243,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithSameClasses(): void public function testWithSameClasses(): void
{ {
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -266,7 +256,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithScript(): void public function testWithScript(): void
{ {
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -280,7 +269,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitle(): void public function testTitle(): void
{ {
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -294,7 +282,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDash(): void public function testTitleWithDash(): void
{ {
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -308,7 +295,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDoubleDot(): void public function testTitleWithDoubleDot(): void
{ {
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -322,7 +308,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleTooShortUseH1(): void public function testTitleTooShortUseH1(): void
{ {
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0'); $readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -369,7 +354,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
</html>'; </html>';
$readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030'); $readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -437,7 +421,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
</html>'; </html>';
$readability = $this->getReadability($data, 'http://0.0.0.0'); $readability = $this->getReadability($data, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
@ -474,7 +457,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html'); $html = (string) file_get_contents('tests/fixtures/childNodeGoesNull.html');
$readability = $this->getReadability($html, 'http://0.0.0.0'); $readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true; $readability->convertLinksToFootnotes = true;
$res = $readability->init(); $res = $readability->init();
@ -487,7 +469,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = (string) file_get_contents('tests/fixtures/keepFootnotes.html'); $html = (string) file_get_contents('tests/fixtures/keepFootnotes.html');
$readability = $this->getReadability($html, 'http://0.0.0.0'); $readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -501,7 +482,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = (string) file_get_contents('tests/fixtures/wipedBody.html'); $html = (string) file_get_contents('tests/fixtures/wipedBody.html');
$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
$this->assertTrue($res); $this->assertTrue($res);
@ -540,7 +520,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testVisibleNode(string $content, bool $shouldBeVisible): void public function testVisibleNode(string $content, bool $shouldBeVisible): void
{ {
$readability = $this->getReadability($content, 'http://0.0.0.0'); $readability = $this->getReadability($content, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init(); $res = $readability->init();
if ($shouldBeVisible) { if ($shouldBeVisible) {
@ -550,6 +529,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
} }
} }
/**
 * Data sets for testHtmlLang().
 *
 * Each set is a document, the `lang` value expected on the parsed root element,
 * and optionally whether tidy should be used (the test defaults it to true).
 *
 * Declared static because PHPUnit 10+ requires static data providers.
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public static function dataForHtmlLang(): array
{
    // Article markup shared by all the documents below.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    return [
        'meta' => [
            '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
            'fr',
        ],
        'head' => [
            '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
            'fr',
        ],
        'headless' => [
            '<html lang="fr"><body>' . $article . '</body></html>',
            'fr',
            // tidy would add <head> tag.
            false,
        ],
        'fragment' => [
            $article,
            '',
            // tidy would add <html>.
            false,
        ],
    ];
}
/**
 * Parses the given document with the libxml parser and checks that the root
 * element of the resulting DOM carries the expected `lang` attribute value.
 *
 * @dataProvider dataForHtmlLang
 */
public function testHtmlLang(string $html, string $lang, bool $useTidy = true): void
{
    $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);

    // Initialisation has to succeed before the DOM can be inspected.
    $this->assertTrue($readability->init());

    $dom = $readability->dom;
    $this->assertInstanceOf(\DOMDocument::class, $dom);

    $root = $dom->documentElement;
    $this->assertSame($lang, $root->getAttribute('lang'));
}
private function getReadability(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability private function getReadability(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability
{ {
$readability = new Readability($html, $url, $parser, $useTidy); $readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save