Merge pull request #103 from jtojnar/backports-local-no-domain

[1.x] Backport parser_url + html[lang] fixes
pull/107/head 1.2.12
Jérémy Benoist 1 year ago committed by GitHub
commit 109a22662d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 48
      src/Readability.php
  2. 72
      tests/ReadabilityTest.php

@ -1396,7 +1396,10 @@ class Readability implements LoggerAwareInterface
$this->logger->debug('Parsing URL: ' . $this->url);
if ($this->url) {
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
$host = parse_url($this->url, \PHP_URL_HOST);
if (null !== $host) {
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', $host), ['.' => '\.']) . '/';
}
}
mb_internal_encoding('UTF-8');
@ -1432,7 +1435,7 @@ class Readability implements LoggerAwareInterface
unset($tidy);
}
$this->html = '<meta charset="utf-8">' . (string) $this->html;
$this->html = self::ensureMetaCharset((string) $this->html);
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);
@ -1450,4 +1453,45 @@ class Readability implements LoggerAwareInterface
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
}
/**
 * Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
 *
 * Placement rules, in order of preference:
 *  1. a `meta[charset]` is already declared near the start: the document is returned untouched;
 *  2. a `<head>` opening tag is found: the tag is inserted right after it;
 *  3. an `<html>` opening tag is found: the tag is inserted right after it;
 *  4. otherwise the tag is prepended to the fragment.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document, guaranteed to contain a `meta[charset]` declaration
 */
private static function ensureMetaCharset($html)
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at first 1024 bytes since, according to HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // Candidate insertion points, most specific first. The lookahead requires the tag
    // name to be followed by whitespace, `/` or `>`, so that e.g. `<header>` is not
    // mistaken for `<head>`.
    $openingTags = [
        '/<head(?=[\s\/>])[^>]*>/i',
        '/<html(?=[\s\/>])[^>]*>/i',
    ];

    foreach ($openingTags as $pattern) {
        if (1 !== preg_match($pattern, $start)) {
            continue;
        }

        // Insert the <meta> right after the first matching opening tag.
        $patched = preg_replace($pattern, '$0' . $charsetTag, $html, 1);

        // preg_replace() yields null on a PCRE error; never hand that to the parser,
        // fall through to the next strategy instead.
        if (null !== $patched) {
            return $patched;
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
}

@ -19,7 +19,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability = $this->getReadability('');
$this->assertNull($readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
}
public function testConstructHtml5Parser()
@ -27,7 +27,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'html5lib');
$this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame('<html/>', $readability->original_html);
}
@ -39,7 +39,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability = $this->getReadability('<html/>', 'http://0.0.0.0');
$this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame('<html/>', $readability->original_html);
$this->assertTrue($readability->tidied);
}
@ -52,7 +52,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$this->assertSame('', $readability->original_html);
$this->assertFalse($readability->tidied);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
}
public function testConstructSimpleWithoutTidy()
@ -60,7 +60,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);
$this->assertSame('http://0.0.0.0', $readability->url);
$this->assertInstanceOf('DomDocument', $readability->dom);
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
$this->assertSame('<html/>', $readability->original_html);
$this->assertFalse($readability->tidied);
}
@ -106,7 +106,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testInitDiv()
{
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -120,7 +119,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithFootnotes()
{
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true;
$res = $readability->init();
@ -137,7 +135,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testStandardClean()
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false;
$res = $readability->init();
@ -154,7 +151,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithIframe()
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -169,7 +165,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithArticle()
{
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -184,7 +179,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithAside()
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -199,7 +193,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClasses()
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -214,7 +207,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClassesWithoutLightClean()
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false;
$res = $readability->init();
@ -230,7 +222,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithTd()
{
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -243,7 +234,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithSameClasses()
{
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -257,7 +247,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithScript()
{
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -271,7 +260,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitle()
{
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -285,7 +273,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDash()
{
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -299,7 +286,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDoubleDot()
{
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -313,7 +299,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleTooShortUseH1()
{
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -365,7 +350,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
</html>';
$readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030');
$readability->debug = true;
$res = $readability->init();
@ -433,7 +417,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
</html>';
$readability = $this->getReadability($data, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
@ -472,7 +455,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = file_get_contents('tests/fixtures/childNodeGoesNull.html');
$readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true;
$res = $readability->init();
@ -485,7 +467,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = file_get_contents('tests/fixtures/keepFootnotes.html');
$readability = $this->getReadability($html, 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
@ -499,13 +480,54 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
$html = file_get_contents('tests/fixtures/wipedBody.html');
$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
$readability->debug = true;
$res = $readability->init();
$this->assertTrue($res);
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
}
/**
 * Supplies the datasets consumed by testHtmlLang().
 *
 * Each dataset holds, in order: the HTML input, the value expected in the `lang`
 * attribute of the parsed document element, and optionally whether tidy is used
 * (when omitted, the test method's own default applies).
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public function dataForHtmlLang()
{
    // Article markup shared by every dataset.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    $datasets = [];

    // Document that already declares its charset.
    $datasets['meta'] = [
        '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
        'fr',
    ];

    // Document with a <head> but no charset declaration.
    $datasets['head'] = [
        '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
        'fr',
    ];

    // Document without <head>; tidy is disabled because it would add the <head> tag.
    $datasets['headless'] = [
        '<html lang="fr"><body>' . $article . '</body></html>',
        'fr',
        false,
    ];

    // Bare fragment; tidy is disabled because it would add <html>.
    $datasets['fragment'] = [
        $article,
        '',
        false,
    ];

    return $datasets;
}
/**
 * Checks that the `lang` attribute of the document element is what the dataset expects
 * after the document has been parsed with the libxml parser.
 *
 * @dataProvider dataForHtmlLang
 */
public function testHtmlLang($html, $lang, $useTidy = true)
{
    $parsed = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);

    // Extraction itself has to succeed before the DOM is inspected.
    $initialised = $parsed->init();
    $this->assertTrue($initialised);

    $document = $parsed->dom;
    $this->assertInstanceOf(\DOMDocument::class, $document);

    $actualLang = $document->documentElement->getAttribute('lang');
    $this->assertSame($lang, $actualLang);
}
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
{
$readability = new Readability($html, $url, $parser, $useTidy);

Loading…
Cancel
Save