if ('html5lib' === $this->parser || 'html5' === $this->parser) {
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
$this->dom = (new HTML5())->loadHTML($this->html);
$this->dom = (new HTML5())->loadHTML($this->html);
@ -1510,4 +1510,45 @@ class Readability implements LoggerAwareInterface
)
)
&& !$node->hasAttribute('hidden');
&& !$node->hasAttribute('hidden');
}
}
/**
* Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
*
* `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
* This means that UTF-8-encoded HTML fragments, such as those coming from the JSON-LD `articleBody` field, would be parsed with an incorrect encoding.
* Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause the parser to auto-insert an `html` element, losing the attributes of the original `html` tag.
*
* @param string $html UTF-8 encoded document
*/
private static function ensureMetaCharset(string $html): string
{
$charsetTag = '<metacharset="utf-8">';
// Only look at first 1024 bytes since, according to HTML5 specification,
// that’s where <meta> elements declaring a character encoding must be located.
@ -115,7 +115,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testInitDiv(): void
public function testInitDiv(): void
{
{
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -129,7 +128,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithFootnotes(): void
public function testWithFootnotes(): void
{
{
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->convertLinksToFootnotes = true;
$readability->convertLinksToFootnotes = true;
$res = $readability->init();
$res = $readability->init();
@ -146,7 +144,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testStandardClean(): void
public function testStandardClean(): void
{
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<ahref="#nofollow"rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<ahref="#nofollow"rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false;
$readability->lightClean = false;
$res = $readability->init();
$res = $readability->init();
@ -163,7 +160,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithIframe(): void
public function testWithIframe(): void
{
{
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframesrc="http://youtube.com/test"href="#nofollow"rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframesrc="http://youtube.com/test"href="#nofollow"rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -178,7 +174,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithArticle(): void
public function testWithArticle(): void
{
{
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframesrc="http://youtube.com/test"href="#nofollow"rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframesrc="http://youtube.com/test"href="#nofollow"rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -193,7 +188,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithAside(): void
public function testWithAside(): void
{
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -208,7 +202,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClasses(): void
public function testWithClasses(): void
{
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<divstyle="display:none">' . str_repeat('<pclass="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<divstyle="display:none">' . str_repeat('<pclass="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -223,7 +216,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithClassesWithoutLightClean(): void
public function testWithClassesWithoutLightClean(): void
{
{
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<divstyle="display:none">' . str_repeat('<pclass="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <ahref="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<divstyle="display:none">' . str_repeat('<pclass="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$readability->lightClean = false;
$readability->lightClean = false;
$res = $readability->init();
$res = $readability->init();
@ -239,7 +231,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithTd(): void
public function testWithTd(): void
{
{
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -252,7 +243,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithSameClasses(): void
public function testWithSameClasses(): void
{
{
$readability = $this->getReadability('<articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<divclass="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<divclass="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -266,7 +256,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testWithScript(): void
public function testWithScript(): void
{
{
$readability = $this->getReadability('<articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>Thistextisalsoanawesometextandyoushouldknowthat!</script></p></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>Thistextisalsoanawesometextandyoushouldknowthat!</script></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -280,7 +269,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitle(): void
public function testTitle(): void
{
{
$readability = $this->getReadability('<title>this is my title</title><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<title>this is my title</title><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -294,7 +282,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDash(): void
public function testTitleWithDash(): void
{
{
$readability = $this->getReadability('<title> title2 - title3 </title><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<title> title2 - title3 </title><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -308,7 +295,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleWithDoubleDot(): void
public function testTitleWithDoubleDot(): void
{
{
$readability = $this->getReadability('<title> title2 : title3 </title><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<title> title2 : title3 </title><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -322,7 +308,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase
public function testTitleTooShortUseH1(): void
public function testTitleTooShortUseH1(): void
{
{
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><articleclass="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
$readability->debug = true;
$res = $readability->init();
$res = $readability->init();
$this->assertTrue($res);
$this->assertTrue($res);
@ -369,7 +354,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase