From 235baf965c5c0675d44218cb86e4eb88de58ef9e Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Mon, 3 Mar 2025 23:26:30 +0100 Subject: [PATCH 1/4] Do not set domainRegExp for local files `parse_url($this->url, \PHP_URL_HOST)` will return `null` for local filesystem path. Casting it to `string` will produce an empty regular expression, which would match any link when computing link density. (cherry picked from commit c7208f6ad2febedce81a29dc276e15c42265fc1e) This also fixes a warning since 1.x passes the `null` directly to `preg_replace` instead of explicitly casting it to `string`. --- src/Readability.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Readability.php b/src/Readability.php index 1d2d2f5..207aeee 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1396,7 +1396,10 @@ class Readability implements LoggerAwareInterface $this->logger->debug('Parsing URL: ' . $this->url); if ($this->url) { - $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/'; + $host = parse_url($this->url, \PHP_URL_HOST); + if (null !== $host) { + $this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', $host), ['.' => '\.']) . '/'; + } } mb_internal_encoding('UTF-8'); From eb6ca1a99b9c68f155eeb3b19ca7fff09c4a1d60 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Mon, 3 Mar 2025 23:53:33 +0100 Subject: [PATCH 2/4] tests: Use ::class for DOMDocument class name Also capitalize it properly. 
(cherry picked from commit 90869d877e0510966f66c87f77c64f04e10ef9c9) --- tests/ReadabilityTest.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index b190a8e..42f952a 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -19,7 +19,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability = $this->getReadability(''); $this->assertNull($readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); } public function testConstructHtml5Parser() @@ -27,7 +27,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability = $this->getReadability('', 'http://0.0.0.0', 'html5lib'); $this->assertSame('http://0.0.0.0', $readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); $this->assertSame('', $readability->original_html); } @@ -39,7 +39,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability = $this->getReadability('', 'http://0.0.0.0'); $this->assertSame('http://0.0.0.0', $readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); $this->assertSame('', $readability->original_html); $this->assertTrue($readability->tidied); } @@ -52,7 +52,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $this->assertSame('', $readability->original_html); $this->assertFalse($readability->tidied); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); } public function testConstructSimpleWithoutTidy() @@ -60,7 +60,7 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $readability = $this->getReadability('', 'http://0.0.0.0', 'libxml', false); $this->assertSame('http://0.0.0.0', 
$readability->url); - $this->assertInstanceOf('DomDocument', $readability->dom); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); $this->assertSame('', $readability->original_html); $this->assertFalse($readability->tidied); } From 5afefcff3424e3e48e82ef128664584e6d616f4c Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Mon, 3 Mar 2025 23:55:32 +0100 Subject: [PATCH 3/4] tests: Remove pointless debug assignment It is unused since 8ab7d76cd5209a69f3459db5f7bd578e5cbcc5e8. (cherry picked from commit 541fab34a0070400a6874f8a7d0777da20d58cdc) --- tests/ReadabilityTest.php | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 42f952a..12d5529 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -106,7 +106,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testInitDiv() { $readability = $this->getReadability('
' . str_repeat('This is the awesome content :)', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -120,7 +119,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithFootnotes() { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $readability->convertLinksToFootnotes = true; $res = $readability->init(); @@ -137,7 +135,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testStandardClean() { $readability = $this->getReadability('

Title

' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . 'will NOT be removed
', 'http://0.0.0.0'); - $readability->debug = true; $readability->lightClean = false; $res = $readability->init(); @@ -154,7 +151,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithIframe() { $readability = $this->getReadability('

Title

' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '

This is an awesome text with some links, here there are

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -169,7 +165,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithArticle() { $readability = $this->getReadability('

' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '

This is an awesome text with some links, here there are

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -184,7 +179,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithAside() { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -199,7 +193,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithClasses() { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
' . str_repeat('

This text should be removed

', 10) . '
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -214,7 +207,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithClassesWithoutLightClean() { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are: the awesome

', 7) . '
' . str_repeat('

This text should be removed

', 10) . '
', 'http://0.0.0.0'); - $readability->debug = true; $readability->lightClean = false; $res = $readability->init(); @@ -230,7 +222,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithTd() { $readability = $this->getReadability('' . str_repeat('', 7) . '

This is an awesome text with some links, here there are the awesome

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -243,7 +234,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithSameClasses() { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '
This text is also an awesome text and you should know that !
', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -257,7 +247,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testWithScript() { $readability = $this->getReadability('
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -271,7 +260,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitle() { $readability = $this->getReadability('this is my title
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -285,7 +273,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitleWithDash() { $readability = $this->getReadability(' title2 - title3
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -299,7 +286,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitleWithDoubleDot() { $readability = $this->getReadability(' title2 : title3
' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -313,7 +299,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase public function testTitleTooShortUseH1() { $readability = $this->getReadability('too short

this is my h1 title !

' . str_repeat('

This is an awesome text with some links, here there are the awesome

', 7) . '

', 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -365,7 +350,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase '; $readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030'); - $readability->debug = true; $res = $readability->init(); @@ -433,7 +417,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase '; $readability = $this->getReadability($data, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); @@ -472,7 +455,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $html = file_get_contents('tests/fixtures/childNodeGoesNull.html'); $readability = $this->getReadability($html, 'http://0.0.0.0'); - $readability->debug = true; $readability->convertLinksToFootnotes = true; $res = $readability->init(); @@ -485,7 +467,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $html = file_get_contents('tests/fixtures/keepFootnotes.html'); $readability = $this->getReadability($html, 'http://0.0.0.0'); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); @@ -499,7 +480,6 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $html = file_get_contents('tests/fixtures/wipedBody.html'); $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false); - $readability->debug = true; $res = $readability->init(); $this->assertTrue($res); From f1c6297e3c52693f4a8f69f3cb9676979176f52e Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Tue, 4 Mar 2025 01:25:47 +0100 Subject: [PATCH 4/4] Fix discarding `html[lang]` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag. This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding. 
In f14428e4c0fa34b28f6b8a7696e62790bcde363e, we tried to resolve it by putting `meta[charset]` tag at the start of the HTML fragment. Unfortunately, it turns out that causes parser to auto-insert a `html` element, losing the attributes of the original `html` tag. Let’s try to insert the `meta[charset]` tag into the proper place in the HTML document. We do not need to use the same trick with `JSLikeHTMLElement::__set`. That expects smaller HTML fragments, not `html` documents, so creating `html` and `head` elements will not be a problem. (cherry picked from commit efbbc86df9716a3ab1ed8a351d9e8316f3a2aab0) Had to strip type hints since we still target PHP 5.6. --- src/Readability.php | 43 ++++++++++++++++++++++++++++++++++++++- tests/ReadabilityTest.php | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/Readability.php b/src/Readability.php index 207aeee..05a6c9f 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -1435,7 +1435,7 @@ class Readability implements LoggerAwareInterface unset($tidy); } - $this->html = '' . (string) $this->html; + $this->html = self::ensureMetaCharset((string) $this->html); if ('html5lib' === $this->parser || 'html5' === $this->parser) { $this->dom = (new HTML5())->loadHTML($this->html); @@ -1453,4 +1453,45 @@ class Readability implements LoggerAwareInterface $this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement'); } + + /** + * Tries to insert `meta[charset]` tag into the proper place in the passed HTML document. + * + * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag. + * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding. + * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag. 
+ * + * @param string $html UTF-8 encoded document + */ + private static function ensureMetaCharset($html) + { + $charsetTag = ''; + + // Only look at the first 1000 bytes; according to the HTML5 specification, + // <meta> elements declaring a character encoding must be located within the first 1024 bytes. + // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset + $start = substr($html, 0, 1000); + + if (1 === preg_match('/]+charset/i', $start)) { + // <meta> tag declaring a charset is already present, no need for modification. + return $html; + } + + if (1 === preg_match('/]*>/i', $start)) { + // <head> tag was located, <meta> tags go there. + $html = preg_replace('/]*>/i', '$0' . $charsetTag, $html, 1); + + return $html; + } + + if (1 === preg_match('/]*>/i', $start)) { + // <html> tag was located, let’s put it inside and have the parser create <head>. + $html = preg_replace('/]*>/i', '$0' . $charsetTag, $html, 1); + + return $html; + } + + // Fallback – just plop the <meta> at the start of the fragment. + return $charsetTag . $html; + } } diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index 12d5529..fc9f6d5 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -486,6 +486,48 @@ class ReadabilityTest extends \PHPUnit\Framework\TestCase $this->assertStringContainsString('Down the Rabbit-Hole', $readability->getContent()->getInnerHtml()); } + /** + * @return array + */ + public function dataForHtmlLang() + { + return [ + 'meta' => [ + '
' . str_repeat('

This is the awesome content :)

', 7) . '
', + 'fr', + ], + 'head' => [ + 'Foo
' . str_repeat('

This is the awesome content :)

', 7) . '
', + 'fr', + ], + 'headless' => [ + '
' . str_repeat('

This is the awesome content :)

', 7) . '
', + 'fr', + // tidy would add tag. + false, + ], + 'fragment' => [ + '
' . str_repeat('

This is the awesome content :)

', 7) . '
', + '', + // tidy would add . + false, + ], + ]; + } + + /** + * @dataProvider dataForHtmlLang + */ + public function testHtmlLang($html, $lang, $useTidy = true) + { + $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy); + $res = $readability->init(); + + $this->assertTrue($res); + $this->assertInstanceOf(\DOMDocument::class, $readability->dom); + $this->assertSame($lang, $readability->dom->documentElement->getAttribute('lang')); + } + private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true) { $readability = new Readability($html, $url, $parser, $useTidy);